diff --git "a/checkpoint-5364/trainer_state.json" "b/checkpoint-5364/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-5364/trainer_state.json" @@ -0,0 +1,37581 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 6.0, + "eval_steps": 500, + "global_step": 5364, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0011185682326621924, + "grad_norm": 2.231250286102295, + "learning_rate": 5.0000000000000004e-08, + "loss": 1.0507, + "step": 1 + }, + { + "epoch": 0.0022371364653243847, + "grad_norm": 2.1123249530792236, + "learning_rate": 1.0000000000000001e-07, + "loss": 1.04, + "step": 2 + }, + { + "epoch": 0.003355704697986577, + "grad_norm": 2.0946707725524902, + "learning_rate": 1.5000000000000002e-07, + "loss": 1.0307, + "step": 3 + }, + { + "epoch": 0.0044742729306487695, + "grad_norm": 2.0837416648864746, + "learning_rate": 2.0000000000000002e-07, + "loss": 1.0484, + "step": 4 + }, + { + "epoch": 0.005592841163310962, + "grad_norm": 1.9843275547027588, + "learning_rate": 2.5000000000000004e-07, + "loss": 1.0012, + "step": 5 + }, + { + "epoch": 0.006711409395973154, + "grad_norm": 2.121988296508789, + "learning_rate": 3.0000000000000004e-07, + "loss": 1.0603, + "step": 6 + }, + { + "epoch": 0.007829977628635347, + "grad_norm": 2.029029369354248, + "learning_rate": 3.5000000000000004e-07, + "loss": 1.0323, + "step": 7 + }, + { + "epoch": 0.008948545861297539, + "grad_norm": 1.9815905094146729, + "learning_rate": 4.0000000000000003e-07, + "loss": 1.0273, + "step": 8 + }, + { + "epoch": 0.010067114093959731, + "grad_norm": 2.3339314460754395, + "learning_rate": 4.5000000000000003e-07, + "loss": 1.0805, + "step": 9 + }, + { + "epoch": 0.011185682326621925, + "grad_norm": 2.1078243255615234, + "learning_rate": 5.000000000000001e-07, + "loss": 1.0382, + "step": 10 + }, + { + "epoch": 0.012304250559284116, + "grad_norm": 1.8874777555465698, + "learning_rate": 5.5e-07, + "loss": 1.008, + "step": 11 + }, + { + "epoch": 0.013422818791946308, + "grad_norm": 1.9720211029052734, + "learning_rate": 6.000000000000001e-07, + "loss": 1.0065, + "step": 12 + }, + { + "epoch": 0.0145413870246085, + "grad_norm": 2.0002245903015137, + "learning_rate": 6.5e-07, + "loss": 1.0379, + "step": 13 + }, + { + "epoch": 0.015659955257270694, + "grad_norm": 1.983207106590271, + "learning_rate": 7.000000000000001e-07, + "loss": 1.0271, + "step": 14 + }, + { + "epoch": 0.016778523489932886, + "grad_norm": 1.886121153831482, + "learning_rate": 7.5e-07, + "loss": 1.0019, + "step": 15 + }, + { + "epoch": 0.017897091722595078, + "grad_norm": 1.9403958320617676, + "learning_rate": 8.000000000000001e-07, + "loss": 0.9885, + "step": 16 + }, + { + "epoch": 0.01901565995525727, + "grad_norm": 1.9739996194839478, + "learning_rate": 8.500000000000001e-07, + "loss": 0.9904, + "step": 17 + }, + { + "epoch": 0.020134228187919462, + "grad_norm": 1.7419469356536865, + "learning_rate": 9.000000000000001e-07, + "loss": 0.9709, + "step": 18 + }, + { + "epoch": 0.021252796420581657, + "grad_norm": 1.7856152057647705, + "learning_rate": 9.500000000000001e-07, + "loss": 0.9859, + "step": 19 + }, + { + "epoch": 0.02237136465324385, + "grad_norm": 1.6159933805465698, + "learning_rate": 1.0000000000000002e-06, + "loss": 0.9895, + "step": 20 + }, + { + "epoch": 0.02348993288590604, + "grad_norm": 1.7010679244995117, + "learning_rate": 1.0500000000000001e-06, + "loss": 1.0115, + "step": 21 + }, + { + "epoch": 0.024608501118568233, + "grad_norm": 1.7860039472579956, + "learning_rate": 1.1e-06, + "loss": 0.9917, + "step": 22 + }, + { + "epoch": 0.025727069351230425, + "grad_norm": 1.3735058307647705, + "learning_rate": 1.1500000000000002e-06, + "loss": 0.9441, + "step": 23 + }, + { + "epoch": 0.026845637583892617, + "grad_norm": 1.439109206199646, + "learning_rate": 1.2000000000000002e-06, + "loss": 0.9304, + "step": 24 + }, + { + "epoch": 0.02796420581655481, + "grad_norm": 1.380369782447815, + "learning_rate": 1.25e-06, + "loss": 0.9469, + "step": 25 + }, + { + "epoch": 0.029082774049217, + "grad_norm": 1.2287472486495972, + "learning_rate": 1.3e-06, + "loss": 0.8808, + "step": 26 + }, + { + "epoch": 0.030201342281879196, + "grad_norm": 1.0899194478988647, + "learning_rate": 1.3500000000000002e-06, + "loss": 0.8912, + "step": 27 + }, + { + "epoch": 0.03131991051454139, + "grad_norm": 1.0445002317428589, + "learning_rate": 1.4000000000000001e-06, + "loss": 0.876, + "step": 28 + }, + { + "epoch": 0.03243847874720358, + "grad_norm": 1.0201383829116821, + "learning_rate": 1.45e-06, + "loss": 0.9003, + "step": 29 + }, + { + "epoch": 0.03355704697986577, + "grad_norm": 0.9528365731239319, + "learning_rate": 1.5e-06, + "loss": 0.8537, + "step": 30 + }, + { + "epoch": 0.03467561521252797, + "grad_norm": 0.9615768194198608, + "learning_rate": 1.5500000000000002e-06, + "loss": 0.8819, + "step": 31 + }, + { + "epoch": 0.035794183445190156, + "grad_norm": 0.9578896760940552, + "learning_rate": 1.6000000000000001e-06, + "loss": 0.8859, + "step": 32 + }, + { + "epoch": 0.03691275167785235, + "grad_norm": 0.977853536605835, + "learning_rate": 1.6500000000000003e-06, + "loss": 0.8835, + "step": 33 + }, + { + "epoch": 0.03803131991051454, + "grad_norm": 0.8976068496704102, + "learning_rate": 1.7000000000000002e-06, + "loss": 0.8599, + "step": 34 + }, + { + "epoch": 0.039149888143176735, + "grad_norm": 0.8779590725898743, + "learning_rate": 1.75e-06, + "loss": 0.8708, + "step": 35 + }, + { + "epoch": 0.040268456375838924, + "grad_norm": 0.853705644607544, + "learning_rate": 1.8000000000000001e-06, + "loss": 0.8465, + "step": 36 + }, + { + "epoch": 0.04138702460850112, + "grad_norm": 0.8480839729309082, + "learning_rate": 1.85e-06, + "loss": 0.8292, + "step": 37 + }, + { + "epoch": 0.042505592841163314, + "grad_norm": 0.8372538089752197, + "learning_rate": 1.9000000000000002e-06, + "loss": 0.8026, + "step": 38 + }, + { + "epoch": 0.0436241610738255, + "grad_norm": 0.8592961430549622, + "learning_rate": 1.9500000000000004e-06, + "loss": 0.8153, + "step": 39 + }, + { + "epoch": 0.0447427293064877, + "grad_norm": 0.8222276568412781, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.814, + "step": 40 + }, + { + "epoch": 0.04586129753914989, + "grad_norm": 0.825672447681427, + "learning_rate": 2.05e-06, + "loss": 0.7793, + "step": 41 + }, + { + "epoch": 0.04697986577181208, + "grad_norm": 0.8016732335090637, + "learning_rate": 2.1000000000000002e-06, + "loss": 0.771, + "step": 42 + }, + { + "epoch": 0.04809843400447427, + "grad_norm": 0.7026550769805908, + "learning_rate": 2.15e-06, + "loss": 0.7664, + "step": 43 + }, + { + "epoch": 0.049217002237136466, + "grad_norm": 0.6678670644760132, + "learning_rate": 2.2e-06, + "loss": 0.7774, + "step": 44 + }, + { + "epoch": 0.050335570469798654, + "grad_norm": 0.6766750812530518, + "learning_rate": 2.25e-06, + "loss": 0.7832, + "step": 45 + }, + { + "epoch": 0.05145413870246085, + "grad_norm": 0.7094117999076843, + "learning_rate": 2.3000000000000004e-06, + "loss": 0.7861, + "step": 46 + }, + { + "epoch": 0.052572706935123045, + "grad_norm": 0.6871191263198853, + "learning_rate": 2.35e-06, + "loss": 0.7848, + "step": 47 + }, + { + "epoch": 0.053691275167785234, + "grad_norm": 0.6089867353439331, + "learning_rate": 2.4000000000000003e-06, + "loss": 0.7658, + "step": 48 + }, + { + "epoch": 0.05480984340044743, + "grad_norm": 0.5112010836601257, + "learning_rate": 2.4500000000000003e-06, + "loss": 0.7921, + "step": 49 + }, + { + "epoch": 0.05592841163310962, + "grad_norm": 0.5008496046066284, + "learning_rate": 2.5e-06, + "loss": 0.7105, + "step": 50 + }, + { + "epoch": 0.05704697986577181, + "grad_norm": 0.5599631071090698, + "learning_rate": 2.55e-06, + "loss": 0.7526, + "step": 51 + }, + { + "epoch": 0.058165548098434, + "grad_norm": 0.6905913352966309, + "learning_rate": 2.6e-06, + "loss": 0.7496, + "step": 52 + }, + { + "epoch": 0.0592841163310962, + "grad_norm": 0.6198621392250061, + "learning_rate": 2.6500000000000005e-06, + "loss": 0.7297, + "step": 53 + }, + { + "epoch": 0.06040268456375839, + "grad_norm": 0.6158658862113953, + "learning_rate": 2.7000000000000004e-06, + "loss": 0.7309, + "step": 54 + }, + { + "epoch": 0.06152125279642058, + "grad_norm": 0.5798735618591309, + "learning_rate": 2.7500000000000004e-06, + "loss": 0.7102, + "step": 55 + }, + { + "epoch": 0.06263982102908278, + "grad_norm": 0.5550254583358765, + "learning_rate": 2.8000000000000003e-06, + "loss": 0.7488, + "step": 56 + }, + { + "epoch": 0.06375838926174497, + "grad_norm": 0.4888734221458435, + "learning_rate": 2.85e-06, + "loss": 0.75, + "step": 57 + }, + { + "epoch": 0.06487695749440715, + "grad_norm": 0.4579496383666992, + "learning_rate": 2.9e-06, + "loss": 0.7108, + "step": 58 + }, + { + "epoch": 0.06599552572706935, + "grad_norm": 0.5775673389434814, + "learning_rate": 2.95e-06, + "loss": 0.7337, + "step": 59 + }, + { + "epoch": 0.06711409395973154, + "grad_norm": 0.5035051703453064, + "learning_rate": 3e-06, + "loss": 0.7677, + "step": 60 + }, + { + "epoch": 0.06823266219239374, + "grad_norm": 0.4771614074707031, + "learning_rate": 3.05e-06, + "loss": 0.724, + "step": 61 + }, + { + "epoch": 0.06935123042505593, + "grad_norm": 0.45495525002479553, + "learning_rate": 3.1000000000000004e-06, + "loss": 0.7393, + "step": 62 + }, + { + "epoch": 0.07046979865771812, + "grad_norm": 0.36385607719421387, + "learning_rate": 3.1500000000000003e-06, + "loss": 0.7029, + "step": 63 + }, + { + "epoch": 0.07158836689038031, + "grad_norm": 0.3554967939853668, + "learning_rate": 3.2000000000000003e-06, + "loss": 0.6991, + "step": 64 + }, + { + "epoch": 0.07270693512304251, + "grad_norm": 0.36548176407814026, + "learning_rate": 3.2500000000000002e-06, + "loss": 0.7292, + "step": 65 + }, + { + "epoch": 0.0738255033557047, + "grad_norm": 0.35280168056488037, + "learning_rate": 3.3000000000000006e-06, + "loss": 0.7295, + "step": 66 + }, + { + "epoch": 0.07494407158836688, + "grad_norm": 0.3599022924900055, + "learning_rate": 3.3500000000000005e-06, + "loss": 0.6956, + "step": 67 + }, + { + "epoch": 0.07606263982102908, + "grad_norm": 0.3802206516265869, + "learning_rate": 3.4000000000000005e-06, + "loss": 0.6796, + "step": 68 + }, + { + "epoch": 0.07718120805369127, + "grad_norm": 0.3787902891635895, + "learning_rate": 3.45e-06, + "loss": 0.7141, + "step": 69 + }, + { + "epoch": 0.07829977628635347, + "grad_norm": 0.374461829662323, + "learning_rate": 3.5e-06, + "loss": 0.7043, + "step": 70 + }, + { + "epoch": 0.07941834451901567, + "grad_norm": 0.34469330310821533, + "learning_rate": 3.5500000000000003e-06, + "loss": 0.7037, + "step": 71 + }, + { + "epoch": 0.08053691275167785, + "grad_norm": 0.346836119890213, + "learning_rate": 3.6000000000000003e-06, + "loss": 0.7246, + "step": 72 + }, + { + "epoch": 0.08165548098434004, + "grad_norm": 0.34163376688957214, + "learning_rate": 3.65e-06, + "loss": 0.6977, + "step": 73 + }, + { + "epoch": 0.08277404921700224, + "grad_norm": 0.3481418788433075, + "learning_rate": 3.7e-06, + "loss": 0.7356, + "step": 74 + }, + { + "epoch": 0.08389261744966443, + "grad_norm": 0.3230934739112854, + "learning_rate": 3.7500000000000005e-06, + "loss": 0.6869, + "step": 75 + }, + { + "epoch": 0.08501118568232663, + "grad_norm": 0.319917231798172, + "learning_rate": 3.8000000000000005e-06, + "loss": 0.6722, + "step": 76 + }, + { + "epoch": 0.08612975391498881, + "grad_norm": 0.3535120487213135, + "learning_rate": 3.85e-06, + "loss": 0.6951, + "step": 77 + }, + { + "epoch": 0.087248322147651, + "grad_norm": 0.3229662775993347, + "learning_rate": 3.900000000000001e-06, + "loss": 0.69, + "step": 78 + }, + { + "epoch": 0.0883668903803132, + "grad_norm": 0.33365264534950256, + "learning_rate": 3.95e-06, + "loss": 0.701, + "step": 79 + }, + { + "epoch": 0.0894854586129754, + "grad_norm": 0.3302946984767914, + "learning_rate": 4.000000000000001e-06, + "loss": 0.6733, + "step": 80 + }, + { + "epoch": 0.09060402684563758, + "grad_norm": 0.3478582799434662, + "learning_rate": 4.05e-06, + "loss": 0.7022, + "step": 81 + }, + { + "epoch": 0.09172259507829977, + "grad_norm": 0.33355170488357544, + "learning_rate": 4.1e-06, + "loss": 0.7141, + "step": 82 + }, + { + "epoch": 0.09284116331096197, + "grad_norm": 0.3217330574989319, + "learning_rate": 4.15e-06, + "loss": 0.6799, + "step": 83 + }, + { + "epoch": 0.09395973154362416, + "grad_norm": 0.328838050365448, + "learning_rate": 4.2000000000000004e-06, + "loss": 0.6943, + "step": 84 + }, + { + "epoch": 0.09507829977628636, + "grad_norm": 0.3279136121273041, + "learning_rate": 4.25e-06, + "loss": 0.6699, + "step": 85 + }, + { + "epoch": 0.09619686800894854, + "grad_norm": 0.333351194858551, + "learning_rate": 4.3e-06, + "loss": 0.712, + "step": 86 + }, + { + "epoch": 0.09731543624161074, + "grad_norm": 0.33052128553390503, + "learning_rate": 4.350000000000001e-06, + "loss": 0.7169, + "step": 87 + }, + { + "epoch": 0.09843400447427293, + "grad_norm": 0.31631597876548767, + "learning_rate": 4.4e-06, + "loss": 0.6772, + "step": 88 + }, + { + "epoch": 0.09955257270693513, + "grad_norm": 0.327311635017395, + "learning_rate": 4.450000000000001e-06, + "loss": 0.6873, + "step": 89 + }, + { + "epoch": 0.10067114093959731, + "grad_norm": 0.32048892974853516, + "learning_rate": 4.5e-06, + "loss": 0.6614, + "step": 90 + }, + { + "epoch": 0.1017897091722595, + "grad_norm": 0.32614201307296753, + "learning_rate": 4.5500000000000005e-06, + "loss": 0.7197, + "step": 91 + }, + { + "epoch": 0.1029082774049217, + "grad_norm": 0.31145355105400085, + "learning_rate": 4.600000000000001e-06, + "loss": 0.6567, + "step": 92 + }, + { + "epoch": 0.1040268456375839, + "grad_norm": 0.31379351019859314, + "learning_rate": 4.65e-06, + "loss": 0.7013, + "step": 93 + }, + { + "epoch": 0.10514541387024609, + "grad_norm": 0.32741424441337585, + "learning_rate": 4.7e-06, + "loss": 0.6737, + "step": 94 + }, + { + "epoch": 0.10626398210290827, + "grad_norm": 0.325630247592926, + "learning_rate": 4.75e-06, + "loss": 0.6673, + "step": 95 + }, + { + "epoch": 0.10738255033557047, + "grad_norm": 0.3153480291366577, + "learning_rate": 4.800000000000001e-06, + "loss": 0.6943, + "step": 96 + }, + { + "epoch": 0.10850111856823266, + "grad_norm": 0.3244793117046356, + "learning_rate": 4.85e-06, + "loss": 0.6896, + "step": 97 + }, + { + "epoch": 0.10961968680089486, + "grad_norm": 0.3078743517398834, + "learning_rate": 4.9000000000000005e-06, + "loss": 0.6837, + "step": 98 + }, + { + "epoch": 0.11073825503355705, + "grad_norm": 0.3314874470233917, + "learning_rate": 4.95e-06, + "loss": 0.7416, + "step": 99 + }, + { + "epoch": 0.11185682326621924, + "grad_norm": 0.31931284070014954, + "learning_rate": 5e-06, + "loss": 0.6903, + "step": 100 + }, + { + "epoch": 0.11297539149888143, + "grad_norm": 0.3176276981830597, + "learning_rate": 4.999999554776598e-06, + "loss": 0.6761, + "step": 101 + }, + { + "epoch": 0.11409395973154363, + "grad_norm": 0.3285365700721741, + "learning_rate": 4.999998219106549e-06, + "loss": 0.6892, + "step": 102 + }, + { + "epoch": 0.11521252796420582, + "grad_norm": 0.31144094467163086, + "learning_rate": 4.99999599299033e-06, + "loss": 0.6586, + "step": 103 + }, + { + "epoch": 0.116331096196868, + "grad_norm": 0.313289076089859, + "learning_rate": 4.999992876428732e-06, + "loss": 0.708, + "step": 104 + }, + { + "epoch": 0.1174496644295302, + "grad_norm": 0.3252837061882019, + "learning_rate": 4.999988869422867e-06, + "loss": 0.7083, + "step": 105 + }, + { + "epoch": 0.1185682326621924, + "grad_norm": 0.3168275058269501, + "learning_rate": 4.9999839719741615e-06, + "loss": 0.6806, + "step": 106 + }, + { + "epoch": 0.11968680089485459, + "grad_norm": 0.31589415669441223, + "learning_rate": 4.9999781840843594e-06, + "loss": 0.6702, + "step": 107 + }, + { + "epoch": 0.12080536912751678, + "grad_norm": 0.318037748336792, + "learning_rate": 4.999971505755523e-06, + "loss": 0.6601, + "step": 108 + }, + { + "epoch": 0.12192393736017897, + "grad_norm": 0.33259475231170654, + "learning_rate": 4.999963936990031e-06, + "loss": 0.7001, + "step": 109 + }, + { + "epoch": 0.12304250559284116, + "grad_norm": 0.33322346210479736, + "learning_rate": 4.999955477790579e-06, + "loss": 0.6731, + "step": 110 + }, + { + "epoch": 0.12416107382550336, + "grad_norm": 0.31344881653785706, + "learning_rate": 4.999946128160179e-06, + "loss": 0.6667, + "step": 111 + }, + { + "epoch": 0.12527964205816555, + "grad_norm": 0.32769575715065, + "learning_rate": 4.999935888102162e-06, + "loss": 0.7123, + "step": 112 + }, + { + "epoch": 0.12639821029082773, + "grad_norm": 0.314619243144989, + "learning_rate": 4.9999247576201765e-06, + "loss": 0.683, + "step": 113 + }, + { + "epoch": 0.12751677852348994, + "grad_norm": 0.3301268219947815, + "learning_rate": 4.999912736718185e-06, + "loss": 0.6761, + "step": 114 + }, + { + "epoch": 0.12863534675615212, + "grad_norm": 0.31477460265159607, + "learning_rate": 4.99989982540047e-06, + "loss": 0.6722, + "step": 115 + }, + { + "epoch": 0.1297539149888143, + "grad_norm": 0.31430870294570923, + "learning_rate": 4.999886023671629e-06, + "loss": 0.6693, + "step": 116 + }, + { + "epoch": 0.13087248322147652, + "grad_norm": 0.31705909967422485, + "learning_rate": 4.999871331536581e-06, + "loss": 0.6567, + "step": 117 + }, + { + "epoch": 0.1319910514541387, + "grad_norm": 0.3331652879714966, + "learning_rate": 4.999855749000555e-06, + "loss": 0.6895, + "step": 118 + }, + { + "epoch": 0.1331096196868009, + "grad_norm": 0.32147714495658875, + "learning_rate": 4.999839276069105e-06, + "loss": 0.6693, + "step": 119 + }, + { + "epoch": 0.1342281879194631, + "grad_norm": 0.3312559127807617, + "learning_rate": 4.999821912748095e-06, + "loss": 0.6843, + "step": 120 + }, + { + "epoch": 0.13534675615212527, + "grad_norm": 0.34178397059440613, + "learning_rate": 4.999803659043712e-06, + "loss": 0.6774, + "step": 121 + }, + { + "epoch": 0.13646532438478748, + "grad_norm": 0.3154846727848053, + "learning_rate": 4.999784514962456e-06, + "loss": 0.6638, + "step": 122 + }, + { + "epoch": 0.13758389261744966, + "grad_norm": 0.31137940287590027, + "learning_rate": 4.999764480511145e-06, + "loss": 0.6467, + "step": 123 + }, + { + "epoch": 0.13870246085011187, + "grad_norm": 0.3188192546367645, + "learning_rate": 4.999743555696918e-06, + "loss": 0.6511, + "step": 124 + }, + { + "epoch": 0.13982102908277405, + "grad_norm": 0.30495911836624146, + "learning_rate": 4.999721740527225e-06, + "loss": 0.6637, + "step": 125 + }, + { + "epoch": 0.14093959731543623, + "grad_norm": 0.3152139186859131, + "learning_rate": 4.999699035009837e-06, + "loss": 0.6631, + "step": 126 + }, + { + "epoch": 0.14205816554809844, + "grad_norm": 0.32285481691360474, + "learning_rate": 4.999675439152842e-06, + "loss": 0.6621, + "step": 127 + }, + { + "epoch": 0.14317673378076062, + "grad_norm": 0.3176666796207428, + "learning_rate": 4.999650952964643e-06, + "loss": 0.6654, + "step": 128 + }, + { + "epoch": 0.14429530201342283, + "grad_norm": 0.314035028219223, + "learning_rate": 4.999625576453962e-06, + "loss": 0.6927, + "step": 129 + }, + { + "epoch": 0.14541387024608501, + "grad_norm": 0.3227815628051758, + "learning_rate": 4.999599309629839e-06, + "loss": 0.6865, + "step": 130 + }, + { + "epoch": 0.1465324384787472, + "grad_norm": 0.3137218952178955, + "learning_rate": 4.9995721525016275e-06, + "loss": 0.6499, + "step": 131 + }, + { + "epoch": 0.1476510067114094, + "grad_norm": 0.32401353120803833, + "learning_rate": 4.999544105079001e-06, + "loss": 0.64, + "step": 132 + }, + { + "epoch": 0.1487695749440716, + "grad_norm": 0.3110584020614624, + "learning_rate": 4.99951516737195e-06, + "loss": 0.6747, + "step": 133 + }, + { + "epoch": 0.14988814317673377, + "grad_norm": 0.3246876895427704, + "learning_rate": 4.999485339390781e-06, + "loss": 0.6943, + "step": 134 + }, + { + "epoch": 0.15100671140939598, + "grad_norm": 0.3346574008464813, + "learning_rate": 4.999454621146117e-06, + "loss": 0.6675, + "step": 135 + }, + { + "epoch": 0.15212527964205816, + "grad_norm": 0.3305971920490265, + "learning_rate": 4.999423012648902e-06, + "loss": 0.7065, + "step": 136 + }, + { + "epoch": 0.15324384787472037, + "grad_norm": 0.31732234358787537, + "learning_rate": 4.9993905139103924e-06, + "loss": 0.7038, + "step": 137 + }, + { + "epoch": 0.15436241610738255, + "grad_norm": 0.3233291208744049, + "learning_rate": 4.999357124942163e-06, + "loss": 0.6856, + "step": 138 + }, + { + "epoch": 0.15548098434004473, + "grad_norm": 0.31733304262161255, + "learning_rate": 4.999322845756107e-06, + "loss": 0.702, + "step": 139 + }, + { + "epoch": 0.15659955257270694, + "grad_norm": 0.33124351501464844, + "learning_rate": 4.9992876763644346e-06, + "loss": 0.6616, + "step": 140 + }, + { + "epoch": 0.15771812080536912, + "grad_norm": 0.3264501094818115, + "learning_rate": 4.999251616779671e-06, + "loss": 0.6773, + "step": 141 + }, + { + "epoch": 0.15883668903803133, + "grad_norm": 0.34606418013572693, + "learning_rate": 4.999214667014662e-06, + "loss": 0.6765, + "step": 142 + }, + { + "epoch": 0.1599552572706935, + "grad_norm": 0.3292436897754669, + "learning_rate": 4.999176827082566e-06, + "loss": 0.6692, + "step": 143 + }, + { + "epoch": 0.1610738255033557, + "grad_norm": 0.31322377920150757, + "learning_rate": 4.9991380969968615e-06, + "loss": 0.6811, + "step": 144 + }, + { + "epoch": 0.1621923937360179, + "grad_norm": 0.32053160667419434, + "learning_rate": 4.999098476771344e-06, + "loss": 0.6544, + "step": 145 + }, + { + "epoch": 0.16331096196868009, + "grad_norm": 0.34363314509391785, + "learning_rate": 4.9990579664201244e-06, + "loss": 0.6839, + "step": 146 + }, + { + "epoch": 0.1644295302013423, + "grad_norm": 0.3260481357574463, + "learning_rate": 4.999016565957633e-06, + "loss": 0.7048, + "step": 147 + }, + { + "epoch": 0.16554809843400448, + "grad_norm": 0.3410928547382355, + "learning_rate": 4.998974275398614e-06, + "loss": 0.6846, + "step": 148 + }, + { + "epoch": 0.16666666666666666, + "grad_norm": 0.33661726117134094, + "learning_rate": 4.998931094758132e-06, + "loss": 0.6468, + "step": 149 + }, + { + "epoch": 0.16778523489932887, + "grad_norm": 0.32493966817855835, + "learning_rate": 4.998887024051565e-06, + "loss": 0.6741, + "step": 150 + }, + { + "epoch": 0.16890380313199105, + "grad_norm": 0.3396671414375305, + "learning_rate": 4.998842063294613e-06, + "loss": 0.6703, + "step": 151 + }, + { + "epoch": 0.17002237136465326, + "grad_norm": 0.32851850986480713, + "learning_rate": 4.998796212503287e-06, + "loss": 0.6589, + "step": 152 + }, + { + "epoch": 0.17114093959731544, + "grad_norm": 0.33433711528778076, + "learning_rate": 4.99874947169392e-06, + "loss": 0.6594, + "step": 153 + }, + { + "epoch": 0.17225950782997762, + "grad_norm": 0.3388006091117859, + "learning_rate": 4.99870184088316e-06, + "loss": 0.6604, + "step": 154 + }, + { + "epoch": 0.17337807606263983, + "grad_norm": 0.31802693009376526, + "learning_rate": 4.998653320087971e-06, + "loss": 0.6467, + "step": 155 + }, + { + "epoch": 0.174496644295302, + "grad_norm": 0.33016613125801086, + "learning_rate": 4.998603909325636e-06, + "loss": 0.6599, + "step": 156 + }, + { + "epoch": 0.1756152125279642, + "grad_norm": 0.32546237111091614, + "learning_rate": 4.998553608613755e-06, + "loss": 0.6519, + "step": 157 + }, + { + "epoch": 0.1767337807606264, + "grad_norm": 0.3362942337989807, + "learning_rate": 4.998502417970242e-06, + "loss": 0.671, + "step": 158 + }, + { + "epoch": 0.17785234899328858, + "grad_norm": 0.33070167899131775, + "learning_rate": 4.998450337413331e-06, + "loss": 0.6624, + "step": 159 + }, + { + "epoch": 0.1789709172259508, + "grad_norm": 0.32430973649024963, + "learning_rate": 4.998397366961571e-06, + "loss": 0.6263, + "step": 160 + }, + { + "epoch": 0.18008948545861297, + "grad_norm": 0.32481464743614197, + "learning_rate": 4.998343506633831e-06, + "loss": 0.6683, + "step": 161 + }, + { + "epoch": 0.18120805369127516, + "grad_norm": 0.33035483956336975, + "learning_rate": 4.998288756449292e-06, + "loss": 0.6816, + "step": 162 + }, + { + "epoch": 0.18232662192393737, + "grad_norm": 0.33188679814338684, + "learning_rate": 4.998233116427458e-06, + "loss": 0.6693, + "step": 163 + }, + { + "epoch": 0.18344519015659955, + "grad_norm": 0.33667680621147156, + "learning_rate": 4.998176586588145e-06, + "loss": 0.6619, + "step": 164 + }, + { + "epoch": 0.18456375838926176, + "grad_norm": 0.33836281299591064, + "learning_rate": 4.998119166951488e-06, + "loss": 0.6697, + "step": 165 + }, + { + "epoch": 0.18568232662192394, + "grad_norm": 0.31710004806518555, + "learning_rate": 4.998060857537938e-06, + "loss": 0.6386, + "step": 166 + }, + { + "epoch": 0.18680089485458612, + "grad_norm": 0.3220674395561218, + "learning_rate": 4.9980016583682655e-06, + "loss": 0.6477, + "step": 167 + }, + { + "epoch": 0.18791946308724833, + "grad_norm": 0.31624945998191833, + "learning_rate": 4.997941569463554e-06, + "loss": 0.6771, + "step": 168 + }, + { + "epoch": 0.1890380313199105, + "grad_norm": 0.33520039916038513, + "learning_rate": 4.997880590845208e-06, + "loss": 0.6777, + "step": 169 + }, + { + "epoch": 0.19015659955257272, + "grad_norm": 0.33738774061203003, + "learning_rate": 4.997818722534944e-06, + "loss": 0.6603, + "step": 170 + }, + { + "epoch": 0.1912751677852349, + "grad_norm": 0.33408045768737793, + "learning_rate": 4.9977559645548e-06, + "loss": 0.6581, + "step": 171 + }, + { + "epoch": 0.19239373601789708, + "grad_norm": 0.3269501328468323, + "learning_rate": 4.997692316927129e-06, + "loss": 0.6623, + "step": 172 + }, + { + "epoch": 0.1935123042505593, + "grad_norm": 0.33073386549949646, + "learning_rate": 4.997627779674601e-06, + "loss": 0.6465, + "step": 173 + }, + { + "epoch": 0.19463087248322147, + "grad_norm": 0.33027294278144836, + "learning_rate": 4.997562352820201e-06, + "loss": 0.6795, + "step": 174 + }, + { + "epoch": 0.19574944071588368, + "grad_norm": 0.3329165577888489, + "learning_rate": 4.997496036387235e-06, + "loss": 0.6717, + "step": 175 + }, + { + "epoch": 0.19686800894854586, + "grad_norm": 0.3321872353553772, + "learning_rate": 4.997428830399322e-06, + "loss": 0.6415, + "step": 176 + }, + { + "epoch": 0.19798657718120805, + "grad_norm": 0.3222750723361969, + "learning_rate": 4.997360734880401e-06, + "loss": 0.657, + "step": 177 + }, + { + "epoch": 0.19910514541387025, + "grad_norm": 0.33294835686683655, + "learning_rate": 4.997291749854725e-06, + "loss": 0.6931, + "step": 178 + }, + { + "epoch": 0.20022371364653244, + "grad_norm": 0.3413322865962982, + "learning_rate": 4.997221875346863e-06, + "loss": 0.6761, + "step": 179 + }, + { + "epoch": 0.20134228187919462, + "grad_norm": 0.3300095796585083, + "learning_rate": 4.997151111381707e-06, + "loss": 0.6626, + "step": 180 + }, + { + "epoch": 0.20246085011185683, + "grad_norm": 0.337289035320282, + "learning_rate": 4.997079457984459e-06, + "loss": 0.6861, + "step": 181 + }, + { + "epoch": 0.203579418344519, + "grad_norm": 0.3266119658946991, + "learning_rate": 4.997006915180642e-06, + "loss": 0.6687, + "step": 182 + }, + { + "epoch": 0.20469798657718122, + "grad_norm": 0.33044853806495667, + "learning_rate": 4.996933482996092e-06, + "loss": 0.6637, + "step": 183 + }, + { + "epoch": 0.2058165548098434, + "grad_norm": 0.33716171979904175, + "learning_rate": 4.996859161456965e-06, + "loss": 0.6644, + "step": 184 + }, + { + "epoch": 0.20693512304250558, + "grad_norm": 0.32554203271865845, + "learning_rate": 4.996783950589733e-06, + "loss": 0.6524, + "step": 185 + }, + { + "epoch": 0.2080536912751678, + "grad_norm": 0.3271404504776001, + "learning_rate": 4.996707850421184e-06, + "loss": 0.6581, + "step": 186 + }, + { + "epoch": 0.20917225950782997, + "grad_norm": 0.34464138746261597, + "learning_rate": 4.996630860978424e-06, + "loss": 0.6768, + "step": 187 + }, + { + "epoch": 0.21029082774049218, + "grad_norm": 0.3408767282962799, + "learning_rate": 4.996552982288875e-06, + "loss": 0.6556, + "step": 188 + }, + { + "epoch": 0.21140939597315436, + "grad_norm": 0.3375307023525238, + "learning_rate": 4.996474214380276e-06, + "loss": 0.6819, + "step": 189 + }, + { + "epoch": 0.21252796420581654, + "grad_norm": 0.3313542902469635, + "learning_rate": 4.99639455728068e-06, + "loss": 0.6483, + "step": 190 + }, + { + "epoch": 0.21364653243847875, + "grad_norm": 0.3327822685241699, + "learning_rate": 4.996314011018462e-06, + "loss": 0.6669, + "step": 191 + }, + { + "epoch": 0.21476510067114093, + "grad_norm": 0.33021339774131775, + "learning_rate": 4.99623257562231e-06, + "loss": 0.6734, + "step": 192 + }, + { + "epoch": 0.21588366890380314, + "grad_norm": 0.32687169313430786, + "learning_rate": 4.996150251121229e-06, + "loss": 0.6387, + "step": 193 + }, + { + "epoch": 0.21700223713646533, + "grad_norm": 0.3394392728805542, + "learning_rate": 4.996067037544542e-06, + "loss": 0.6623, + "step": 194 + }, + { + "epoch": 0.2181208053691275, + "grad_norm": 0.33284223079681396, + "learning_rate": 4.995982934921887e-06, + "loss": 0.6405, + "step": 195 + }, + { + "epoch": 0.21923937360178972, + "grad_norm": 0.344235360622406, + "learning_rate": 4.995897943283221e-06, + "loss": 0.6741, + "step": 196 + }, + { + "epoch": 0.2203579418344519, + "grad_norm": 0.33437255024909973, + "learning_rate": 4.995812062658815e-06, + "loss": 0.6718, + "step": 197 + }, + { + "epoch": 0.2214765100671141, + "grad_norm": 0.3216111361980438, + "learning_rate": 4.995725293079257e-06, + "loss": 0.6709, + "step": 198 + }, + { + "epoch": 0.2225950782997763, + "grad_norm": 0.3448997139930725, + "learning_rate": 4.9956376345754556e-06, + "loss": 0.6458, + "step": 199 + }, + { + "epoch": 0.22371364653243847, + "grad_norm": 0.34354478120803833, + "learning_rate": 4.99554908717863e-06, + "loss": 0.6615, + "step": 200 + }, + { + "epoch": 0.22483221476510068, + "grad_norm": 0.3417740762233734, + "learning_rate": 4.99545965092032e-06, + "loss": 0.6847, + "step": 201 + }, + { + "epoch": 0.22595078299776286, + "grad_norm": 0.3366676867008209, + "learning_rate": 4.99536932583238e-06, + "loss": 0.649, + "step": 202 + }, + { + "epoch": 0.22706935123042504, + "grad_norm": 0.3610089421272278, + "learning_rate": 4.995278111946983e-06, + "loss": 0.6616, + "step": 203 + }, + { + "epoch": 0.22818791946308725, + "grad_norm": 0.3359774053096771, + "learning_rate": 4.995186009296618e-06, + "loss": 0.6519, + "step": 204 + }, + { + "epoch": 0.22930648769574943, + "grad_norm": 0.34075963497161865, + "learning_rate": 4.9950930179140885e-06, + "loss": 0.6762, + "step": 205 + }, + { + "epoch": 0.23042505592841164, + "grad_norm": 0.32416507601737976, + "learning_rate": 4.994999137832517e-06, + "loss": 0.6499, + "step": 206 + }, + { + "epoch": 0.23154362416107382, + "grad_norm": 0.32749176025390625, + "learning_rate": 4.99490436908534e-06, + "loss": 0.645, + "step": 207 + }, + { + "epoch": 0.232662192393736, + "grad_norm": 0.3349708318710327, + "learning_rate": 4.994808711706314e-06, + "loss": 0.6676, + "step": 208 + }, + { + "epoch": 0.23378076062639822, + "grad_norm": 0.3491227328777313, + "learning_rate": 4.9947121657295094e-06, + "loss": 0.6287, + "step": 209 + }, + { + "epoch": 0.2348993288590604, + "grad_norm": 0.3400874733924866, + "learning_rate": 4.994614731189314e-06, + "loss": 0.6473, + "step": 210 + }, + { + "epoch": 0.2360178970917226, + "grad_norm": 0.3388952612876892, + "learning_rate": 4.994516408120432e-06, + "loss": 0.6821, + "step": 211 + }, + { + "epoch": 0.2371364653243848, + "grad_norm": 0.33224812150001526, + "learning_rate": 4.994417196557884e-06, + "loss": 0.649, + "step": 212 + }, + { + "epoch": 0.23825503355704697, + "grad_norm": 0.3307199478149414, + "learning_rate": 4.994317096537006e-06, + "loss": 0.6581, + "step": 213 + }, + { + "epoch": 0.23937360178970918, + "grad_norm": 0.3505164682865143, + "learning_rate": 4.994216108093452e-06, + "loss": 0.6498, + "step": 214 + }, + { + "epoch": 0.24049217002237136, + "grad_norm": 0.3284938335418701, + "learning_rate": 4.994114231263193e-06, + "loss": 0.6503, + "step": 215 + }, + { + "epoch": 0.24161073825503357, + "grad_norm": 0.3475011885166168, + "learning_rate": 4.994011466082514e-06, + "loss": 0.6724, + "step": 216 + }, + { + "epoch": 0.24272930648769575, + "grad_norm": 0.34667858481407166, + "learning_rate": 4.993907812588019e-06, + "loss": 0.6373, + "step": 217 + }, + { + "epoch": 0.24384787472035793, + "grad_norm": 0.34276899695396423, + "learning_rate": 4.993803270816627e-06, + "loss": 0.6513, + "step": 218 + }, + { + "epoch": 0.24496644295302014, + "grad_norm": 0.34308409690856934, + "learning_rate": 4.993697840805572e-06, + "loss": 0.6596, + "step": 219 + }, + { + "epoch": 0.24608501118568232, + "grad_norm": 0.33108261227607727, + "learning_rate": 4.9935915225924075e-06, + "loss": 0.6623, + "step": 220 + }, + { + "epoch": 0.24720357941834453, + "grad_norm": 0.3529888093471527, + "learning_rate": 4.9934843162150015e-06, + "loss": 0.658, + "step": 221 + }, + { + "epoch": 0.2483221476510067, + "grad_norm": 0.3457166850566864, + "learning_rate": 4.993376221711538e-06, + "loss": 0.6342, + "step": 222 + }, + { + "epoch": 0.2494407158836689, + "grad_norm": 0.35108813643455505, + "learning_rate": 4.993267239120519e-06, + "loss": 0.6325, + "step": 223 + }, + { + "epoch": 0.2505592841163311, + "grad_norm": 0.3448682129383087, + "learning_rate": 4.993157368480761e-06, + "loss": 0.6746, + "step": 224 + }, + { + "epoch": 0.2516778523489933, + "grad_norm": 0.34094589948654175, + "learning_rate": 4.993046609831397e-06, + "loss": 0.6313, + "step": 225 + }, + { + "epoch": 0.25279642058165547, + "grad_norm": 0.33934327960014343, + "learning_rate": 4.9929349632118785e-06, + "loss": 0.6371, + "step": 226 + }, + { + "epoch": 0.2539149888143177, + "grad_norm": 0.3517382740974426, + "learning_rate": 4.99282242866197e-06, + "loss": 0.6411, + "step": 227 + }, + { + "epoch": 0.2550335570469799, + "grad_norm": 0.34098172187805176, + "learning_rate": 4.992709006221755e-06, + "loss": 0.6648, + "step": 228 + }, + { + "epoch": 0.25615212527964204, + "grad_norm": 0.3397183120250702, + "learning_rate": 4.992594695931632e-06, + "loss": 0.6038, + "step": 229 + }, + { + "epoch": 0.25727069351230425, + "grad_norm": 0.35994404554367065, + "learning_rate": 4.992479497832316e-06, + "loss": 0.6832, + "step": 230 + }, + { + "epoch": 0.25838926174496646, + "grad_norm": 0.3505656123161316, + "learning_rate": 4.992363411964838e-06, + "loss": 0.682, + "step": 231 + }, + { + "epoch": 0.2595078299776286, + "grad_norm": 0.343159556388855, + "learning_rate": 4.992246438370545e-06, + "loss": 0.6597, + "step": 232 + }, + { + "epoch": 0.2606263982102908, + "grad_norm": 0.36491280794143677, + "learning_rate": 4.9921285770911e-06, + "loss": 0.6422, + "step": 233 + }, + { + "epoch": 0.26174496644295303, + "grad_norm": 0.3656606078147888, + "learning_rate": 4.992009828168484e-06, + "loss": 0.6988, + "step": 234 + }, + { + "epoch": 0.26286353467561524, + "grad_norm": 0.3348519206047058, + "learning_rate": 4.991890191644993e-06, + "loss": 0.6281, + "step": 235 + }, + { + "epoch": 0.2639821029082774, + "grad_norm": 0.3407367467880249, + "learning_rate": 4.991769667563237e-06, + "loss": 0.6487, + "step": 236 + }, + { + "epoch": 0.2651006711409396, + "grad_norm": 0.3644556999206543, + "learning_rate": 4.991648255966145e-06, + "loss": 0.6443, + "step": 237 + }, + { + "epoch": 0.2662192393736018, + "grad_norm": 0.3346659243106842, + "learning_rate": 4.991525956896962e-06, + "loss": 0.632, + "step": 238 + }, + { + "epoch": 0.26733780760626397, + "grad_norm": 0.36535120010375977, + "learning_rate": 4.991402770399249e-06, + "loss": 0.6347, + "step": 239 + }, + { + "epoch": 0.2684563758389262, + "grad_norm": 0.34253549575805664, + "learning_rate": 4.991278696516879e-06, + "loss": 0.6946, + "step": 240 + }, + { + "epoch": 0.2695749440715884, + "grad_norm": 0.35125476121902466, + "learning_rate": 4.9911537352940485e-06, + "loss": 0.6669, + "step": 241 + }, + { + "epoch": 0.27069351230425054, + "grad_norm": 0.35836276412010193, + "learning_rate": 4.991027886775264e-06, + "loss": 0.6534, + "step": 242 + }, + { + "epoch": 0.27181208053691275, + "grad_norm": 0.34344252943992615, + "learning_rate": 4.990901151005349e-06, + "loss": 0.6595, + "step": 243 + }, + { + "epoch": 0.27293064876957496, + "grad_norm": 0.35524797439575195, + "learning_rate": 4.9907735280294465e-06, + "loss": 0.6612, + "step": 244 + }, + { + "epoch": 0.2740492170022371, + "grad_norm": 0.3483973741531372, + "learning_rate": 4.990645017893013e-06, + "loss": 0.6694, + "step": 245 + }, + { + "epoch": 0.2751677852348993, + "grad_norm": 0.34605199098587036, + "learning_rate": 4.990515620641819e-06, + "loss": 0.6453, + "step": 246 + }, + { + "epoch": 0.27628635346756153, + "grad_norm": 0.3441944122314453, + "learning_rate": 4.990385336321954e-06, + "loss": 0.6356, + "step": 247 + }, + { + "epoch": 0.27740492170022374, + "grad_norm": 0.36233291029930115, + "learning_rate": 4.990254164979823e-06, + "loss": 0.673, + "step": 248 + }, + { + "epoch": 0.2785234899328859, + "grad_norm": 0.3624320328235626, + "learning_rate": 4.990122106662145e-06, + "loss": 0.6459, + "step": 249 + }, + { + "epoch": 0.2796420581655481, + "grad_norm": 0.3635922372341156, + "learning_rate": 4.989989161415959e-06, + "loss": 0.6552, + "step": 250 + }, + { + "epoch": 0.2807606263982103, + "grad_norm": 0.3370678424835205, + "learning_rate": 4.989855329288615e-06, + "loss": 0.6098, + "step": 251 + }, + { + "epoch": 0.28187919463087246, + "grad_norm": 0.35488393902778625, + "learning_rate": 4.989720610327782e-06, + "loss": 0.6554, + "step": 252 + }, + { + "epoch": 0.2829977628635347, + "grad_norm": 0.3495752513408661, + "learning_rate": 4.989585004581444e-06, + "loss": 0.6339, + "step": 253 + }, + { + "epoch": 0.2841163310961969, + "grad_norm": 0.3451905846595764, + "learning_rate": 4.989448512097901e-06, + "loss": 0.6954, + "step": 254 + }, + { + "epoch": 0.28523489932885904, + "grad_norm": 0.3532280921936035, + "learning_rate": 4.989311132925768e-06, + "loss": 0.6198, + "step": 255 + }, + { + "epoch": 0.28635346756152125, + "grad_norm": 0.3570882976055145, + "learning_rate": 4.989172867113976e-06, + "loss": 0.6492, + "step": 256 + }, + { + "epoch": 0.28747203579418346, + "grad_norm": 0.33201339840888977, + "learning_rate": 4.9890337147117755e-06, + "loss": 0.6324, + "step": 257 + }, + { + "epoch": 0.28859060402684567, + "grad_norm": 0.3354071080684662, + "learning_rate": 4.988893675768726e-06, + "loss": 0.628, + "step": 258 + }, + { + "epoch": 0.2897091722595078, + "grad_norm": 0.341133713722229, + "learning_rate": 4.988752750334708e-06, + "loss": 0.6316, + "step": 259 + }, + { + "epoch": 0.29082774049217003, + "grad_norm": 0.33613571524620056, + "learning_rate": 4.9886109384599165e-06, + "loss": 0.6401, + "step": 260 + }, + { + "epoch": 0.29194630872483224, + "grad_norm": 0.3657302260398865, + "learning_rate": 4.988468240194861e-06, + "loss": 0.6743, + "step": 261 + }, + { + "epoch": 0.2930648769574944, + "grad_norm": 0.3349529504776001, + "learning_rate": 4.988324655590369e-06, + "loss": 0.6121, + "step": 262 + }, + { + "epoch": 0.2941834451901566, + "grad_norm": 0.34364601969718933, + "learning_rate": 4.98818018469758e-06, + "loss": 0.6427, + "step": 263 + }, + { + "epoch": 0.2953020134228188, + "grad_norm": 0.35895341634750366, + "learning_rate": 4.988034827567953e-06, + "loss": 0.6913, + "step": 264 + }, + { + "epoch": 0.29642058165548096, + "grad_norm": 0.3571792244911194, + "learning_rate": 4.987888584253262e-06, + "loss": 0.6286, + "step": 265 + }, + { + "epoch": 0.2975391498881432, + "grad_norm": 0.3684253394603729, + "learning_rate": 4.987741454805594e-06, + "loss": 0.6365, + "step": 266 + }, + { + "epoch": 0.2986577181208054, + "grad_norm": 0.3521265983581543, + "learning_rate": 4.987593439277353e-06, + "loss": 0.6172, + "step": 267 + }, + { + "epoch": 0.29977628635346754, + "grad_norm": 0.34943872690200806, + "learning_rate": 4.98744453772126e-06, + "loss": 0.6382, + "step": 268 + }, + { + "epoch": 0.30089485458612975, + "grad_norm": 0.35075268149375916, + "learning_rate": 4.9872947501903515e-06, + "loss": 0.6497, + "step": 269 + }, + { + "epoch": 0.30201342281879195, + "grad_norm": 0.3558429479598999, + "learning_rate": 4.987144076737978e-06, + "loss": 0.6561, + "step": 270 + }, + { + "epoch": 0.30313199105145416, + "grad_norm": 0.35898861289024353, + "learning_rate": 4.986992517417805e-06, + "loss": 0.6613, + "step": 271 + }, + { + "epoch": 0.3042505592841163, + "grad_norm": 0.35135966539382935, + "learning_rate": 4.986840072283815e-06, + "loss": 0.6507, + "step": 272 + }, + { + "epoch": 0.3053691275167785, + "grad_norm": 0.3530326783657074, + "learning_rate": 4.986686741390308e-06, + "loss": 0.6459, + "step": 273 + }, + { + "epoch": 0.30648769574944074, + "grad_norm": 0.34631597995758057, + "learning_rate": 4.986532524791894e-06, + "loss": 0.6074, + "step": 274 + }, + { + "epoch": 0.3076062639821029, + "grad_norm": 0.37036123871803284, + "learning_rate": 4.986377422543503e-06, + "loss": 0.6416, + "step": 275 + }, + { + "epoch": 0.3087248322147651, + "grad_norm": 0.3576701283454895, + "learning_rate": 4.98622143470038e-06, + "loss": 0.5939, + "step": 276 + }, + { + "epoch": 0.3098434004474273, + "grad_norm": 0.35285863280296326, + "learning_rate": 4.986064561318083e-06, + "loss": 0.6405, + "step": 277 + }, + { + "epoch": 0.31096196868008946, + "grad_norm": 0.35576099157333374, + "learning_rate": 4.985906802452488e-06, + "loss": 0.6348, + "step": 278 + }, + { + "epoch": 0.31208053691275167, + "grad_norm": 0.35739952325820923, + "learning_rate": 4.985748158159785e-06, + "loss": 0.65, + "step": 279 + }, + { + "epoch": 0.3131991051454139, + "grad_norm": 0.35885411500930786, + "learning_rate": 4.985588628496481e-06, + "loss": 0.6575, + "step": 280 + }, + { + "epoch": 0.3143176733780761, + "grad_norm": 0.36760038137435913, + "learning_rate": 4.985428213519396e-06, + "loss": 0.6606, + "step": 281 + }, + { + "epoch": 0.31543624161073824, + "grad_norm": 0.35371875762939453, + "learning_rate": 4.9852669132856645e-06, + "loss": 0.6495, + "step": 282 + }, + { + "epoch": 0.31655480984340045, + "grad_norm": 0.3489198684692383, + "learning_rate": 4.985104727852741e-06, + "loss": 0.6402, + "step": 283 + }, + { + "epoch": 0.31767337807606266, + "grad_norm": 0.3595716953277588, + "learning_rate": 4.984941657278392e-06, + "loss": 0.6495, + "step": 284 + }, + { + "epoch": 0.3187919463087248, + "grad_norm": 0.3551527261734009, + "learning_rate": 4.984777701620698e-06, + "loss": 0.6555, + "step": 285 + }, + { + "epoch": 0.319910514541387, + "grad_norm": 0.35428524017333984, + "learning_rate": 4.984612860938059e-06, + "loss": 0.6435, + "step": 286 + }, + { + "epoch": 0.32102908277404923, + "grad_norm": 0.36495980620384216, + "learning_rate": 4.984447135289185e-06, + "loss": 0.6375, + "step": 287 + }, + { + "epoch": 0.3221476510067114, + "grad_norm": 0.36956459283828735, + "learning_rate": 4.984280524733107e-06, + "loss": 0.654, + "step": 288 + }, + { + "epoch": 0.3232662192393736, + "grad_norm": 0.34770330786705017, + "learning_rate": 4.984113029329166e-06, + "loss": 0.6313, + "step": 289 + }, + { + "epoch": 0.3243847874720358, + "grad_norm": 0.3676060140132904, + "learning_rate": 4.9839446491370215e-06, + "loss": 0.6697, + "step": 290 + }, + { + "epoch": 0.32550335570469796, + "grad_norm": 0.3490992486476898, + "learning_rate": 4.983775384216646e-06, + "loss": 0.6343, + "step": 291 + }, + { + "epoch": 0.32662192393736017, + "grad_norm": 0.36720773577690125, + "learning_rate": 4.983605234628328e-06, + "loss": 0.6609, + "step": 292 + }, + { + "epoch": 0.3277404921700224, + "grad_norm": 0.3619595766067505, + "learning_rate": 4.983434200432672e-06, + "loss": 0.6635, + "step": 293 + }, + { + "epoch": 0.3288590604026846, + "grad_norm": 0.35261741280555725, + "learning_rate": 4.983262281690596e-06, + "loss": 0.6273, + "step": 294 + }, + { + "epoch": 0.32997762863534674, + "grad_norm": 0.34182801842689514, + "learning_rate": 4.983089478463335e-06, + "loss": 0.6271, + "step": 295 + }, + { + "epoch": 0.33109619686800895, + "grad_norm": 0.3623373806476593, + "learning_rate": 4.982915790812436e-06, + "loss": 0.6491, + "step": 296 + }, + { + "epoch": 0.33221476510067116, + "grad_norm": 0.3656613826751709, + "learning_rate": 4.982741218799763e-06, + "loss": 0.6672, + "step": 297 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 0.3533206880092621, + "learning_rate": 4.982565762487498e-06, + "loss": 0.6257, + "step": 298 + }, + { + "epoch": 0.3344519015659955, + "grad_norm": 0.35345301032066345, + "learning_rate": 4.982389421938131e-06, + "loss": 0.6486, + "step": 299 + }, + { + "epoch": 0.33557046979865773, + "grad_norm": 0.36566492915153503, + "learning_rate": 4.982212197214472e-06, + "loss": 0.6411, + "step": 300 + }, + { + "epoch": 0.3366890380313199, + "grad_norm": 0.3681536018848419, + "learning_rate": 4.982034088379646e-06, + "loss": 0.6208, + "step": 301 + }, + { + "epoch": 0.3378076062639821, + "grad_norm": 0.3649962246417999, + "learning_rate": 4.98185509549709e-06, + "loss": 0.6557, + "step": 302 + }, + { + "epoch": 0.3389261744966443, + "grad_norm": 0.350563108921051, + "learning_rate": 4.981675218630557e-06, + "loss": 0.6361, + "step": 303 + }, + { + "epoch": 0.3400447427293065, + "grad_norm": 0.35432034730911255, + "learning_rate": 4.981494457844117e-06, + "loss": 0.6294, + "step": 304 + }, + { + "epoch": 0.34116331096196867, + "grad_norm": 0.3590239882469177, + "learning_rate": 4.981312813202153e-06, + "loss": 0.6318, + "step": 305 + }, + { + "epoch": 0.3422818791946309, + "grad_norm": 0.3664059042930603, + "learning_rate": 4.981130284769361e-06, + "loss": 0.648, + "step": 306 + }, + { + "epoch": 0.3434004474272931, + "grad_norm": 0.35878944396972656, + "learning_rate": 4.9809468726107555e-06, + "loss": 0.619, + "step": 307 + }, + { + "epoch": 0.34451901565995524, + "grad_norm": 0.35763007402420044, + "learning_rate": 4.980762576791664e-06, + "loss": 0.6567, + "step": 308 + }, + { + "epoch": 0.34563758389261745, + "grad_norm": 0.35878288745880127, + "learning_rate": 4.980577397377728e-06, + "loss": 0.6421, + "step": 309 + }, + { + "epoch": 0.34675615212527966, + "grad_norm": 0.3597519099712372, + "learning_rate": 4.980391334434906e-06, + "loss": 0.6352, + "step": 310 + }, + { + "epoch": 0.3478747203579418, + "grad_norm": 0.3498169481754303, + "learning_rate": 4.980204388029466e-06, + "loss": 0.626, + "step": 311 + }, + { + "epoch": 0.348993288590604, + "grad_norm": 0.36353105306625366, + "learning_rate": 4.980016558227998e-06, + "loss": 0.6513, + "step": 312 + }, + { + "epoch": 0.35011185682326623, + "grad_norm": 0.355794221162796, + "learning_rate": 4.979827845097402e-06, + "loss": 0.6385, + "step": 313 + }, + { + "epoch": 0.3512304250559284, + "grad_norm": 0.3594406545162201, + "learning_rate": 4.979638248704894e-06, + "loss": 0.6134, + "step": 314 + }, + { + "epoch": 0.3523489932885906, + "grad_norm": 0.3639025092124939, + "learning_rate": 4.979447769118002e-06, + "loss": 0.6386, + "step": 315 + }, + { + "epoch": 0.3534675615212528, + "grad_norm": 0.3681359887123108, + "learning_rate": 4.979256406404574e-06, + "loss": 0.6213, + "step": 316 + }, + { + "epoch": 0.354586129753915, + "grad_norm": 0.35021546483039856, + "learning_rate": 4.979064160632766e-06, + "loss": 0.5933, + "step": 317 + }, + { + "epoch": 0.35570469798657717, + "grad_norm": 0.35783469676971436, + "learning_rate": 4.978871031871054e-06, + "loss": 0.6054, + "step": 318 + }, + { + "epoch": 0.3568232662192394, + "grad_norm": 0.3768575191497803, + "learning_rate": 4.978677020188226e-06, + "loss": 0.651, + "step": 319 + }, + { + "epoch": 0.3579418344519016, + "grad_norm": 0.3841581642627716, + "learning_rate": 4.978482125653385e-06, + "loss": 0.6447, + "step": 320 + }, + { + "epoch": 0.35906040268456374, + "grad_norm": 0.3749678134918213, + "learning_rate": 4.978286348335949e-06, + "loss": 0.6403, + "step": 321 + }, + { + "epoch": 0.36017897091722595, + "grad_norm": 0.35757121443748474, + "learning_rate": 4.978089688305647e-06, + "loss": 0.6297, + "step": 322 + }, + { + "epoch": 0.36129753914988816, + "grad_norm": 0.36303797364234924, + "learning_rate": 4.977892145632528e-06, + "loss": 0.6438, + "step": 323 + }, + { + "epoch": 0.3624161073825503, + "grad_norm": 0.3670295476913452, + "learning_rate": 4.977693720386951e-06, + "loss": 0.6055, + "step": 324 + }, + { + "epoch": 0.3635346756152125, + "grad_norm": 0.3521486818790436, + "learning_rate": 4.977494412639591e-06, + "loss": 0.6072, + "step": 325 + }, + { + "epoch": 0.36465324384787473, + "grad_norm": 0.35688498616218567, + "learning_rate": 4.9772942224614375e-06, + "loss": 0.6252, + "step": 326 + }, + { + "epoch": 0.36577181208053694, + "grad_norm": 0.3542022109031677, + "learning_rate": 4.9770931499237925e-06, + "loss": 0.6407, + "step": 327 + }, + { + "epoch": 0.3668903803131991, + "grad_norm": 0.3773319125175476, + "learning_rate": 4.976891195098277e-06, + "loss": 0.6524, + "step": 328 + }, + { + "epoch": 0.3680089485458613, + "grad_norm": 0.3697628080844879, + "learning_rate": 4.97668835805682e-06, + "loss": 0.6503, + "step": 329 + }, + { + "epoch": 0.3691275167785235, + "grad_norm": 0.36458712816238403, + "learning_rate": 4.976484638871669e-06, + "loss": 0.6722, + "step": 330 + }, + { + "epoch": 0.37024608501118567, + "grad_norm": 0.3523670434951782, + "learning_rate": 4.976280037615385e-06, + "loss": 0.6273, + "step": 331 + }, + { + "epoch": 0.3713646532438479, + "grad_norm": 0.35473543405532837, + "learning_rate": 4.9760745543608414e-06, + "loss": 0.6243, + "step": 332 + }, + { + "epoch": 0.3724832214765101, + "grad_norm": 0.36719077825546265, + "learning_rate": 4.9758681891812276e-06, + "loss": 0.6476, + "step": 333 + }, + { + "epoch": 0.37360178970917224, + "grad_norm": 0.3712293207645416, + "learning_rate": 4.9756609421500464e-06, + "loss": 0.6475, + "step": 334 + }, + { + "epoch": 0.37472035794183445, + "grad_norm": 0.36749109625816345, + "learning_rate": 4.9754528133411144e-06, + "loss": 0.6428, + "step": 335 + }, + { + "epoch": 0.37583892617449666, + "grad_norm": 0.35749316215515137, + "learning_rate": 4.975243802828563e-06, + "loss": 0.6123, + "step": 336 + }, + { + "epoch": 0.3769574944071588, + "grad_norm": 0.37466734647750854, + "learning_rate": 4.975033910686837e-06, + "loss": 0.6393, + "step": 337 + }, + { + "epoch": 0.378076062639821, + "grad_norm": 0.36750441789627075, + "learning_rate": 4.974823136990697e-06, + "loss": 0.6405, + "step": 338 + }, + { + "epoch": 0.37919463087248323, + "grad_norm": 0.3831922113895416, + "learning_rate": 4.9746114818152135e-06, + "loss": 0.6633, + "step": 339 + }, + { + "epoch": 0.38031319910514544, + "grad_norm": 0.35679274797439575, + "learning_rate": 4.974398945235776e-06, + "loss": 0.6431, + "step": 340 + }, + { + "epoch": 0.3814317673378076, + "grad_norm": 0.36524152755737305, + "learning_rate": 4.974185527328084e-06, + "loss": 0.6419, + "step": 341 + }, + { + "epoch": 0.3825503355704698, + "grad_norm": 0.3668903410434723, + "learning_rate": 4.9739712281681525e-06, + "loss": 0.6418, + "step": 342 + }, + { + "epoch": 0.383668903803132, + "grad_norm": 0.37841862440109253, + "learning_rate": 4.973756047832312e-06, + "loss": 0.6585, + "step": 343 + }, + { + "epoch": 0.38478747203579416, + "grad_norm": 0.37758868932724, + "learning_rate": 4.9735399863972024e-06, + "loss": 0.6493, + "step": 344 + }, + { + "epoch": 0.3859060402684564, + "grad_norm": 0.3663494288921356, + "learning_rate": 4.973323043939783e-06, + "loss": 0.6728, + "step": 345 + }, + { + "epoch": 0.3870246085011186, + "grad_norm": 0.3930216431617737, + "learning_rate": 4.973105220537322e-06, + "loss": 0.6608, + "step": 346 + }, + { + "epoch": 0.38814317673378074, + "grad_norm": 0.390828400850296, + "learning_rate": 4.972886516267404e-06, + "loss": 0.6497, + "step": 347 + }, + { + "epoch": 0.38926174496644295, + "grad_norm": 0.37600764632225037, + "learning_rate": 4.972666931207927e-06, + "loss": 0.6426, + "step": 348 + }, + { + "epoch": 0.39038031319910516, + "grad_norm": 0.36520275473594666, + "learning_rate": 4.972446465437103e-06, + "loss": 0.645, + "step": 349 + }, + { + "epoch": 0.39149888143176736, + "grad_norm": 0.3984422981739044, + "learning_rate": 4.972225119033457e-06, + "loss": 0.6368, + "step": 350 + }, + { + "epoch": 0.3926174496644295, + "grad_norm": 0.3725559711456299, + "learning_rate": 4.972002892075827e-06, + "loss": 0.625, + "step": 351 + }, + { + "epoch": 0.39373601789709173, + "grad_norm": 0.3889387547969818, + "learning_rate": 4.9717797846433655e-06, + "loss": 0.6258, + "step": 352 + }, + { + "epoch": 0.39485458612975394, + "grad_norm": 0.37537676095962524, + "learning_rate": 4.97155579681554e-06, + "loss": 0.64, + "step": 353 + }, + { + "epoch": 0.3959731543624161, + "grad_norm": 0.3795606791973114, + "learning_rate": 4.97133092867213e-06, + "loss": 0.6849, + "step": 354 + }, + { + "epoch": 0.3970917225950783, + "grad_norm": 0.38519197702407837, + "learning_rate": 4.971105180293228e-06, + "loss": 0.6493, + "step": 355 + }, + { + "epoch": 0.3982102908277405, + "grad_norm": 0.3789883553981781, + "learning_rate": 4.97087855175924e-06, + "loss": 0.6281, + "step": 356 + }, + { + "epoch": 0.39932885906040266, + "grad_norm": 0.3687867820262909, + "learning_rate": 4.970651043150887e-06, + "loss": 0.6278, + "step": 357 + }, + { + "epoch": 0.4004474272930649, + "grad_norm": 0.365581214427948, + "learning_rate": 4.970422654549204e-06, + "loss": 0.647, + "step": 358 + }, + { + "epoch": 0.4015659955257271, + "grad_norm": 0.37871256470680237, + "learning_rate": 4.970193386035537e-06, + "loss": 0.6349, + "step": 359 + }, + { + "epoch": 0.40268456375838924, + "grad_norm": 0.37639445066452026, + "learning_rate": 4.969963237691547e-06, + "loss": 0.6544, + "step": 360 + }, + { + "epoch": 0.40380313199105144, + "grad_norm": 0.38033226132392883, + "learning_rate": 4.9697322095992075e-06, + "loss": 0.6216, + "step": 361 + }, + { + "epoch": 0.40492170022371365, + "grad_norm": 0.3785533010959625, + "learning_rate": 4.969500301840805e-06, + "loss": 0.6379, + "step": 362 + }, + { + "epoch": 0.40604026845637586, + "grad_norm": 0.36831173300743103, + "learning_rate": 4.969267514498942e-06, + "loss": 0.6305, + "step": 363 + }, + { + "epoch": 0.407158836689038, + "grad_norm": 0.3860856592655182, + "learning_rate": 4.969033847656531e-06, + "loss": 0.6428, + "step": 364 + }, + { + "epoch": 0.4082774049217002, + "grad_norm": 0.37992534041404724, + "learning_rate": 4.9687993013968e-06, + "loss": 0.629, + "step": 365 + }, + { + "epoch": 0.40939597315436244, + "grad_norm": 0.37179872393608093, + "learning_rate": 4.9685638758032885e-06, + "loss": 0.6146, + "step": 366 + }, + { + "epoch": 0.4105145413870246, + "grad_norm": 0.3762771487236023, + "learning_rate": 4.96832757095985e-06, + "loss": 0.6294, + "step": 367 + }, + { + "epoch": 0.4116331096196868, + "grad_norm": 0.37078356742858887, + "learning_rate": 4.968090386950653e-06, + "loss": 0.6438, + "step": 368 + }, + { + "epoch": 0.412751677852349, + "grad_norm": 0.3619535565376282, + "learning_rate": 4.967852323860176e-06, + "loss": 0.6229, + "step": 369 + }, + { + "epoch": 0.41387024608501116, + "grad_norm": 0.3610592782497406, + "learning_rate": 4.967613381773211e-06, + "loss": 0.6332, + "step": 370 + }, + { + "epoch": 0.41498881431767337, + "grad_norm": 0.36372244358062744, + "learning_rate": 4.9673735607748665e-06, + "loss": 0.6379, + "step": 371 + }, + { + "epoch": 0.4161073825503356, + "grad_norm": 0.3713506758213043, + "learning_rate": 4.96713286095056e-06, + "loss": 0.6051, + "step": 372 + }, + { + "epoch": 0.4172259507829978, + "grad_norm": 0.37290191650390625, + "learning_rate": 4.9668912823860244e-06, + "loss": 0.6431, + "step": 373 + }, + { + "epoch": 0.41834451901565994, + "grad_norm": 0.3736407458782196, + "learning_rate": 4.966648825167305e-06, + "loss": 0.6296, + "step": 374 + }, + { + "epoch": 0.41946308724832215, + "grad_norm": 0.38261404633522034, + "learning_rate": 4.9664054893807586e-06, + "loss": 0.6559, + "step": 375 + }, + { + "epoch": 0.42058165548098436, + "grad_norm": 0.36865612864494324, + "learning_rate": 4.966161275113057e-06, + "loss": 0.6372, + "step": 376 + }, + { + "epoch": 0.4217002237136465, + "grad_norm": 0.3745094835758209, + "learning_rate": 4.965916182451185e-06, + "loss": 0.6526, + "step": 377 + }, + { + "epoch": 0.4228187919463087, + "grad_norm": 0.3758225440979004, + "learning_rate": 4.965670211482437e-06, + "loss": 0.6423, + "step": 378 + }, + { + "epoch": 0.42393736017897093, + "grad_norm": 0.37716934084892273, + "learning_rate": 4.965423362294426e-06, + "loss": 0.6431, + "step": 379 + }, + { + "epoch": 0.4250559284116331, + "grad_norm": 0.3757461905479431, + "learning_rate": 4.965175634975072e-06, + "loss": 0.6335, + "step": 380 + }, + { + "epoch": 0.4261744966442953, + "grad_norm": 0.3701077401638031, + "learning_rate": 4.964927029612611e-06, + "loss": 0.6182, + "step": 381 + }, + { + "epoch": 0.4272930648769575, + "grad_norm": 0.38263121247291565, + "learning_rate": 4.96467754629559e-06, + "loss": 0.6371, + "step": 382 + }, + { + "epoch": 0.42841163310961966, + "grad_norm": 0.3740926682949066, + "learning_rate": 4.9644271851128715e-06, + "loss": 0.6272, + "step": 383 + }, + { + "epoch": 0.42953020134228187, + "grad_norm": 0.39056089520454407, + "learning_rate": 4.964175946153627e-06, + "loss": 0.624, + "step": 384 + }, + { + "epoch": 0.4306487695749441, + "grad_norm": 0.3867873549461365, + "learning_rate": 4.963923829507343e-06, + "loss": 0.6714, + "step": 385 + }, + { + "epoch": 0.4317673378076063, + "grad_norm": 0.3808860182762146, + "learning_rate": 4.963670835263819e-06, + "loss": 0.6412, + "step": 386 + }, + { + "epoch": 0.43288590604026844, + "grad_norm": 0.3839844763278961, + "learning_rate": 4.963416963513166e-06, + "loss": 0.6288, + "step": 387 + }, + { + "epoch": 0.43400447427293065, + "grad_norm": 0.37187114357948303, + "learning_rate": 4.963162214345806e-06, + "loss": 0.6307, + "step": 388 + }, + { + "epoch": 0.43512304250559286, + "grad_norm": 0.36723873019218445, + "learning_rate": 4.962906587852477e-06, + "loss": 0.6285, + "step": 389 + }, + { + "epoch": 0.436241610738255, + "grad_norm": 0.3754160404205322, + "learning_rate": 4.962650084124226e-06, + "loss": 0.6227, + "step": 390 + }, + { + "epoch": 0.4373601789709172, + "grad_norm": 0.374239981174469, + "learning_rate": 4.962392703252417e-06, + "loss": 0.6612, + "step": 391 + }, + { + "epoch": 0.43847874720357943, + "grad_norm": 0.3758632242679596, + "learning_rate": 4.9621344453287214e-06, + "loss": 0.6408, + "step": 392 + }, + { + "epoch": 0.4395973154362416, + "grad_norm": 0.3839190602302551, + "learning_rate": 4.9618753104451254e-06, + "loss": 0.6524, + "step": 393 + }, + { + "epoch": 0.4407158836689038, + "grad_norm": 0.36766374111175537, + "learning_rate": 4.961615298693928e-06, + "loss": 0.6232, + "step": 394 + }, + { + "epoch": 0.441834451901566, + "grad_norm": 0.3692423701286316, + "learning_rate": 4.961354410167739e-06, + "loss": 0.6436, + "step": 395 + }, + { + "epoch": 0.4429530201342282, + "grad_norm": 0.3720521926879883, + "learning_rate": 4.961092644959482e-06, + "loss": 0.6346, + "step": 396 + }, + { + "epoch": 0.44407158836689037, + "grad_norm": 0.373910129070282, + "learning_rate": 4.960830003162392e-06, + "loss": 0.6211, + "step": 397 + }, + { + "epoch": 0.4451901565995526, + "grad_norm": 0.37455853819847107, + "learning_rate": 4.960566484870017e-06, + "loss": 0.6366, + "step": 398 + }, + { + "epoch": 0.4463087248322148, + "grad_norm": 0.387390673160553, + "learning_rate": 4.960302090176215e-06, + "loss": 0.6543, + "step": 399 + }, + { + "epoch": 0.44742729306487694, + "grad_norm": 0.3862502872943878, + "learning_rate": 4.960036819175159e-06, + "loss": 0.6351, + "step": 400 + }, + { + "epoch": 0.44854586129753915, + "grad_norm": 0.38686901330947876, + "learning_rate": 4.959770671961334e-06, + "loss": 0.6247, + "step": 401 + }, + { + "epoch": 0.44966442953020136, + "grad_norm": 0.38111770153045654, + "learning_rate": 4.959503648629534e-06, + "loss": 0.6624, + "step": 402 + }, + { + "epoch": 0.4507829977628635, + "grad_norm": 0.3962753713130951, + "learning_rate": 4.959235749274866e-06, + "loss": 0.6224, + "step": 403 + }, + { + "epoch": 0.4519015659955257, + "grad_norm": 0.36403393745422363, + "learning_rate": 4.958966973992754e-06, + "loss": 0.6215, + "step": 404 + }, + { + "epoch": 0.45302013422818793, + "grad_norm": 0.3858584463596344, + "learning_rate": 4.958697322878926e-06, + "loss": 0.6473, + "step": 405 + }, + { + "epoch": 0.4541387024608501, + "grad_norm": 0.39325979351997375, + "learning_rate": 4.958426796029429e-06, + "loss": 0.6664, + "step": 406 + }, + { + "epoch": 0.4552572706935123, + "grad_norm": 0.37423112988471985, + "learning_rate": 4.958155393540618e-06, + "loss": 0.6416, + "step": 407 + }, + { + "epoch": 0.4563758389261745, + "grad_norm": 0.3979191482067108, + "learning_rate": 4.9578831155091585e-06, + "loss": 0.6493, + "step": 408 + }, + { + "epoch": 0.4574944071588367, + "grad_norm": 0.375473290681839, + "learning_rate": 4.957609962032034e-06, + "loss": 0.6246, + "step": 409 + }, + { + "epoch": 0.45861297539149887, + "grad_norm": 0.37951260805130005, + "learning_rate": 4.957335933206533e-06, + "loss": 0.6374, + "step": 410 + }, + { + "epoch": 0.4597315436241611, + "grad_norm": 0.384162575006485, + "learning_rate": 4.9570610291302605e-06, + "loss": 0.6411, + "step": 411 + }, + { + "epoch": 0.4608501118568233, + "grad_norm": 0.37713801860809326, + "learning_rate": 4.95678524990113e-06, + "loss": 0.6384, + "step": 412 + }, + { + "epoch": 0.46196868008948544, + "grad_norm": 0.3779420554637909, + "learning_rate": 4.95650859561737e-06, + "loss": 0.6238, + "step": 413 + }, + { + "epoch": 0.46308724832214765, + "grad_norm": 0.3826324939727783, + "learning_rate": 4.956231066377517e-06, + "loss": 0.6373, + "step": 414 + }, + { + "epoch": 0.46420581655480986, + "grad_norm": 0.3693124055862427, + "learning_rate": 4.955952662280422e-06, + "loss": 0.6264, + "step": 415 + }, + { + "epoch": 0.465324384787472, + "grad_norm": 0.3891177177429199, + "learning_rate": 4.9556733834252465e-06, + "loss": 0.6755, + "step": 416 + }, + { + "epoch": 0.4664429530201342, + "grad_norm": 0.3732079863548279, + "learning_rate": 4.955393229911465e-06, + "loss": 0.6163, + "step": 417 + }, + { + "epoch": 0.46756152125279643, + "grad_norm": 0.39267081022262573, + "learning_rate": 4.955112201838859e-06, + "loss": 0.653, + "step": 418 + }, + { + "epoch": 0.46868008948545864, + "grad_norm": 0.37127041816711426, + "learning_rate": 4.9548302993075275e-06, + "loss": 0.6024, + "step": 419 + }, + { + "epoch": 0.4697986577181208, + "grad_norm": 0.38274380564689636, + "learning_rate": 4.954547522417878e-06, + "loss": 0.6103, + "step": 420 + }, + { + "epoch": 0.470917225950783, + "grad_norm": 0.39440205693244934, + "learning_rate": 4.954263871270627e-06, + "loss": 0.6388, + "step": 421 + }, + { + "epoch": 0.4720357941834452, + "grad_norm": 0.38207298517227173, + "learning_rate": 4.953979345966808e-06, + "loss": 0.6157, + "step": 422 + }, + { + "epoch": 0.47315436241610737, + "grad_norm": 0.37390536069869995, + "learning_rate": 4.953693946607762e-06, + "loss": 0.612, + "step": 423 + }, + { + "epoch": 0.4742729306487696, + "grad_norm": 0.3679952621459961, + "learning_rate": 4.953407673295141e-06, + "loss": 0.5962, + "step": 424 + }, + { + "epoch": 0.4753914988814318, + "grad_norm": 0.36741313338279724, + "learning_rate": 4.953120526130911e-06, + "loss": 0.5802, + "step": 425 + }, + { + "epoch": 0.47651006711409394, + "grad_norm": 0.40101951360702515, + "learning_rate": 4.952832505217347e-06, + "loss": 0.631, + "step": 426 + }, + { + "epoch": 0.47762863534675615, + "grad_norm": 0.37646785378456116, + "learning_rate": 4.952543610657036e-06, + "loss": 0.6192, + "step": 427 + }, + { + "epoch": 0.47874720357941836, + "grad_norm": 0.3909439444541931, + "learning_rate": 4.952253842552876e-06, + "loss": 0.6288, + "step": 428 + }, + { + "epoch": 0.4798657718120805, + "grad_norm": 0.379685640335083, + "learning_rate": 4.9519632010080765e-06, + "loss": 0.6296, + "step": 429 + }, + { + "epoch": 0.4809843400447427, + "grad_norm": 0.3872782588005066, + "learning_rate": 4.9516716861261575e-06, + "loss": 0.6307, + "step": 430 + }, + { + "epoch": 0.48210290827740493, + "grad_norm": 0.4066009223461151, + "learning_rate": 4.951379298010951e-06, + "loss": 0.6454, + "step": 431 + }, + { + "epoch": 0.48322147651006714, + "grad_norm": 0.38412487506866455, + "learning_rate": 4.951086036766599e-06, + "loss": 0.6254, + "step": 432 + }, + { + "epoch": 0.4843400447427293, + "grad_norm": 0.37819865345954895, + "learning_rate": 4.9507919024975545e-06, + "loss": 0.629, + "step": 433 + }, + { + "epoch": 0.4854586129753915, + "grad_norm": 0.38674691319465637, + "learning_rate": 4.950496895308582e-06, + "loss": 0.6357, + "step": 434 + }, + { + "epoch": 0.4865771812080537, + "grad_norm": 0.39304593205451965, + "learning_rate": 4.950201015304758e-06, + "loss": 0.6475, + "step": 435 + }, + { + "epoch": 0.48769574944071586, + "grad_norm": 0.381124347448349, + "learning_rate": 4.949904262591467e-06, + "loss": 0.6523, + "step": 436 + }, + { + "epoch": 0.4888143176733781, + "grad_norm": 0.4084749221801758, + "learning_rate": 4.949606637274408e-06, + "loss": 0.6773, + "step": 437 + }, + { + "epoch": 0.4899328859060403, + "grad_norm": 0.3967250883579254, + "learning_rate": 4.949308139459586e-06, + "loss": 0.6263, + "step": 438 + }, + { + "epoch": 0.49105145413870244, + "grad_norm": 0.39761948585510254, + "learning_rate": 4.949008769253322e-06, + "loss": 0.6273, + "step": 439 + }, + { + "epoch": 0.49217002237136465, + "grad_norm": 0.3865715265274048, + "learning_rate": 4.948708526762244e-06, + "loss": 0.6464, + "step": 440 + }, + { + "epoch": 0.49328859060402686, + "grad_norm": 0.3970697820186615, + "learning_rate": 4.948407412093292e-06, + "loss": 0.6229, + "step": 441 + }, + { + "epoch": 0.49440715883668906, + "grad_norm": 0.3817065954208374, + "learning_rate": 4.948105425353718e-06, + "loss": 0.6375, + "step": 442 + }, + { + "epoch": 0.4955257270693512, + "grad_norm": 0.3877985179424286, + "learning_rate": 4.947802566651082e-06, + "loss": 0.6389, + "step": 443 + }, + { + "epoch": 0.4966442953020134, + "grad_norm": 0.40800127387046814, + "learning_rate": 4.947498836093257e-06, + "loss": 0.6627, + "step": 444 + }, + { + "epoch": 0.49776286353467564, + "grad_norm": 0.40732380747795105, + "learning_rate": 4.947194233788423e-06, + "loss": 0.6156, + "step": 445 + }, + { + "epoch": 0.4988814317673378, + "grad_norm": 0.3948177695274353, + "learning_rate": 4.946888759845074e-06, + "loss": 0.6481, + "step": 446 + }, + { + "epoch": 0.5, + "grad_norm": 0.38517189025878906, + "learning_rate": 4.9465824143720145e-06, + "loss": 0.6224, + "step": 447 + }, + { + "epoch": 0.5011185682326622, + "grad_norm": 0.3713424503803253, + "learning_rate": 4.946275197478358e-06, + "loss": 0.626, + "step": 448 + }, + { + "epoch": 0.5022371364653244, + "grad_norm": 0.4172223210334778, + "learning_rate": 4.945967109273527e-06, + "loss": 0.6405, + "step": 449 + }, + { + "epoch": 0.5033557046979866, + "grad_norm": 0.4550599157810211, + "learning_rate": 4.945658149867257e-06, + "loss": 0.6103, + "step": 450 + }, + { + "epoch": 0.5044742729306487, + "grad_norm": 0.3938581347465515, + "learning_rate": 4.945348319369593e-06, + "loss": 0.6304, + "step": 451 + }, + { + "epoch": 0.5055928411633109, + "grad_norm": 0.3923262059688568, + "learning_rate": 4.94503761789089e-06, + "loss": 0.6603, + "step": 452 + }, + { + "epoch": 0.5067114093959731, + "grad_norm": 0.3978983163833618, + "learning_rate": 4.944726045541814e-06, + "loss": 0.6445, + "step": 453 + }, + { + "epoch": 0.5078299776286354, + "grad_norm": 0.4101882576942444, + "learning_rate": 4.9444136024333374e-06, + "loss": 0.6223, + "step": 454 + }, + { + "epoch": 0.5089485458612976, + "grad_norm": 0.4056575298309326, + "learning_rate": 4.944100288676749e-06, + "loss": 0.6343, + "step": 455 + }, + { + "epoch": 0.5100671140939598, + "grad_norm": 0.39720436930656433, + "learning_rate": 4.943786104383644e-06, + "loss": 0.6246, + "step": 456 + }, + { + "epoch": 0.5111856823266219, + "grad_norm": 0.3909725248813629, + "learning_rate": 4.943471049665925e-06, + "loss": 0.6339, + "step": 457 + }, + { + "epoch": 0.5123042505592841, + "grad_norm": 0.3773731291294098, + "learning_rate": 4.943155124635812e-06, + "loss": 0.6215, + "step": 458 + }, + { + "epoch": 0.5134228187919463, + "grad_norm": 0.4020001292228699, + "learning_rate": 4.9428383294058295e-06, + "loss": 0.6269, + "step": 459 + }, + { + "epoch": 0.5145413870246085, + "grad_norm": 0.3916706144809723, + "learning_rate": 4.942520664088812e-06, + "loss": 0.6233, + "step": 460 + }, + { + "epoch": 0.5156599552572707, + "grad_norm": 0.38717713952064514, + "learning_rate": 4.9422021287979076e-06, + "loss": 0.6216, + "step": 461 + }, + { + "epoch": 0.5167785234899329, + "grad_norm": 0.38645485043525696, + "learning_rate": 4.941882723646568e-06, + "loss": 0.6092, + "step": 462 + }, + { + "epoch": 0.5178970917225951, + "grad_norm": 0.38496407866477966, + "learning_rate": 4.9415624487485615e-06, + "loss": 0.6368, + "step": 463 + }, + { + "epoch": 0.5190156599552572, + "grad_norm": 0.3946744501590729, + "learning_rate": 4.941241304217962e-06, + "loss": 0.6525, + "step": 464 + }, + { + "epoch": 0.5201342281879194, + "grad_norm": 0.3994438648223877, + "learning_rate": 4.940919290169155e-06, + "loss": 0.6314, + "step": 465 + }, + { + "epoch": 0.5212527964205816, + "grad_norm": 0.3929794728755951, + "learning_rate": 4.940596406716834e-06, + "loss": 0.6148, + "step": 466 + }, + { + "epoch": 0.5223713646532439, + "grad_norm": 0.42620542645454407, + "learning_rate": 4.940272653976005e-06, + "loss": 0.6468, + "step": 467 + }, + { + "epoch": 0.5234899328859061, + "grad_norm": 0.4014374613761902, + "learning_rate": 4.9399480320619805e-06, + "loss": 0.6451, + "step": 468 + }, + { + "epoch": 0.5246085011185683, + "grad_norm": 0.39342424273490906, + "learning_rate": 4.939622541090384e-06, + "loss": 0.6696, + "step": 469 + }, + { + "epoch": 0.5257270693512305, + "grad_norm": 0.3870956301689148, + "learning_rate": 4.939296181177149e-06, + "loss": 0.6451, + "step": 470 + }, + { + "epoch": 0.5268456375838926, + "grad_norm": 0.39973214268684387, + "learning_rate": 4.938968952438518e-06, + "loss": 0.6254, + "step": 471 + }, + { + "epoch": 0.5279642058165548, + "grad_norm": 0.3956799805164337, + "learning_rate": 4.938640854991041e-06, + "loss": 0.6169, + "step": 472 + }, + { + "epoch": 0.529082774049217, + "grad_norm": 0.38881829380989075, + "learning_rate": 4.938311888951583e-06, + "loss": 0.5989, + "step": 473 + }, + { + "epoch": 0.5302013422818792, + "grad_norm": 0.392107218503952, + "learning_rate": 4.93798205443731e-06, + "loss": 0.6284, + "step": 474 + }, + { + "epoch": 0.5313199105145414, + "grad_norm": 0.4042797088623047, + "learning_rate": 4.937651351565707e-06, + "loss": 0.6235, + "step": 475 + }, + { + "epoch": 0.5324384787472036, + "grad_norm": 0.380206435918808, + "learning_rate": 4.937319780454559e-06, + "loss": 0.5894, + "step": 476 + }, + { + "epoch": 0.5335570469798657, + "grad_norm": 0.3989536166191101, + "learning_rate": 4.936987341221968e-06, + "loss": 0.6522, + "step": 477 + }, + { + "epoch": 0.5346756152125279, + "grad_norm": 0.38699498772621155, + "learning_rate": 4.9366540339863395e-06, + "loss": 0.6202, + "step": 478 + }, + { + "epoch": 0.5357941834451901, + "grad_norm": 0.4171985387802124, + "learning_rate": 4.936319858866391e-06, + "loss": 0.624, + "step": 479 + }, + { + "epoch": 0.5369127516778524, + "grad_norm": 0.3932148218154907, + "learning_rate": 4.93598481598115e-06, + "loss": 0.6215, + "step": 480 + }, + { + "epoch": 0.5380313199105146, + "grad_norm": 0.3934101462364197, + "learning_rate": 4.935648905449949e-06, + "loss": 0.6402, + "step": 481 + }, + { + "epoch": 0.5391498881431768, + "grad_norm": 0.3917444348335266, + "learning_rate": 4.935312127392434e-06, + "loss": 0.641, + "step": 482 + }, + { + "epoch": 0.540268456375839, + "grad_norm": 0.4036387503147125, + "learning_rate": 4.9349744819285584e-06, + "loss": 0.6405, + "step": 483 + }, + { + "epoch": 0.5413870246085011, + "grad_norm": 0.38611260056495667, + "learning_rate": 4.934635969178584e-06, + "loss": 0.6231, + "step": 484 + }, + { + "epoch": 0.5425055928411633, + "grad_norm": 0.39185649156570435, + "learning_rate": 4.9342965892630805e-06, + "loss": 0.6214, + "step": 485 + }, + { + "epoch": 0.5436241610738255, + "grad_norm": 0.3736090362071991, + "learning_rate": 4.933956342302929e-06, + "loss": 0.6053, + "step": 486 + }, + { + "epoch": 0.5447427293064877, + "grad_norm": 0.39649662375450134, + "learning_rate": 4.93361522841932e-06, + "loss": 0.6408, + "step": 487 + }, + { + "epoch": 0.5458612975391499, + "grad_norm": 0.3990592658519745, + "learning_rate": 4.933273247733746e-06, + "loss": 0.6081, + "step": 488 + }, + { + "epoch": 0.5469798657718121, + "grad_norm": 0.39177680015563965, + "learning_rate": 4.932930400368019e-06, + "loss": 0.6114, + "step": 489 + }, + { + "epoch": 0.5480984340044742, + "grad_norm": 0.3953116536140442, + "learning_rate": 4.9325866864442495e-06, + "loss": 0.6339, + "step": 490 + }, + { + "epoch": 0.5492170022371364, + "grad_norm": 0.38563409447669983, + "learning_rate": 4.932242106084864e-06, + "loss": 0.6331, + "step": 491 + }, + { + "epoch": 0.5503355704697986, + "grad_norm": 0.40618443489074707, + "learning_rate": 4.931896659412593e-06, + "loss": 0.6441, + "step": 492 + }, + { + "epoch": 0.5514541387024608, + "grad_norm": 0.4008066654205322, + "learning_rate": 4.931550346550479e-06, + "loss": 0.6243, + "step": 493 + }, + { + "epoch": 0.5525727069351231, + "grad_norm": 0.39776620268821716, + "learning_rate": 4.931203167621868e-06, + "loss": 0.6152, + "step": 494 + }, + { + "epoch": 0.5536912751677853, + "grad_norm": 0.38687410950660706, + "learning_rate": 4.930855122750421e-06, + "loss": 0.5969, + "step": 495 + }, + { + "epoch": 0.5548098434004475, + "grad_norm": 0.3877246081829071, + "learning_rate": 4.9305062120601035e-06, + "loss": 0.6016, + "step": 496 + }, + { + "epoch": 0.5559284116331096, + "grad_norm": 0.40948086977005005, + "learning_rate": 4.930156435675189e-06, + "loss": 0.6168, + "step": 497 + }, + { + "epoch": 0.5570469798657718, + "grad_norm": 0.404021292924881, + "learning_rate": 4.929805793720262e-06, + "loss": 0.6092, + "step": 498 + }, + { + "epoch": 0.558165548098434, + "grad_norm": 0.39509114623069763, + "learning_rate": 4.929454286320211e-06, + "loss": 0.6346, + "step": 499 + }, + { + "epoch": 0.5592841163310962, + "grad_norm": 0.39687812328338623, + "learning_rate": 4.9291019136002385e-06, + "loss": 0.639, + "step": 500 + }, + { + "epoch": 0.5604026845637584, + "grad_norm": 0.39210405945777893, + "learning_rate": 4.92874867568585e-06, + "loss": 0.6083, + "step": 501 + }, + { + "epoch": 0.5615212527964206, + "grad_norm": 0.4022452235221863, + "learning_rate": 4.928394572702862e-06, + "loss": 0.6252, + "step": 502 + }, + { + "epoch": 0.5626398210290827, + "grad_norm": 0.40317103266716003, + "learning_rate": 4.928039604777399e-06, + "loss": 0.614, + "step": 503 + }, + { + "epoch": 0.5637583892617449, + "grad_norm": 0.4097250998020172, + "learning_rate": 4.9276837720358924e-06, + "loss": 0.6218, + "step": 504 + }, + { + "epoch": 0.5648769574944071, + "grad_norm": 0.3927348554134369, + "learning_rate": 4.927327074605083e-06, + "loss": 0.6079, + "step": 505 + }, + { + "epoch": 0.5659955257270693, + "grad_norm": 0.3961605131626129, + "learning_rate": 4.9269695126120185e-06, + "loss": 0.612, + "step": 506 + }, + { + "epoch": 0.5671140939597316, + "grad_norm": 0.3945924639701843, + "learning_rate": 4.926611086184054e-06, + "loss": 0.6268, + "step": 507 + }, + { + "epoch": 0.5682326621923938, + "grad_norm": 0.39072591066360474, + "learning_rate": 4.926251795448854e-06, + "loss": 0.6176, + "step": 508 + }, + { + "epoch": 0.569351230425056, + "grad_norm": 0.39760643243789673, + "learning_rate": 4.9258916405343904e-06, + "loss": 0.6437, + "step": 509 + }, + { + "epoch": 0.5704697986577181, + "grad_norm": 0.39866000413894653, + "learning_rate": 4.925530621568942e-06, + "loss": 0.6383, + "step": 510 + }, + { + "epoch": 0.5715883668903803, + "grad_norm": 0.3932257294654846, + "learning_rate": 4.925168738681097e-06, + "loss": 0.6156, + "step": 511 + }, + { + "epoch": 0.5727069351230425, + "grad_norm": 0.39929500222206116, + "learning_rate": 4.924805991999751e-06, + "loss": 0.6069, + "step": 512 + }, + { + "epoch": 0.5738255033557047, + "grad_norm": 0.41192054748535156, + "learning_rate": 4.924442381654105e-06, + "loss": 0.6451, + "step": 513 + }, + { + "epoch": 0.5749440715883669, + "grad_norm": 0.41147273778915405, + "learning_rate": 4.92407790777367e-06, + "loss": 0.647, + "step": 514 + }, + { + "epoch": 0.5760626398210291, + "grad_norm": 0.4128178358078003, + "learning_rate": 4.923712570488264e-06, + "loss": 0.5909, + "step": 515 + }, + { + "epoch": 0.5771812080536913, + "grad_norm": 0.4081036150455475, + "learning_rate": 4.923346369928012e-06, + "loss": 0.6248, + "step": 516 + }, + { + "epoch": 0.5782997762863534, + "grad_norm": 0.3965778350830078, + "learning_rate": 4.922979306223347e-06, + "loss": 0.6019, + "step": 517 + }, + { + "epoch": 0.5794183445190156, + "grad_norm": 0.3979526162147522, + "learning_rate": 4.922611379505009e-06, + "loss": 0.6368, + "step": 518 + }, + { + "epoch": 0.5805369127516778, + "grad_norm": 0.38306665420532227, + "learning_rate": 4.922242589904046e-06, + "loss": 0.62, + "step": 519 + }, + { + "epoch": 0.5816554809843401, + "grad_norm": 0.3833399713039398, + "learning_rate": 4.921872937551814e-06, + "loss": 0.6064, + "step": 520 + }, + { + "epoch": 0.5827740492170023, + "grad_norm": 0.39361608028411865, + "learning_rate": 4.921502422579973e-06, + "loss": 0.6236, + "step": 521 + }, + { + "epoch": 0.5838926174496645, + "grad_norm": 0.39250272512435913, + "learning_rate": 4.921131045120494e-06, + "loss": 0.624, + "step": 522 + }, + { + "epoch": 0.5850111856823266, + "grad_norm": 0.40747684240341187, + "learning_rate": 4.920758805305654e-06, + "loss": 0.6096, + "step": 523 + }, + { + "epoch": 0.5861297539149888, + "grad_norm": 0.39987003803253174, + "learning_rate": 4.920385703268037e-06, + "loss": 0.6282, + "step": 524 + }, + { + "epoch": 0.587248322147651, + "grad_norm": 0.39122274518013, + "learning_rate": 4.920011739140532e-06, + "loss": 0.6479, + "step": 525 + }, + { + "epoch": 0.5883668903803132, + "grad_norm": 0.39809542894363403, + "learning_rate": 4.919636913056339e-06, + "loss": 0.6213, + "step": 526 + }, + { + "epoch": 0.5894854586129754, + "grad_norm": 0.39921343326568604, + "learning_rate": 4.919261225148963e-06, + "loss": 0.6118, + "step": 527 + }, + { + "epoch": 0.5906040268456376, + "grad_norm": 0.4086368680000305, + "learning_rate": 4.9188846755522155e-06, + "loss": 0.6214, + "step": 528 + }, + { + "epoch": 0.5917225950782998, + "grad_norm": 0.4066048264503479, + "learning_rate": 4.918507264400216e-06, + "loss": 0.6316, + "step": 529 + }, + { + "epoch": 0.5928411633109619, + "grad_norm": 0.41961807012557983, + "learning_rate": 4.91812899182739e-06, + "loss": 0.5948, + "step": 530 + }, + { + "epoch": 0.5939597315436241, + "grad_norm": 0.39992618560791016, + "learning_rate": 4.917749857968469e-06, + "loss": 0.6113, + "step": 531 + }, + { + "epoch": 0.5950782997762863, + "grad_norm": 0.41020235419273376, + "learning_rate": 4.917369862958494e-06, + "loss": 0.622, + "step": 532 + }, + { + "epoch": 0.5961968680089486, + "grad_norm": 0.40504705905914307, + "learning_rate": 4.916989006932811e-06, + "loss": 0.621, + "step": 533 + }, + { + "epoch": 0.5973154362416108, + "grad_norm": 0.3829837441444397, + "learning_rate": 4.9166072900270725e-06, + "loss": 0.5942, + "step": 534 + }, + { + "epoch": 0.598434004474273, + "grad_norm": 0.4082834720611572, + "learning_rate": 4.9162247123772375e-06, + "loss": 0.5923, + "step": 535 + }, + { + "epoch": 0.5995525727069351, + "grad_norm": 0.40038296580314636, + "learning_rate": 4.915841274119572e-06, + "loss": 0.6057, + "step": 536 + }, + { + "epoch": 0.6006711409395973, + "grad_norm": 0.40687569975852966, + "learning_rate": 4.91545697539065e-06, + "loss": 0.6343, + "step": 537 + }, + { + "epoch": 0.6017897091722595, + "grad_norm": 0.386262983083725, + "learning_rate": 4.9150718163273494e-06, + "loss": 0.6372, + "step": 538 + }, + { + "epoch": 0.6029082774049217, + "grad_norm": 0.39570850133895874, + "learning_rate": 4.914685797066855e-06, + "loss": 0.6157, + "step": 539 + }, + { + "epoch": 0.6040268456375839, + "grad_norm": 0.40055716037750244, + "learning_rate": 4.9142989177466594e-06, + "loss": 0.6141, + "step": 540 + }, + { + "epoch": 0.6051454138702461, + "grad_norm": 0.4038466811180115, + "learning_rate": 4.913911178504562e-06, + "loss": 0.6286, + "step": 541 + }, + { + "epoch": 0.6062639821029083, + "grad_norm": 0.38774847984313965, + "learning_rate": 4.913522579478664e-06, + "loss": 0.6343, + "step": 542 + }, + { + "epoch": 0.6073825503355704, + "grad_norm": 0.39426755905151367, + "learning_rate": 4.913133120807379e-06, + "loss": 0.6121, + "step": 543 + }, + { + "epoch": 0.6085011185682326, + "grad_norm": 0.4076898396015167, + "learning_rate": 4.912742802629423e-06, + "loss": 0.6273, + "step": 544 + }, + { + "epoch": 0.6096196868008948, + "grad_norm": 0.3859540820121765, + "learning_rate": 4.91235162508382e-06, + "loss": 0.6314, + "step": 545 + }, + { + "epoch": 0.610738255033557, + "grad_norm": 0.3914327621459961, + "learning_rate": 4.911959588309897e-06, + "loss": 0.6027, + "step": 546 + }, + { + "epoch": 0.6118568232662193, + "grad_norm": 0.3892766833305359, + "learning_rate": 4.9115666924472906e-06, + "loss": 0.5922, + "step": 547 + }, + { + "epoch": 0.6129753914988815, + "grad_norm": 0.3921322226524353, + "learning_rate": 4.911172937635942e-06, + "loss": 0.6066, + "step": 548 + }, + { + "epoch": 0.6140939597315436, + "grad_norm": 0.3972843885421753, + "learning_rate": 4.910778324016098e-06, + "loss": 0.614, + "step": 549 + }, + { + "epoch": 0.6152125279642058, + "grad_norm": 0.4027954638004303, + "learning_rate": 4.9103828517283105e-06, + "loss": 0.6174, + "step": 550 + }, + { + "epoch": 0.616331096196868, + "grad_norm": 0.40479257702827454, + "learning_rate": 4.909986520913441e-06, + "loss": 0.6114, + "step": 551 + }, + { + "epoch": 0.6174496644295302, + "grad_norm": 0.4246085584163666, + "learning_rate": 4.909589331712651e-06, + "loss": 0.6145, + "step": 552 + }, + { + "epoch": 0.6185682326621924, + "grad_norm": 0.4173775017261505, + "learning_rate": 4.909191284267413e-06, + "loss": 0.6375, + "step": 553 + }, + { + "epoch": 0.6196868008948546, + "grad_norm": 0.4135677218437195, + "learning_rate": 4.908792378719502e-06, + "loss": 0.6444, + "step": 554 + }, + { + "epoch": 0.6208053691275168, + "grad_norm": 0.40163061022758484, + "learning_rate": 4.9083926152110004e-06, + "loss": 0.6128, + "step": 555 + }, + { + "epoch": 0.6219239373601789, + "grad_norm": 0.41246625781059265, + "learning_rate": 4.907991993884295e-06, + "loss": 0.6229, + "step": 556 + }, + { + "epoch": 0.6230425055928411, + "grad_norm": 0.4114304780960083, + "learning_rate": 4.907590514882079e-06, + "loss": 0.6028, + "step": 557 + }, + { + "epoch": 0.6241610738255033, + "grad_norm": 0.40224742889404297, + "learning_rate": 4.90718817834735e-06, + "loss": 0.5896, + "step": 558 + }, + { + "epoch": 0.6252796420581656, + "grad_norm": 0.397650808095932, + "learning_rate": 4.906784984423411e-06, + "loss": 0.6309, + "step": 559 + }, + { + "epoch": 0.6263982102908278, + "grad_norm": 0.4087318480014801, + "learning_rate": 4.906380933253874e-06, + "loss": 0.6002, + "step": 560 + }, + { + "epoch": 0.62751677852349, + "grad_norm": 0.3988543450832367, + "learning_rate": 4.90597602498265e-06, + "loss": 0.6415, + "step": 561 + }, + { + "epoch": 0.6286353467561522, + "grad_norm": 0.38457274436950684, + "learning_rate": 4.905570259753961e-06, + "loss": 0.6105, + "step": 562 + }, + { + "epoch": 0.6297539149888143, + "grad_norm": 0.38756313920021057, + "learning_rate": 4.905163637712331e-06, + "loss": 0.5953, + "step": 563 + }, + { + "epoch": 0.6308724832214765, + "grad_norm": 0.4071662127971649, + "learning_rate": 4.90475615900259e-06, + "loss": 0.635, + "step": 564 + }, + { + "epoch": 0.6319910514541387, + "grad_norm": 0.4213521182537079, + "learning_rate": 4.904347823769875e-06, + "loss": 0.6141, + "step": 565 + }, + { + "epoch": 0.6331096196868009, + "grad_norm": 0.4104982018470764, + "learning_rate": 4.9039386321596235e-06, + "loss": 0.6235, + "step": 566 + }, + { + "epoch": 0.6342281879194631, + "grad_norm": 0.41318175196647644, + "learning_rate": 4.903528584317583e-06, + "loss": 0.6315, + "step": 567 + }, + { + "epoch": 0.6353467561521253, + "grad_norm": 0.39332863688468933, + "learning_rate": 4.903117680389802e-06, + "loss": 0.5807, + "step": 568 + }, + { + "epoch": 0.6364653243847874, + "grad_norm": 0.4188497066497803, + "learning_rate": 4.902705920522638e-06, + "loss": 0.6176, + "step": 569 + }, + { + "epoch": 0.6375838926174496, + "grad_norm": 0.41399574279785156, + "learning_rate": 4.9022933048627496e-06, + "loss": 0.6067, + "step": 570 + }, + { + "epoch": 0.6387024608501118, + "grad_norm": 0.4197136461734772, + "learning_rate": 4.901879833557102e-06, + "loss": 0.6182, + "step": 571 + }, + { + "epoch": 0.639821029082774, + "grad_norm": 0.41715213656425476, + "learning_rate": 4.9014655067529645e-06, + "loss": 0.6088, + "step": 572 + }, + { + "epoch": 0.6409395973154363, + "grad_norm": 0.40957003831863403, + "learning_rate": 4.901050324597912e-06, + "loss": 0.5942, + "step": 573 + }, + { + "epoch": 0.6420581655480985, + "grad_norm": 0.4082324206829071, + "learning_rate": 4.9006342872398235e-06, + "loss": 0.6389, + "step": 574 + }, + { + "epoch": 0.6431767337807607, + "grad_norm": 0.41336655616760254, + "learning_rate": 4.900217394826882e-06, + "loss": 0.6122, + "step": 575 + }, + { + "epoch": 0.6442953020134228, + "grad_norm": 0.40878477692604065, + "learning_rate": 4.899799647507577e-06, + "loss": 0.6372, + "step": 576 + }, + { + "epoch": 0.645413870246085, + "grad_norm": 0.4028140604496002, + "learning_rate": 4.899381045430701e-06, + "loss": 0.5949, + "step": 577 + }, + { + "epoch": 0.6465324384787472, + "grad_norm": 0.42413756251335144, + "learning_rate": 4.89896158874535e-06, + "loss": 0.6217, + "step": 578 + }, + { + "epoch": 0.6476510067114094, + "grad_norm": 0.4108542501926422, + "learning_rate": 4.898541277600927e-06, + "loss": 0.6283, + "step": 579 + }, + { + "epoch": 0.6487695749440716, + "grad_norm": 0.4101778566837311, + "learning_rate": 4.898120112147135e-06, + "loss": 0.6028, + "step": 580 + }, + { + "epoch": 0.6498881431767338, + "grad_norm": 0.4104720652103424, + "learning_rate": 4.897698092533988e-06, + "loss": 0.6481, + "step": 581 + }, + { + "epoch": 0.6510067114093959, + "grad_norm": 0.4004007577896118, + "learning_rate": 4.897275218911799e-06, + "loss": 0.6042, + "step": 582 + }, + { + "epoch": 0.6521252796420581, + "grad_norm": 0.4042847752571106, + "learning_rate": 4.896851491431185e-06, + "loss": 0.6076, + "step": 583 + }, + { + "epoch": 0.6532438478747203, + "grad_norm": 0.40685543417930603, + "learning_rate": 4.89642691024307e-06, + "loss": 0.6066, + "step": 584 + }, + { + "epoch": 0.6543624161073825, + "grad_norm": 0.41699886322021484, + "learning_rate": 4.896001475498682e-06, + "loss": 0.6091, + "step": 585 + }, + { + "epoch": 0.6554809843400448, + "grad_norm": 0.3905144929885864, + "learning_rate": 4.89557518734955e-06, + "loss": 0.6272, + "step": 586 + }, + { + "epoch": 0.656599552572707, + "grad_norm": 0.40033814311027527, + "learning_rate": 4.895148045947509e-06, + "loss": 0.6183, + "step": 587 + }, + { + "epoch": 0.6577181208053692, + "grad_norm": 0.39996397495269775, + "learning_rate": 4.894720051444698e-06, + "loss": 0.5996, + "step": 588 + }, + { + "epoch": 0.6588366890380313, + "grad_norm": 0.42592981457710266, + "learning_rate": 4.894291203993561e-06, + "loss": 0.6506, + "step": 589 + }, + { + "epoch": 0.6599552572706935, + "grad_norm": 0.40710797905921936, + "learning_rate": 4.8938615037468405e-06, + "loss": 0.6044, + "step": 590 + }, + { + "epoch": 0.6610738255033557, + "grad_norm": 0.427405446767807, + "learning_rate": 4.893430950857591e-06, + "loss": 0.6236, + "step": 591 + }, + { + "epoch": 0.6621923937360179, + "grad_norm": 0.40190666913986206, + "learning_rate": 4.892999545479163e-06, + "loss": 0.6031, + "step": 592 + }, + { + "epoch": 0.6633109619686801, + "grad_norm": 0.4019568860530853, + "learning_rate": 4.8925672877652155e-06, + "loss": 0.6232, + "step": 593 + }, + { + "epoch": 0.6644295302013423, + "grad_norm": 0.40001606941223145, + "learning_rate": 4.892134177869709e-06, + "loss": 0.6141, + "step": 594 + }, + { + "epoch": 0.6655480984340044, + "grad_norm": 0.40650853514671326, + "learning_rate": 4.891700215946909e-06, + "loss": 0.6011, + "step": 595 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 0.41019386053085327, + "learning_rate": 4.8912654021513815e-06, + "loss": 0.6262, + "step": 596 + }, + { + "epoch": 0.6677852348993288, + "grad_norm": 0.4064581096172333, + "learning_rate": 4.890829736638e-06, + "loss": 0.6329, + "step": 597 + }, + { + "epoch": 0.668903803131991, + "grad_norm": 0.4091740846633911, + "learning_rate": 4.890393219561938e-06, + "loss": 0.6193, + "step": 598 + }, + { + "epoch": 0.6700223713646533, + "grad_norm": 0.407552570104599, + "learning_rate": 4.889955851078674e-06, + "loss": 0.6535, + "step": 599 + }, + { + "epoch": 0.6711409395973155, + "grad_norm": 0.3950585126876831, + "learning_rate": 4.889517631343988e-06, + "loss": 0.6033, + "step": 600 + }, + { + "epoch": 0.6722595078299777, + "grad_norm": 0.39515334367752075, + "learning_rate": 4.889078560513968e-06, + "loss": 0.6006, + "step": 601 + }, + { + "epoch": 0.6733780760626398, + "grad_norm": 0.4188336431980133, + "learning_rate": 4.888638638744999e-06, + "loss": 0.6333, + "step": 602 + }, + { + "epoch": 0.674496644295302, + "grad_norm": 0.4050522446632385, + "learning_rate": 4.888197866193772e-06, + "loss": 0.6329, + "step": 603 + }, + { + "epoch": 0.6756152125279642, + "grad_norm": 0.39428868889808655, + "learning_rate": 4.887756243017282e-06, + "loss": 0.6007, + "step": 604 + }, + { + "epoch": 0.6767337807606264, + "grad_norm": 0.402410626411438, + "learning_rate": 4.887313769372823e-06, + "loss": 0.5885, + "step": 605 + }, + { + "epoch": 0.6778523489932886, + "grad_norm": 0.4062318205833435, + "learning_rate": 4.886870445417998e-06, + "loss": 0.6312, + "step": 606 + }, + { + "epoch": 0.6789709172259508, + "grad_norm": 0.4123631715774536, + "learning_rate": 4.886426271310708e-06, + "loss": 0.619, + "step": 607 + }, + { + "epoch": 0.680089485458613, + "grad_norm": 0.40353336930274963, + "learning_rate": 4.885981247209159e-06, + "loss": 0.6324, + "step": 608 + }, + { + "epoch": 0.6812080536912751, + "grad_norm": 0.40726402401924133, + "learning_rate": 4.885535373271858e-06, + "loss": 0.5819, + "step": 609 + }, + { + "epoch": 0.6823266219239373, + "grad_norm": 0.4142349660396576, + "learning_rate": 4.885088649657618e-06, + "loss": 0.6175, + "step": 610 + }, + { + "epoch": 0.6834451901565995, + "grad_norm": 0.4165160059928894, + "learning_rate": 4.884641076525549e-06, + "loss": 0.597, + "step": 611 + }, + { + "epoch": 0.6845637583892618, + "grad_norm": 0.4037843942642212, + "learning_rate": 4.884192654035069e-06, + "loss": 0.6183, + "step": 612 + }, + { + "epoch": 0.685682326621924, + "grad_norm": 0.41777777671813965, + "learning_rate": 4.883743382345898e-06, + "loss": 0.6063, + "step": 613 + }, + { + "epoch": 0.6868008948545862, + "grad_norm": 0.41021421551704407, + "learning_rate": 4.883293261618054e-06, + "loss": 0.6134, + "step": 614 + }, + { + "epoch": 0.6879194630872483, + "grad_norm": 0.4215847849845886, + "learning_rate": 4.882842292011863e-06, + "loss": 0.6458, + "step": 615 + }, + { + "epoch": 0.6890380313199105, + "grad_norm": 0.42801633477211, + "learning_rate": 4.882390473687949e-06, + "loss": 0.6259, + "step": 616 + }, + { + "epoch": 0.6901565995525727, + "grad_norm": 0.40879589319229126, + "learning_rate": 4.881937806807241e-06, + "loss": 0.6208, + "step": 617 + }, + { + "epoch": 0.6912751677852349, + "grad_norm": 0.39453622698783875, + "learning_rate": 4.881484291530969e-06, + "loss": 0.5966, + "step": 618 + }, + { + "epoch": 0.6923937360178971, + "grad_norm": 0.3992539048194885, + "learning_rate": 4.881029928020666e-06, + "loss": 0.5976, + "step": 619 + }, + { + "epoch": 0.6935123042505593, + "grad_norm": 0.4175397753715515, + "learning_rate": 4.880574716438166e-06, + "loss": 0.6261, + "step": 620 + }, + { + "epoch": 0.6946308724832215, + "grad_norm": 0.40533408522605896, + "learning_rate": 4.880118656945606e-06, + "loss": 0.5945, + "step": 621 + }, + { + "epoch": 0.6957494407158836, + "grad_norm": 0.4089728593826294, + "learning_rate": 4.879661749705424e-06, + "loss": 0.6226, + "step": 622 + }, + { + "epoch": 0.6968680089485458, + "grad_norm": 0.4341566562652588, + "learning_rate": 4.879203994880362e-06, + "loss": 0.6463, + "step": 623 + }, + { + "epoch": 0.697986577181208, + "grad_norm": 0.44256189465522766, + "learning_rate": 4.878745392633462e-06, + "loss": 0.653, + "step": 624 + }, + { + "epoch": 0.6991051454138703, + "grad_norm": 0.4098159372806549, + "learning_rate": 4.878285943128067e-06, + "loss": 0.5808, + "step": 625 + }, + { + "epoch": 0.7002237136465325, + "grad_norm": 0.43130752444267273, + "learning_rate": 4.8778256465278245e-06, + "loss": 0.6261, + "step": 626 + }, + { + "epoch": 0.7013422818791947, + "grad_norm": 0.4110218286514282, + "learning_rate": 4.877364502996682e-06, + "loss": 0.5954, + "step": 627 + }, + { + "epoch": 0.7024608501118568, + "grad_norm": 0.42106136679649353, + "learning_rate": 4.87690251269889e-06, + "loss": 0.6026, + "step": 628 + }, + { + "epoch": 0.703579418344519, + "grad_norm": 0.4233524203300476, + "learning_rate": 4.876439675798997e-06, + "loss": 0.6432, + "step": 629 + }, + { + "epoch": 0.7046979865771812, + "grad_norm": 0.42107197642326355, + "learning_rate": 4.87597599246186e-06, + "loss": 0.6198, + "step": 630 + }, + { + "epoch": 0.7058165548098434, + "grad_norm": 0.43851831555366516, + "learning_rate": 4.875511462852628e-06, + "loss": 0.6293, + "step": 631 + }, + { + "epoch": 0.7069351230425056, + "grad_norm": 0.41345685720443726, + "learning_rate": 4.87504608713676e-06, + "loss": 0.6178, + "step": 632 + }, + { + "epoch": 0.7080536912751678, + "grad_norm": 0.41011178493499756, + "learning_rate": 4.874579865480013e-06, + "loss": 0.6441, + "step": 633 + }, + { + "epoch": 0.70917225950783, + "grad_norm": 0.41372135281562805, + "learning_rate": 4.874112798048442e-06, + "loss": 0.6142, + "step": 634 + }, + { + "epoch": 0.7102908277404921, + "grad_norm": 0.41231900453567505, + "learning_rate": 4.8736448850084105e-06, + "loss": 0.6277, + "step": 635 + }, + { + "epoch": 0.7114093959731543, + "grad_norm": 0.4147928059101105, + "learning_rate": 4.873176126526578e-06, + "loss": 0.6197, + "step": 636 + }, + { + "epoch": 0.7125279642058165, + "grad_norm": 0.4046717882156372, + "learning_rate": 4.8727065227699035e-06, + "loss": 0.6138, + "step": 637 + }, + { + "epoch": 0.7136465324384788, + "grad_norm": 0.4150887727737427, + "learning_rate": 4.872236073905654e-06, + "loss": 0.616, + "step": 638 + }, + { + "epoch": 0.714765100671141, + "grad_norm": 0.41429632902145386, + "learning_rate": 4.87176478010139e-06, + "loss": 0.6153, + "step": 639 + }, + { + "epoch": 0.7158836689038032, + "grad_norm": 0.41153407096862793, + "learning_rate": 4.8712926415249785e-06, + "loss": 0.6171, + "step": 640 + }, + { + "epoch": 0.7170022371364653, + "grad_norm": 0.4178698658943176, + "learning_rate": 4.870819658344584e-06, + "loss": 0.6417, + "step": 641 + }, + { + "epoch": 0.7181208053691275, + "grad_norm": 0.40587952733039856, + "learning_rate": 4.870345830728675e-06, + "loss": 0.6206, + "step": 642 + }, + { + "epoch": 0.7192393736017897, + "grad_norm": 0.42633864283561707, + "learning_rate": 4.869871158846016e-06, + "loss": 0.6246, + "step": 643 + }, + { + "epoch": 0.7203579418344519, + "grad_norm": 0.41023534536361694, + "learning_rate": 4.8693956428656766e-06, + "loss": 0.601, + "step": 644 + }, + { + "epoch": 0.7214765100671141, + "grad_norm": 0.40645042061805725, + "learning_rate": 4.868919282957024e-06, + "loss": 0.6193, + "step": 645 + }, + { + "epoch": 0.7225950782997763, + "grad_norm": 0.40088531374931335, + "learning_rate": 4.86844207928973e-06, + "loss": 0.5869, + "step": 646 + }, + { + "epoch": 0.7237136465324385, + "grad_norm": 0.4136696755886078, + "learning_rate": 4.8679640320337625e-06, + "loss": 0.6413, + "step": 647 + }, + { + "epoch": 0.7248322147651006, + "grad_norm": 0.40026187896728516, + "learning_rate": 4.867485141359394e-06, + "loss": 0.6075, + "step": 648 + }, + { + "epoch": 0.7259507829977628, + "grad_norm": 0.40911242365837097, + "learning_rate": 4.867005407437192e-06, + "loss": 0.6411, + "step": 649 + }, + { + "epoch": 0.727069351230425, + "grad_norm": 0.42306697368621826, + "learning_rate": 4.866524830438029e-06, + "loss": 0.6376, + "step": 650 + }, + { + "epoch": 0.7281879194630873, + "grad_norm": 0.40857061743736267, + "learning_rate": 4.866043410533077e-06, + "loss": 0.6071, + "step": 651 + }, + { + "epoch": 0.7293064876957495, + "grad_norm": 0.41601142287254333, + "learning_rate": 4.8655611478938055e-06, + "loss": 0.6079, + "step": 652 + }, + { + "epoch": 0.7304250559284117, + "grad_norm": 0.40857282280921936, + "learning_rate": 4.8650780426919895e-06, + "loss": 0.6246, + "step": 653 + }, + { + "epoch": 0.7315436241610739, + "grad_norm": 0.4063502252101898, + "learning_rate": 4.864594095099697e-06, + "loss": 0.6105, + "step": 654 + }, + { + "epoch": 0.732662192393736, + "grad_norm": 0.40278729796409607, + "learning_rate": 4.864109305289303e-06, + "loss": 0.5936, + "step": 655 + }, + { + "epoch": 0.7337807606263982, + "grad_norm": 0.4201098382472992, + "learning_rate": 4.863623673433478e-06, + "loss": 0.6081, + "step": 656 + }, + { + "epoch": 0.7348993288590604, + "grad_norm": 0.40003877878189087, + "learning_rate": 4.863137199705192e-06, + "loss": 0.6085, + "step": 657 + }, + { + "epoch": 0.7360178970917226, + "grad_norm": 0.41234898567199707, + "learning_rate": 4.86264988427772e-06, + "loss": 0.6252, + "step": 658 + }, + { + "epoch": 0.7371364653243848, + "grad_norm": 0.4233507513999939, + "learning_rate": 4.862161727324632e-06, + "loss": 0.5987, + "step": 659 + }, + { + "epoch": 0.738255033557047, + "grad_norm": 0.4099391996860504, + "learning_rate": 4.861672729019798e-06, + "loss": 0.6293, + "step": 660 + }, + { + "epoch": 0.7393736017897091, + "grad_norm": 0.4255772829055786, + "learning_rate": 4.861182889537389e-06, + "loss": 0.6268, + "step": 661 + }, + { + "epoch": 0.7404921700223713, + "grad_norm": 0.4317517578601837, + "learning_rate": 4.860692209051877e-06, + "loss": 0.6444, + "step": 662 + }, + { + "epoch": 0.7416107382550335, + "grad_norm": 0.4352816939353943, + "learning_rate": 4.86020068773803e-06, + "loss": 0.6304, + "step": 663 + }, + { + "epoch": 0.7427293064876958, + "grad_norm": 0.3987254500389099, + "learning_rate": 4.859708325770919e-06, + "loss": 0.611, + "step": 664 + }, + { + "epoch": 0.743847874720358, + "grad_norm": 0.4213384985923767, + "learning_rate": 4.859215123325912e-06, + "loss": 0.6292, + "step": 665 + }, + { + "epoch": 0.7449664429530202, + "grad_norm": 0.4062172472476959, + "learning_rate": 4.8587210805786785e-06, + "loss": 0.6197, + "step": 666 + }, + { + "epoch": 0.7460850111856824, + "grad_norm": 0.41443362832069397, + "learning_rate": 4.858226197705183e-06, + "loss": 0.6414, + "step": 667 + }, + { + "epoch": 0.7472035794183445, + "grad_norm": 0.4183506667613983, + "learning_rate": 4.857730474881696e-06, + "loss": 0.6294, + "step": 668 + }, + { + "epoch": 0.7483221476510067, + "grad_norm": 0.42685073614120483, + "learning_rate": 4.857233912284781e-06, + "loss": 0.6264, + "step": 669 + }, + { + "epoch": 0.7494407158836689, + "grad_norm": 0.4143792390823364, + "learning_rate": 4.856736510091304e-06, + "loss": 0.6575, + "step": 670 + }, + { + "epoch": 0.7505592841163311, + "grad_norm": 0.4124217629432678, + "learning_rate": 4.8562382684784284e-06, + "loss": 0.6295, + "step": 671 + }, + { + "epoch": 0.7516778523489933, + "grad_norm": 0.4060792624950409, + "learning_rate": 4.855739187623619e-06, + "loss": 0.5983, + "step": 672 + }, + { + "epoch": 0.7527964205816555, + "grad_norm": 0.4100533723831177, + "learning_rate": 4.855239267704635e-06, + "loss": 0.6271, + "step": 673 + }, + { + "epoch": 0.7539149888143176, + "grad_norm": 0.4047471582889557, + "learning_rate": 4.854738508899538e-06, + "loss": 0.5843, + "step": 674 + }, + { + "epoch": 0.7550335570469798, + "grad_norm": 0.41550201177597046, + "learning_rate": 4.854236911386689e-06, + "loss": 0.6015, + "step": 675 + }, + { + "epoch": 0.756152125279642, + "grad_norm": 0.4035356044769287, + "learning_rate": 4.853734475344745e-06, + "loss": 0.6085, + "step": 676 + }, + { + "epoch": 0.7572706935123042, + "grad_norm": 0.4054676294326782, + "learning_rate": 4.853231200952665e-06, + "loss": 0.5879, + "step": 677 + }, + { + "epoch": 0.7583892617449665, + "grad_norm": 0.4165349304676056, + "learning_rate": 4.852727088389702e-06, + "loss": 0.6065, + "step": 678 + }, + { + "epoch": 0.7595078299776287, + "grad_norm": 0.41854768991470337, + "learning_rate": 4.8522221378354125e-06, + "loss": 0.6115, + "step": 679 + }, + { + "epoch": 0.7606263982102909, + "grad_norm": 0.4189227223396301, + "learning_rate": 4.851716349469647e-06, + "loss": 0.6174, + "step": 680 + }, + { + "epoch": 0.761744966442953, + "grad_norm": 0.44432833790779114, + "learning_rate": 4.851209723472559e-06, + "loss": 0.6382, + "step": 681 + }, + { + "epoch": 0.7628635346756152, + "grad_norm": 0.4199828803539276, + "learning_rate": 4.8507022600245954e-06, + "loss": 0.6125, + "step": 682 + }, + { + "epoch": 0.7639821029082774, + "grad_norm": 0.44079893827438354, + "learning_rate": 4.850193959306506e-06, + "loss": 0.6263, + "step": 683 + }, + { + "epoch": 0.7651006711409396, + "grad_norm": 0.41406047344207764, + "learning_rate": 4.8496848214993355e-06, + "loss": 0.5979, + "step": 684 + }, + { + "epoch": 0.7662192393736018, + "grad_norm": 0.43209850788116455, + "learning_rate": 4.849174846784428e-06, + "loss": 0.6451, + "step": 685 + }, + { + "epoch": 0.767337807606264, + "grad_norm": 0.4180072844028473, + "learning_rate": 4.848664035343425e-06, + "loss": 0.6009, + "step": 686 + }, + { + "epoch": 0.7684563758389261, + "grad_norm": 0.4092356860637665, + "learning_rate": 4.8481523873582685e-06, + "loss": 0.6431, + "step": 687 + }, + { + "epoch": 0.7695749440715883, + "grad_norm": 0.41440829634666443, + "learning_rate": 4.847639903011196e-06, + "loss": 0.6001, + "step": 688 + }, + { + "epoch": 0.7706935123042505, + "grad_norm": 0.4246008098125458, + "learning_rate": 4.8471265824847415e-06, + "loss": 0.6137, + "step": 689 + }, + { + "epoch": 0.7718120805369127, + "grad_norm": 0.4177666902542114, + "learning_rate": 4.846612425961742e-06, + "loss": 0.6026, + "step": 690 + }, + { + "epoch": 0.772930648769575, + "grad_norm": 0.4130840003490448, + "learning_rate": 4.846097433625327e-06, + "loss": 0.6183, + "step": 691 + }, + { + "epoch": 0.7740492170022372, + "grad_norm": 0.406780868768692, + "learning_rate": 4.845581605658926e-06, + "loss": 0.5992, + "step": 692 + }, + { + "epoch": 0.7751677852348994, + "grad_norm": 0.42086103558540344, + "learning_rate": 4.845064942246267e-06, + "loss": 0.6057, + "step": 693 + }, + { + "epoch": 0.7762863534675615, + "grad_norm": 0.4122505486011505, + "learning_rate": 4.844547443571374e-06, + "loss": 0.6134, + "step": 694 + }, + { + "epoch": 0.7774049217002237, + "grad_norm": 0.43634387850761414, + "learning_rate": 4.8440291098185686e-06, + "loss": 0.6044, + "step": 695 + }, + { + "epoch": 0.7785234899328859, + "grad_norm": 0.4160690903663635, + "learning_rate": 4.843509941172471e-06, + "loss": 0.6046, + "step": 696 + }, + { + "epoch": 0.7796420581655481, + "grad_norm": 0.41897231340408325, + "learning_rate": 4.842989937817997e-06, + "loss": 0.6186, + "step": 697 + }, + { + "epoch": 0.7807606263982103, + "grad_norm": 0.4187341034412384, + "learning_rate": 4.842469099940361e-06, + "loss": 0.6266, + "step": 698 + }, + { + "epoch": 0.7818791946308725, + "grad_norm": 0.4075968563556671, + "learning_rate": 4.841947427725076e-06, + "loss": 0.5772, + "step": 699 + }, + { + "epoch": 0.7829977628635347, + "grad_norm": 0.4157114028930664, + "learning_rate": 4.841424921357948e-06, + "loss": 0.5999, + "step": 700 + }, + { + "epoch": 0.7841163310961968, + "grad_norm": 0.4198933243751526, + "learning_rate": 4.840901581025083e-06, + "loss": 0.6273, + "step": 701 + }, + { + "epoch": 0.785234899328859, + "grad_norm": 0.42646607756614685, + "learning_rate": 4.840377406912887e-06, + "loss": 0.6074, + "step": 702 + }, + { + "epoch": 0.7863534675615212, + "grad_norm": 0.42644554376602173, + "learning_rate": 4.839852399208056e-06, + "loss": 0.5872, + "step": 703 + }, + { + "epoch": 0.7874720357941835, + "grad_norm": 0.43172845244407654, + "learning_rate": 4.839326558097587e-06, + "loss": 0.633, + "step": 704 + }, + { + "epoch": 0.7885906040268457, + "grad_norm": 0.4165332317352295, + "learning_rate": 4.838799883768775e-06, + "loss": 0.6206, + "step": 705 + }, + { + "epoch": 0.7897091722595079, + "grad_norm": 0.4209877550601959, + "learning_rate": 4.83827237640921e-06, + "loss": 0.6015, + "step": 706 + }, + { + "epoch": 0.79082774049217, + "grad_norm": 0.4267021715641022, + "learning_rate": 4.837744036206777e-06, + "loss": 0.5975, + "step": 707 + }, + { + "epoch": 0.7919463087248322, + "grad_norm": 0.4415457546710968, + "learning_rate": 4.837214863349662e-06, + "loss": 0.6251, + "step": 708 + }, + { + "epoch": 0.7930648769574944, + "grad_norm": 0.43104031682014465, + "learning_rate": 4.836684858026343e-06, + "loss": 0.6048, + "step": 709 + }, + { + "epoch": 0.7941834451901566, + "grad_norm": 0.41736820340156555, + "learning_rate": 4.8361540204255985e-06, + "loss": 0.5948, + "step": 710 + }, + { + "epoch": 0.7953020134228188, + "grad_norm": 0.4202009439468384, + "learning_rate": 4.835622350736499e-06, + "loss": 0.6099, + "step": 711 + }, + { + "epoch": 0.796420581655481, + "grad_norm": 0.42279568314552307, + "learning_rate": 4.8350898491484175e-06, + "loss": 0.6247, + "step": 712 + }, + { + "epoch": 0.7975391498881432, + "grad_norm": 0.4266239404678345, + "learning_rate": 4.8345565158510176e-06, + "loss": 0.6136, + "step": 713 + }, + { + "epoch": 0.7986577181208053, + "grad_norm": 0.42605841159820557, + "learning_rate": 4.83402235103426e-06, + "loss": 0.6001, + "step": 714 + }, + { + "epoch": 0.7997762863534675, + "grad_norm": 0.42846307158470154, + "learning_rate": 4.8334873548884055e-06, + "loss": 0.5941, + "step": 715 + }, + { + "epoch": 0.8008948545861297, + "grad_norm": 0.44009047746658325, + "learning_rate": 4.832951527604007e-06, + "loss": 0.622, + "step": 716 + }, + { + "epoch": 0.802013422818792, + "grad_norm": 0.44512951374053955, + "learning_rate": 4.8324148693719145e-06, + "loss": 0.6507, + "step": 717 + }, + { + "epoch": 0.8031319910514542, + "grad_norm": 0.455010324716568, + "learning_rate": 4.831877380383276e-06, + "loss": 0.6201, + "step": 718 + }, + { + "epoch": 0.8042505592841164, + "grad_norm": 0.43456459045410156, + "learning_rate": 4.83133906082953e-06, + "loss": 0.623, + "step": 719 + }, + { + "epoch": 0.8053691275167785, + "grad_norm": 0.42063653469085693, + "learning_rate": 4.830799910902418e-06, + "loss": 0.5841, + "step": 720 + }, + { + "epoch": 0.8064876957494407, + "grad_norm": 0.41323843598365784, + "learning_rate": 4.8302599307939725e-06, + "loss": 0.6127, + "step": 721 + }, + { + "epoch": 0.8076062639821029, + "grad_norm": 0.41982001066207886, + "learning_rate": 4.829719120696523e-06, + "loss": 0.6274, + "step": 722 + }, + { + "epoch": 0.8087248322147651, + "grad_norm": 0.43330860137939453, + "learning_rate": 4.829177480802694e-06, + "loss": 0.6416, + "step": 723 + }, + { + "epoch": 0.8098434004474273, + "grad_norm": 0.4351330101490021, + "learning_rate": 4.828635011305407e-06, + "loss": 0.6399, + "step": 724 + }, + { + "epoch": 0.8109619686800895, + "grad_norm": 0.4017598032951355, + "learning_rate": 4.828091712397878e-06, + "loss": 0.5817, + "step": 725 + }, + { + "epoch": 0.8120805369127517, + "grad_norm": 0.42594751715660095, + "learning_rate": 4.827547584273618e-06, + "loss": 0.6438, + "step": 726 + }, + { + "epoch": 0.8131991051454138, + "grad_norm": 0.409135639667511, + "learning_rate": 4.827002627126433e-06, + "loss": 0.5797, + "step": 727 + }, + { + "epoch": 0.814317673378076, + "grad_norm": 0.4304857850074768, + "learning_rate": 4.826456841150428e-06, + "loss": 0.6173, + "step": 728 + }, + { + "epoch": 0.8154362416107382, + "grad_norm": 0.446872740983963, + "learning_rate": 4.825910226539997e-06, + "loss": 0.6059, + "step": 729 + }, + { + "epoch": 0.8165548098434005, + "grad_norm": 0.42625290155410767, + "learning_rate": 4.8253627834898355e-06, + "loss": 0.5994, + "step": 730 + }, + { + "epoch": 0.8176733780760627, + "grad_norm": 0.42183029651641846, + "learning_rate": 4.824814512194929e-06, + "loss": 0.6202, + "step": 731 + }, + { + "epoch": 0.8187919463087249, + "grad_norm": 0.4235664904117584, + "learning_rate": 4.824265412850559e-06, + "loss": 0.6263, + "step": 732 + }, + { + "epoch": 0.819910514541387, + "grad_norm": 0.4118615686893463, + "learning_rate": 4.823715485652307e-06, + "loss": 0.6058, + "step": 733 + }, + { + "epoch": 0.8210290827740492, + "grad_norm": 0.43514224886894226, + "learning_rate": 4.823164730796042e-06, + "loss": 0.6092, + "step": 734 + }, + { + "epoch": 0.8221476510067114, + "grad_norm": 0.41756734251976013, + "learning_rate": 4.8226131484779325e-06, + "loss": 0.6281, + "step": 735 + }, + { + "epoch": 0.8232662192393736, + "grad_norm": 0.438475638628006, + "learning_rate": 4.822060738894439e-06, + "loss": 0.6122, + "step": 736 + }, + { + "epoch": 0.8243847874720358, + "grad_norm": 0.426792174577713, + "learning_rate": 4.821507502242321e-06, + "loss": 0.6407, + "step": 737 + }, + { + "epoch": 0.825503355704698, + "grad_norm": 0.42697012424468994, + "learning_rate": 4.820953438718626e-06, + "loss": 0.5996, + "step": 738 + }, + { + "epoch": 0.8266219239373602, + "grad_norm": 0.42373016476631165, + "learning_rate": 4.820398548520702e-06, + "loss": 0.6075, + "step": 739 + }, + { + "epoch": 0.8277404921700223, + "grad_norm": 0.42235615849494934, + "learning_rate": 4.81984283184619e-06, + "loss": 0.608, + "step": 740 + }, + { + "epoch": 0.8288590604026845, + "grad_norm": 0.41180866956710815, + "learning_rate": 4.819286288893022e-06, + "loss": 0.6127, + "step": 741 + }, + { + "epoch": 0.8299776286353467, + "grad_norm": 0.4207548499107361, + "learning_rate": 4.818728919859426e-06, + "loss": 0.6131, + "step": 742 + }, + { + "epoch": 0.831096196868009, + "grad_norm": 0.4295390546321869, + "learning_rate": 4.818170724943928e-06, + "loss": 0.629, + "step": 743 + }, + { + "epoch": 0.8322147651006712, + "grad_norm": 0.4099291265010834, + "learning_rate": 4.817611704345344e-06, + "loss": 0.6055, + "step": 744 + }, + { + "epoch": 0.8333333333333334, + "grad_norm": 0.4142029583454132, + "learning_rate": 4.817051858262785e-06, + "loss": 0.6127, + "step": 745 + }, + { + "epoch": 0.8344519015659956, + "grad_norm": 0.41662877798080444, + "learning_rate": 4.816491186895656e-06, + "loss": 0.6171, + "step": 746 + }, + { + "epoch": 0.8355704697986577, + "grad_norm": 0.4345078766345978, + "learning_rate": 4.815929690443657e-06, + "loss": 0.6091, + "step": 747 + }, + { + "epoch": 0.8366890380313199, + "grad_norm": 0.43010810017585754, + "learning_rate": 4.8153673691067806e-06, + "loss": 0.626, + "step": 748 + }, + { + "epoch": 0.8378076062639821, + "grad_norm": 0.4256346821784973, + "learning_rate": 4.814804223085313e-06, + "loss": 0.6216, + "step": 749 + }, + { + "epoch": 0.8389261744966443, + "grad_norm": 0.42812031507492065, + "learning_rate": 4.814240252579836e-06, + "loss": 0.6138, + "step": 750 + }, + { + "epoch": 0.8400447427293065, + "grad_norm": 0.41682377457618713, + "learning_rate": 4.813675457791224e-06, + "loss": 0.5783, + "step": 751 + }, + { + "epoch": 0.8411633109619687, + "grad_norm": 0.4257197380065918, + "learning_rate": 4.8131098389206435e-06, + "loss": 0.6006, + "step": 752 + }, + { + "epoch": 0.8422818791946308, + "grad_norm": 0.42872053384780884, + "learning_rate": 4.812543396169557e-06, + "loss": 0.6272, + "step": 753 + }, + { + "epoch": 0.843400447427293, + "grad_norm": 0.4263060986995697, + "learning_rate": 4.81197612973972e-06, + "loss": 0.6093, + "step": 754 + }, + { + "epoch": 0.8445190156599552, + "grad_norm": 0.4151028096675873, + "learning_rate": 4.811408039833178e-06, + "loss": 0.5773, + "step": 755 + }, + { + "epoch": 0.8456375838926175, + "grad_norm": 0.4382091164588928, + "learning_rate": 4.810839126652275e-06, + "loss": 0.596, + "step": 756 + }, + { + "epoch": 0.8467561521252797, + "grad_norm": 0.4290934205055237, + "learning_rate": 4.810269390399646e-06, + "loss": 0.5904, + "step": 757 + }, + { + "epoch": 0.8478747203579419, + "grad_norm": 0.4299798011779785, + "learning_rate": 4.809698831278217e-06, + "loss": 0.6449, + "step": 758 + }, + { + "epoch": 0.8489932885906041, + "grad_norm": 0.4188365042209625, + "learning_rate": 4.809127449491211e-06, + "loss": 0.6007, + "step": 759 + }, + { + "epoch": 0.8501118568232662, + "grad_norm": 0.41594186425209045, + "learning_rate": 4.808555245242141e-06, + "loss": 0.5888, + "step": 760 + }, + { + "epoch": 0.8512304250559284, + "grad_norm": 0.4184630513191223, + "learning_rate": 4.807982218734814e-06, + "loss": 0.6495, + "step": 761 + }, + { + "epoch": 0.8523489932885906, + "grad_norm": 0.418849378824234, + "learning_rate": 4.80740837017333e-06, + "loss": 0.6128, + "step": 762 + }, + { + "epoch": 0.8534675615212528, + "grad_norm": 0.42030590772628784, + "learning_rate": 4.8068336997620804e-06, + "loss": 0.6294, + "step": 763 + }, + { + "epoch": 0.854586129753915, + "grad_norm": 0.4204208552837372, + "learning_rate": 4.806258207705753e-06, + "loss": 0.6279, + "step": 764 + }, + { + "epoch": 0.8557046979865772, + "grad_norm": 0.41544729471206665, + "learning_rate": 4.805681894209324e-06, + "loss": 0.6235, + "step": 765 + }, + { + "epoch": 0.8568232662192393, + "grad_norm": 0.430301308631897, + "learning_rate": 4.805104759478065e-06, + "loss": 0.5876, + "step": 766 + }, + { + "epoch": 0.8579418344519015, + "grad_norm": 0.4640876054763794, + "learning_rate": 4.804526803717539e-06, + "loss": 0.6264, + "step": 767 + }, + { + "epoch": 0.8590604026845637, + "grad_norm": 0.43245360255241394, + "learning_rate": 4.8039480271336005e-06, + "loss": 0.5871, + "step": 768 + }, + { + "epoch": 0.860178970917226, + "grad_norm": 0.41617804765701294, + "learning_rate": 4.803368429932399e-06, + "loss": 0.6218, + "step": 769 + }, + { + "epoch": 0.8612975391498882, + "grad_norm": 0.4325237274169922, + "learning_rate": 4.8027880123203726e-06, + "loss": 0.5874, + "step": 770 + }, + { + "epoch": 0.8624161073825504, + "grad_norm": 0.4480580687522888, + "learning_rate": 4.802206774504255e-06, + "loss": 0.6093, + "step": 771 + }, + { + "epoch": 0.8635346756152126, + "grad_norm": 0.44137299060821533, + "learning_rate": 4.801624716691072e-06, + "loss": 0.6031, + "step": 772 + }, + { + "epoch": 0.8646532438478747, + "grad_norm": 0.4269614815711975, + "learning_rate": 4.801041839088139e-06, + "loss": 0.5963, + "step": 773 + }, + { + "epoch": 0.8657718120805369, + "grad_norm": 0.4120911657810211, + "learning_rate": 4.800458141903064e-06, + "loss": 0.5959, + "step": 774 + }, + { + "epoch": 0.8668903803131991, + "grad_norm": 0.4332381784915924, + "learning_rate": 4.799873625343747e-06, + "loss": 0.6007, + "step": 775 + }, + { + "epoch": 0.8680089485458613, + "grad_norm": 0.4361085295677185, + "learning_rate": 4.7992882896183825e-06, + "loss": 0.6012, + "step": 776 + }, + { + "epoch": 0.8691275167785235, + "grad_norm": 0.43251463770866394, + "learning_rate": 4.798702134935454e-06, + "loss": 0.5799, + "step": 777 + }, + { + "epoch": 0.8702460850111857, + "grad_norm": 0.4305305778980255, + "learning_rate": 4.798115161503735e-06, + "loss": 0.6068, + "step": 778 + }, + { + "epoch": 0.8713646532438478, + "grad_norm": 0.4410499930381775, + "learning_rate": 4.797527369532296e-06, + "loss": 0.6486, + "step": 779 + }, + { + "epoch": 0.87248322147651, + "grad_norm": 0.43271416425704956, + "learning_rate": 4.796938759230494e-06, + "loss": 0.6367, + "step": 780 + }, + { + "epoch": 0.8736017897091722, + "grad_norm": 0.430905282497406, + "learning_rate": 4.7963493308079815e-06, + "loss": 0.5753, + "step": 781 + }, + { + "epoch": 0.8747203579418344, + "grad_norm": 0.4054125249385834, + "learning_rate": 4.7957590844746986e-06, + "loss": 0.5806, + "step": 782 + }, + { + "epoch": 0.8758389261744967, + "grad_norm": 0.4322145879268646, + "learning_rate": 4.795168020440878e-06, + "loss": 0.5989, + "step": 783 + }, + { + "epoch": 0.8769574944071589, + "grad_norm": 0.433716744184494, + "learning_rate": 4.7945761389170464e-06, + "loss": 0.6284, + "step": 784 + }, + { + "epoch": 0.8780760626398211, + "grad_norm": 0.4301673471927643, + "learning_rate": 4.793983440114018e-06, + "loss": 0.6469, + "step": 785 + }, + { + "epoch": 0.8791946308724832, + "grad_norm": 0.4279182553291321, + "learning_rate": 4.7933899242428986e-06, + "loss": 0.6032, + "step": 786 + }, + { + "epoch": 0.8803131991051454, + "grad_norm": 0.4133903682231903, + "learning_rate": 4.792795591515087e-06, + "loss": 0.5745, + "step": 787 + }, + { + "epoch": 0.8814317673378076, + "grad_norm": 0.45094674825668335, + "learning_rate": 4.792200442142273e-06, + "loss": 0.6212, + "step": 788 + }, + { + "epoch": 0.8825503355704698, + "grad_norm": 0.4304371774196625, + "learning_rate": 4.7916044763364344e-06, + "loss": 0.61, + "step": 789 + }, + { + "epoch": 0.883668903803132, + "grad_norm": 0.4232103228569031, + "learning_rate": 4.791007694309842e-06, + "loss": 0.5942, + "step": 790 + }, + { + "epoch": 0.8847874720357942, + "grad_norm": 0.41822549700737, + "learning_rate": 4.790410096275057e-06, + "loss": 0.5829, + "step": 791 + }, + { + "epoch": 0.8859060402684564, + "grad_norm": 0.44473201036453247, + "learning_rate": 4.789811682444931e-06, + "loss": 0.6359, + "step": 792 + }, + { + "epoch": 0.8870246085011185, + "grad_norm": 0.42226001620292664, + "learning_rate": 4.7892124530326065e-06, + "loss": 0.5966, + "step": 793 + }, + { + "epoch": 0.8881431767337807, + "grad_norm": 0.4254050552845001, + "learning_rate": 4.788612408251517e-06, + "loss": 0.6211, + "step": 794 + }, + { + "epoch": 0.889261744966443, + "grad_norm": 0.42909836769104004, + "learning_rate": 4.788011548315383e-06, + "loss": 0.6039, + "step": 795 + }, + { + "epoch": 0.8903803131991052, + "grad_norm": 0.4296148419380188, + "learning_rate": 4.78740987343822e-06, + "loss": 0.6055, + "step": 796 + }, + { + "epoch": 0.8914988814317674, + "grad_norm": 0.4225505292415619, + "learning_rate": 4.786807383834332e-06, + "loss": 0.5947, + "step": 797 + }, + { + "epoch": 0.8926174496644296, + "grad_norm": 0.4271566867828369, + "learning_rate": 4.786204079718314e-06, + "loss": 0.6002, + "step": 798 + }, + { + "epoch": 0.8937360178970917, + "grad_norm": 0.42522063851356506, + "learning_rate": 4.785599961305048e-06, + "loss": 0.6231, + "step": 799 + }, + { + "epoch": 0.8948545861297539, + "grad_norm": 0.4384607970714569, + "learning_rate": 4.784995028809707e-06, + "loss": 0.6072, + "step": 800 + }, + { + "epoch": 0.8959731543624161, + "grad_norm": 0.4194418787956238, + "learning_rate": 4.784389282447759e-06, + "loss": 0.5979, + "step": 801 + }, + { + "epoch": 0.8970917225950783, + "grad_norm": 0.43825826048851013, + "learning_rate": 4.7837827224349544e-06, + "loss": 0.6256, + "step": 802 + }, + { + "epoch": 0.8982102908277405, + "grad_norm": 0.43381622433662415, + "learning_rate": 4.783175348987339e-06, + "loss": 0.5932, + "step": 803 + }, + { + "epoch": 0.8993288590604027, + "grad_norm": 0.4468502700328827, + "learning_rate": 4.7825671623212456e-06, + "loss": 0.618, + "step": 804 + }, + { + "epoch": 0.9004474272930649, + "grad_norm": 0.4352877140045166, + "learning_rate": 4.781958162653298e-06, + "loss": 0.5898, + "step": 805 + }, + { + "epoch": 0.901565995525727, + "grad_norm": 0.4242689907550812, + "learning_rate": 4.781348350200408e-06, + "loss": 0.5856, + "step": 806 + }, + { + "epoch": 0.9026845637583892, + "grad_norm": 0.4262087941169739, + "learning_rate": 4.780737725179778e-06, + "loss": 0.5994, + "step": 807 + }, + { + "epoch": 0.9038031319910514, + "grad_norm": 0.42303264141082764, + "learning_rate": 4.780126287808899e-06, + "loss": 0.6106, + "step": 808 + }, + { + "epoch": 0.9049217002237137, + "grad_norm": 0.43589121103286743, + "learning_rate": 4.779514038305555e-06, + "loss": 0.6251, + "step": 809 + }, + { + "epoch": 0.9060402684563759, + "grad_norm": 0.43768516182899475, + "learning_rate": 4.778900976887813e-06, + "loss": 0.6124, + "step": 810 + }, + { + "epoch": 0.9071588366890381, + "grad_norm": 0.4439849257469177, + "learning_rate": 4.778287103774033e-06, + "loss": 0.6397, + "step": 811 + }, + { + "epoch": 0.9082774049217002, + "grad_norm": 0.44813254475593567, + "learning_rate": 4.777672419182863e-06, + "loss": 0.6213, + "step": 812 + }, + { + "epoch": 0.9093959731543624, + "grad_norm": 0.4133831858634949, + "learning_rate": 4.777056923333244e-06, + "loss": 0.6138, + "step": 813 + }, + { + "epoch": 0.9105145413870246, + "grad_norm": 0.4255264699459076, + "learning_rate": 4.7764406164444e-06, + "loss": 0.6143, + "step": 814 + }, + { + "epoch": 0.9116331096196868, + "grad_norm": 0.42810630798339844, + "learning_rate": 4.775823498735845e-06, + "loss": 0.6253, + "step": 815 + }, + { + "epoch": 0.912751677852349, + "grad_norm": 0.42162856459617615, + "learning_rate": 4.775205570427386e-06, + "loss": 0.602, + "step": 816 + }, + { + "epoch": 0.9138702460850112, + "grad_norm": 0.4342280328273773, + "learning_rate": 4.7745868317391135e-06, + "loss": 0.6088, + "step": 817 + }, + { + "epoch": 0.9149888143176734, + "grad_norm": 0.42438629269599915, + "learning_rate": 4.773967282891411e-06, + "loss": 0.5788, + "step": 818 + }, + { + "epoch": 0.9161073825503355, + "grad_norm": 0.437950074672699, + "learning_rate": 4.7733469241049475e-06, + "loss": 0.6277, + "step": 819 + }, + { + "epoch": 0.9172259507829977, + "grad_norm": 0.4286377429962158, + "learning_rate": 4.772725755600682e-06, + "loss": 0.6024, + "step": 820 + }, + { + "epoch": 0.9183445190156599, + "grad_norm": 0.4317566156387329, + "learning_rate": 4.772103777599861e-06, + "loss": 0.6048, + "step": 821 + }, + { + "epoch": 0.9194630872483222, + "grad_norm": 0.4509202837944031, + "learning_rate": 4.771480990324021e-06, + "loss": 0.6219, + "step": 822 + }, + { + "epoch": 0.9205816554809844, + "grad_norm": 0.4387308955192566, + "learning_rate": 4.7708573939949845e-06, + "loss": 0.6082, + "step": 823 + }, + { + "epoch": 0.9217002237136466, + "grad_norm": 0.457883358001709, + "learning_rate": 4.770232988834864e-06, + "loss": 0.6112, + "step": 824 + }, + { + "epoch": 0.9228187919463087, + "grad_norm": 0.44200408458709717, + "learning_rate": 4.769607775066058e-06, + "loss": 0.6146, + "step": 825 + }, + { + "epoch": 0.9239373601789709, + "grad_norm": 0.44704675674438477, + "learning_rate": 4.768981752911256e-06, + "loss": 0.5921, + "step": 826 + }, + { + "epoch": 0.9250559284116331, + "grad_norm": 0.4367467164993286, + "learning_rate": 4.768354922593433e-06, + "loss": 0.6075, + "step": 827 + }, + { + "epoch": 0.9261744966442953, + "grad_norm": 0.4321734309196472, + "learning_rate": 4.767727284335852e-06, + "loss": 0.6041, + "step": 828 + }, + { + "epoch": 0.9272930648769575, + "grad_norm": 0.42991605401039124, + "learning_rate": 4.767098838362065e-06, + "loss": 0.5804, + "step": 829 + }, + { + "epoch": 0.9284116331096197, + "grad_norm": 0.43791651725769043, + "learning_rate": 4.766469584895912e-06, + "loss": 0.6005, + "step": 830 + }, + { + "epoch": 0.9295302013422819, + "grad_norm": 0.41972237825393677, + "learning_rate": 4.765839524161518e-06, + "loss": 0.582, + "step": 831 + }, + { + "epoch": 0.930648769574944, + "grad_norm": 0.4424271881580353, + "learning_rate": 4.765208656383299e-06, + "loss": 0.5978, + "step": 832 + }, + { + "epoch": 0.9317673378076062, + "grad_norm": 0.45667219161987305, + "learning_rate": 4.7645769817859554e-06, + "loss": 0.6208, + "step": 833 + }, + { + "epoch": 0.9328859060402684, + "grad_norm": 0.4423377811908722, + "learning_rate": 4.763944500594476e-06, + "loss": 0.6061, + "step": 834 + }, + { + "epoch": 0.9340044742729307, + "grad_norm": 0.4316536784172058, + "learning_rate": 4.7633112130341385e-06, + "loss": 0.6116, + "step": 835 + }, + { + "epoch": 0.9351230425055929, + "grad_norm": 0.4591672718524933, + "learning_rate": 4.762677119330505e-06, + "loss": 0.5729, + "step": 836 + }, + { + "epoch": 0.9362416107382551, + "grad_norm": 0.4469880759716034, + "learning_rate": 4.762042219709427e-06, + "loss": 0.6025, + "step": 837 + }, + { + "epoch": 0.9373601789709173, + "grad_norm": 0.4560692012310028, + "learning_rate": 4.761406514397042e-06, + "loss": 0.6103, + "step": 838 + }, + { + "epoch": 0.9384787472035794, + "grad_norm": 0.4428820013999939, + "learning_rate": 4.760770003619775e-06, + "loss": 0.6258, + "step": 839 + }, + { + "epoch": 0.9395973154362416, + "grad_norm": 0.44238874316215515, + "learning_rate": 4.760132687604338e-06, + "loss": 0.6032, + "step": 840 + }, + { + "epoch": 0.9407158836689038, + "grad_norm": 0.46432724595069885, + "learning_rate": 4.759494566577727e-06, + "loss": 0.6266, + "step": 841 + }, + { + "epoch": 0.941834451901566, + "grad_norm": 0.42941901087760925, + "learning_rate": 4.75885564076723e-06, + "loss": 0.5927, + "step": 842 + }, + { + "epoch": 0.9429530201342282, + "grad_norm": 0.43781232833862305, + "learning_rate": 4.758215910400418e-06, + "loss": 0.5967, + "step": 843 + }, + { + "epoch": 0.9440715883668904, + "grad_norm": 0.45641666650772095, + "learning_rate": 4.757575375705149e-06, + "loss": 0.6423, + "step": 844 + }, + { + "epoch": 0.9451901565995525, + "grad_norm": 0.43784114718437195, + "learning_rate": 4.756934036909567e-06, + "loss": 0.606, + "step": 845 + }, + { + "epoch": 0.9463087248322147, + "grad_norm": 0.4379528760910034, + "learning_rate": 4.756291894242106e-06, + "loss": 0.6201, + "step": 846 + }, + { + "epoch": 0.9474272930648769, + "grad_norm": 0.42831459641456604, + "learning_rate": 4.755648947931479e-06, + "loss": 0.6121, + "step": 847 + }, + { + "epoch": 0.9485458612975392, + "grad_norm": 0.43790462613105774, + "learning_rate": 4.7550051982066945e-06, + "loss": 0.5785, + "step": 848 + }, + { + "epoch": 0.9496644295302014, + "grad_norm": 0.4407269358634949, + "learning_rate": 4.75436064529704e-06, + "loss": 0.6127, + "step": 849 + }, + { + "epoch": 0.9507829977628636, + "grad_norm": 0.4252265393733978, + "learning_rate": 4.753715289432092e-06, + "loss": 0.6129, + "step": 850 + }, + { + "epoch": 0.9519015659955258, + "grad_norm": 0.4376990795135498, + "learning_rate": 4.753069130841712e-06, + "loss": 0.614, + "step": 851 + }, + { + "epoch": 0.9530201342281879, + "grad_norm": 0.43123552203178406, + "learning_rate": 4.752422169756048e-06, + "loss": 0.6169, + "step": 852 + }, + { + "epoch": 0.9541387024608501, + "grad_norm": 0.4513196349143982, + "learning_rate": 4.7517744064055345e-06, + "loss": 0.6381, + "step": 853 + }, + { + "epoch": 0.9552572706935123, + "grad_norm": 0.44663751125335693, + "learning_rate": 4.751125841020891e-06, + "loss": 0.605, + "step": 854 + }, + { + "epoch": 0.9563758389261745, + "grad_norm": 0.44196903705596924, + "learning_rate": 4.750476473833123e-06, + "loss": 0.6163, + "step": 855 + }, + { + "epoch": 0.9574944071588367, + "grad_norm": 0.40786847472190857, + "learning_rate": 4.74982630507352e-06, + "loss": 0.5624, + "step": 856 + }, + { + "epoch": 0.9586129753914989, + "grad_norm": 0.43438002467155457, + "learning_rate": 4.749175334973659e-06, + "loss": 0.6183, + "step": 857 + }, + { + "epoch": 0.959731543624161, + "grad_norm": 0.43120619654655457, + "learning_rate": 4.748523563765401e-06, + "loss": 0.6097, + "step": 858 + }, + { + "epoch": 0.9608501118568232, + "grad_norm": 0.4761989414691925, + "learning_rate": 4.747870991680895e-06, + "loss": 0.6029, + "step": 859 + }, + { + "epoch": 0.9619686800894854, + "grad_norm": 0.44484785199165344, + "learning_rate": 4.747217618952571e-06, + "loss": 0.5955, + "step": 860 + }, + { + "epoch": 0.9630872483221476, + "grad_norm": 0.4473284184932709, + "learning_rate": 4.746563445813148e-06, + "loss": 0.6367, + "step": 861 + }, + { + "epoch": 0.9642058165548099, + "grad_norm": 0.4486042857170105, + "learning_rate": 4.745908472495628e-06, + "loss": 0.5917, + "step": 862 + }, + { + "epoch": 0.9653243847874721, + "grad_norm": 0.45661088824272156, + "learning_rate": 4.745252699233298e-06, + "loss": 0.61, + "step": 863 + }, + { + "epoch": 0.9664429530201343, + "grad_norm": 0.4235589802265167, + "learning_rate": 4.744596126259731e-06, + "loss": 0.5887, + "step": 864 + }, + { + "epoch": 0.9675615212527964, + "grad_norm": 0.45463138818740845, + "learning_rate": 4.743938753808785e-06, + "loss": 0.6295, + "step": 865 + }, + { + "epoch": 0.9686800894854586, + "grad_norm": 0.4576405882835388, + "learning_rate": 4.743280582114601e-06, + "loss": 0.6301, + "step": 866 + }, + { + "epoch": 0.9697986577181208, + "grad_norm": 0.4380008280277252, + "learning_rate": 4.742621611411606e-06, + "loss": 0.619, + "step": 867 + }, + { + "epoch": 0.970917225950783, + "grad_norm": 0.4485333263874054, + "learning_rate": 4.7419618419345124e-06, + "loss": 0.6311, + "step": 868 + }, + { + "epoch": 0.9720357941834452, + "grad_norm": 0.44375064969062805, + "learning_rate": 4.741301273918314e-06, + "loss": 0.6095, + "step": 869 + }, + { + "epoch": 0.9731543624161074, + "grad_norm": 0.4440104365348816, + "learning_rate": 4.740639907598293e-06, + "loss": 0.6173, + "step": 870 + }, + { + "epoch": 0.9742729306487695, + "grad_norm": 0.4545641243457794, + "learning_rate": 4.739977743210014e-06, + "loss": 0.6046, + "step": 871 + }, + { + "epoch": 0.9753914988814317, + "grad_norm": 0.41214534640312195, + "learning_rate": 4.739314780989324e-06, + "loss": 0.6072, + "step": 872 + }, + { + "epoch": 0.9765100671140939, + "grad_norm": 0.4223095178604126, + "learning_rate": 4.738651021172357e-06, + "loss": 0.5878, + "step": 873 + }, + { + "epoch": 0.9776286353467561, + "grad_norm": 0.42885127663612366, + "learning_rate": 4.7379864639955304e-06, + "loss": 0.577, + "step": 874 + }, + { + "epoch": 0.9787472035794184, + "grad_norm": 0.42921069264411926, + "learning_rate": 4.737321109695546e-06, + "loss": 0.5844, + "step": 875 + }, + { + "epoch": 0.9798657718120806, + "grad_norm": 0.43462637066841125, + "learning_rate": 4.736654958509387e-06, + "loss": 0.6135, + "step": 876 + }, + { + "epoch": 0.9809843400447428, + "grad_norm": 0.43558555841445923, + "learning_rate": 4.735988010674324e-06, + "loss": 0.6255, + "step": 877 + }, + { + "epoch": 0.9821029082774049, + "grad_norm": 0.44332823157310486, + "learning_rate": 4.735320266427909e-06, + "loss": 0.6266, + "step": 878 + }, + { + "epoch": 0.9832214765100671, + "grad_norm": 0.4158824682235718, + "learning_rate": 4.734651726007978e-06, + "loss": 0.585, + "step": 879 + }, + { + "epoch": 0.9843400447427293, + "grad_norm": 0.4264974296092987, + "learning_rate": 4.733982389652652e-06, + "loss": 0.5871, + "step": 880 + }, + { + "epoch": 0.9854586129753915, + "grad_norm": 0.44846397638320923, + "learning_rate": 4.733312257600332e-06, + "loss": 0.6441, + "step": 881 + }, + { + "epoch": 0.9865771812080537, + "grad_norm": 0.46592265367507935, + "learning_rate": 4.732641330089707e-06, + "loss": 0.6326, + "step": 882 + }, + { + "epoch": 0.9876957494407159, + "grad_norm": 0.423447847366333, + "learning_rate": 4.731969607359747e-06, + "loss": 0.5922, + "step": 883 + }, + { + "epoch": 0.9888143176733781, + "grad_norm": 0.43406492471694946, + "learning_rate": 4.731297089649704e-06, + "loss": 0.6234, + "step": 884 + }, + { + "epoch": 0.9899328859060402, + "grad_norm": 0.443352073431015, + "learning_rate": 4.730623777199115e-06, + "loss": 0.6397, + "step": 885 + }, + { + "epoch": 0.9910514541387024, + "grad_norm": 0.4311400353908539, + "learning_rate": 4.7299496702478e-06, + "loss": 0.6073, + "step": 886 + }, + { + "epoch": 0.9921700223713646, + "grad_norm": 0.4217356741428375, + "learning_rate": 4.729274769035861e-06, + "loss": 0.6177, + "step": 887 + }, + { + "epoch": 0.9932885906040269, + "grad_norm": 0.45120054483413696, + "learning_rate": 4.728599073803685e-06, + "loss": 0.6181, + "step": 888 + }, + { + "epoch": 0.9944071588366891, + "grad_norm": 0.4567187428474426, + "learning_rate": 4.7279225847919375e-06, + "loss": 0.5839, + "step": 889 + }, + { + "epoch": 0.9955257270693513, + "grad_norm": 0.4429774880409241, + "learning_rate": 4.727245302241572e-06, + "loss": 0.6033, + "step": 890 + }, + { + "epoch": 0.9966442953020134, + "grad_norm": 0.44755569100379944, + "learning_rate": 4.726567226393821e-06, + "loss": 0.5877, + "step": 891 + }, + { + "epoch": 0.9977628635346756, + "grad_norm": 0.45908474922180176, + "learning_rate": 4.725888357490201e-06, + "loss": 0.6017, + "step": 892 + }, + { + "epoch": 0.9988814317673378, + "grad_norm": 0.4454707205295563, + "learning_rate": 4.725208695772511e-06, + "loss": 0.6007, + "step": 893 + }, + { + "epoch": 1.0, + "grad_norm": 0.43688732385635376, + "learning_rate": 4.7245282414828305e-06, + "loss": 0.6202, + "step": 894 + }, + { + "epoch": 1.0011185682326622, + "grad_norm": 0.43844303488731384, + "learning_rate": 4.723846994863524e-06, + "loss": 0.5886, + "step": 895 + }, + { + "epoch": 1.0022371364653244, + "grad_norm": 0.4296126067638397, + "learning_rate": 4.7231649561572376e-06, + "loss": 0.5759, + "step": 896 + }, + { + "epoch": 1.0033557046979866, + "grad_norm": 0.4716373682022095, + "learning_rate": 4.722482125606898e-06, + "loss": 0.6078, + "step": 897 + }, + { + "epoch": 1.0044742729306488, + "grad_norm": 0.45032379031181335, + "learning_rate": 4.721798503455716e-06, + "loss": 0.6145, + "step": 898 + }, + { + "epoch": 1.005592841163311, + "grad_norm": 0.42808797955513, + "learning_rate": 4.721114089947181e-06, + "loss": 0.5446, + "step": 899 + }, + { + "epoch": 1.0067114093959733, + "grad_norm": 0.4501192271709442, + "learning_rate": 4.7204288853250694e-06, + "loss": 0.5951, + "step": 900 + }, + { + "epoch": 1.0078299776286352, + "grad_norm": 0.43108487129211426, + "learning_rate": 4.719742889833434e-06, + "loss": 0.5826, + "step": 901 + }, + { + "epoch": 1.0089485458612975, + "grad_norm": 0.44014376401901245, + "learning_rate": 4.7190561037166135e-06, + "loss": 0.6138, + "step": 902 + }, + { + "epoch": 1.0100671140939597, + "grad_norm": 0.4504956305027008, + "learning_rate": 4.718368527219226e-06, + "loss": 0.6125, + "step": 903 + }, + { + "epoch": 1.0111856823266219, + "grad_norm": 0.4215417504310608, + "learning_rate": 4.717680160586172e-06, + "loss": 0.5761, + "step": 904 + }, + { + "epoch": 1.012304250559284, + "grad_norm": 0.44780582189559937, + "learning_rate": 4.716991004062632e-06, + "loss": 0.6056, + "step": 905 + }, + { + "epoch": 1.0134228187919463, + "grad_norm": 0.462046355009079, + "learning_rate": 4.7163010578940695e-06, + "loss": 0.6188, + "step": 906 + }, + { + "epoch": 1.0145413870246085, + "grad_norm": 0.42324090003967285, + "learning_rate": 4.715610322326229e-06, + "loss": 0.6086, + "step": 907 + }, + { + "epoch": 1.0156599552572707, + "grad_norm": 0.4330340623855591, + "learning_rate": 4.714918797605135e-06, + "loss": 0.5894, + "step": 908 + }, + { + "epoch": 1.016778523489933, + "grad_norm": 0.4506567418575287, + "learning_rate": 4.714226483977095e-06, + "loss": 0.6048, + "step": 909 + }, + { + "epoch": 1.0178970917225951, + "grad_norm": 0.4513271152973175, + "learning_rate": 4.713533381688695e-06, + "loss": 0.6139, + "step": 910 + }, + { + "epoch": 1.0190156599552573, + "grad_norm": 0.46496498584747314, + "learning_rate": 4.712839490986804e-06, + "loss": 0.607, + "step": 911 + }, + { + "epoch": 1.0201342281879195, + "grad_norm": 0.4421592354774475, + "learning_rate": 4.7121448121185716e-06, + "loss": 0.6044, + "step": 912 + }, + { + "epoch": 1.0212527964205818, + "grad_norm": 0.4436202943325043, + "learning_rate": 4.711449345331427e-06, + "loss": 0.6216, + "step": 913 + }, + { + "epoch": 1.0223713646532437, + "grad_norm": 0.43512076139450073, + "learning_rate": 4.7107530908730815e-06, + "loss": 0.5738, + "step": 914 + }, + { + "epoch": 1.023489932885906, + "grad_norm": 0.4467792510986328, + "learning_rate": 4.710056048991525e-06, + "loss": 0.6205, + "step": 915 + }, + { + "epoch": 1.0246085011185682, + "grad_norm": 0.4396659731864929, + "learning_rate": 4.709358219935028e-06, + "loss": 0.5902, + "step": 916 + }, + { + "epoch": 1.0257270693512304, + "grad_norm": 0.4392754137516022, + "learning_rate": 4.708659603952146e-06, + "loss": 0.6032, + "step": 917 + }, + { + "epoch": 1.0268456375838926, + "grad_norm": 0.4519026577472687, + "learning_rate": 4.707960201291708e-06, + "loss": 0.6024, + "step": 918 + }, + { + "epoch": 1.0279642058165548, + "grad_norm": 0.45630815625190735, + "learning_rate": 4.707260012202826e-06, + "loss": 0.5928, + "step": 919 + }, + { + "epoch": 1.029082774049217, + "grad_norm": 0.43156692385673523, + "learning_rate": 4.706559036934896e-06, + "loss": 0.5819, + "step": 920 + }, + { + "epoch": 1.0302013422818792, + "grad_norm": 0.4461636543273926, + "learning_rate": 4.705857275737587e-06, + "loss": 0.5727, + "step": 921 + }, + { + "epoch": 1.0313199105145414, + "grad_norm": 0.43878501653671265, + "learning_rate": 4.705154728860853e-06, + "loss": 0.5784, + "step": 922 + }, + { + "epoch": 1.0324384787472036, + "grad_norm": 0.42566847801208496, + "learning_rate": 4.704451396554925e-06, + "loss": 0.5607, + "step": 923 + }, + { + "epoch": 1.0335570469798658, + "grad_norm": 0.43960490822792053, + "learning_rate": 4.703747279070318e-06, + "loss": 0.6085, + "step": 924 + }, + { + "epoch": 1.034675615212528, + "grad_norm": 0.4411300718784332, + "learning_rate": 4.7030423766578194e-06, + "loss": 0.5821, + "step": 925 + }, + { + "epoch": 1.0357941834451903, + "grad_norm": 0.43742841482162476, + "learning_rate": 4.702336689568503e-06, + "loss": 0.5791, + "step": 926 + }, + { + "epoch": 1.0369127516778525, + "grad_norm": 0.4498448669910431, + "learning_rate": 4.70163021805372e-06, + "loss": 0.5803, + "step": 927 + }, + { + "epoch": 1.0380313199105144, + "grad_norm": 0.44952791929244995, + "learning_rate": 4.7009229623650986e-06, + "loss": 0.5954, + "step": 928 + }, + { + "epoch": 1.0391498881431767, + "grad_norm": 0.44882333278656006, + "learning_rate": 4.7002149227545505e-06, + "loss": 0.5866, + "step": 929 + }, + { + "epoch": 1.0402684563758389, + "grad_norm": 0.43540847301483154, + "learning_rate": 4.699506099474261e-06, + "loss": 0.5742, + "step": 930 + }, + { + "epoch": 1.041387024608501, + "grad_norm": 0.44182834029197693, + "learning_rate": 4.6987964927767015e-06, + "loss": 0.5712, + "step": 931 + }, + { + "epoch": 1.0425055928411633, + "grad_norm": 0.43853750824928284, + "learning_rate": 4.6980861029146174e-06, + "loss": 0.5847, + "step": 932 + }, + { + "epoch": 1.0436241610738255, + "grad_norm": 0.4608674943447113, + "learning_rate": 4.697374930141034e-06, + "loss": 0.6076, + "step": 933 + }, + { + "epoch": 1.0447427293064877, + "grad_norm": 0.45616960525512695, + "learning_rate": 4.696662974709256e-06, + "loss": 0.6088, + "step": 934 + }, + { + "epoch": 1.04586129753915, + "grad_norm": 0.44517964124679565, + "learning_rate": 4.695950236872867e-06, + "loss": 0.5795, + "step": 935 + }, + { + "epoch": 1.0469798657718121, + "grad_norm": 0.44067806005477905, + "learning_rate": 4.69523671688573e-06, + "loss": 0.6035, + "step": 936 + }, + { + "epoch": 1.0480984340044743, + "grad_norm": 0.44141000509262085, + "learning_rate": 4.694522415001984e-06, + "loss": 0.5676, + "step": 937 + }, + { + "epoch": 1.0492170022371365, + "grad_norm": 0.47550198435783386, + "learning_rate": 4.693807331476049e-06, + "loss": 0.5774, + "step": 938 + }, + { + "epoch": 1.0503355704697988, + "grad_norm": 0.4472981095314026, + "learning_rate": 4.6930914665626215e-06, + "loss": 0.5884, + "step": 939 + }, + { + "epoch": 1.0514541387024607, + "grad_norm": 0.4382828176021576, + "learning_rate": 4.692374820516679e-06, + "loss": 0.5941, + "step": 940 + }, + { + "epoch": 1.052572706935123, + "grad_norm": 0.4618227481842041, + "learning_rate": 4.691657393593475e-06, + "loss": 0.6139, + "step": 941 + }, + { + "epoch": 1.0536912751677852, + "grad_norm": 0.44602257013320923, + "learning_rate": 4.690939186048541e-06, + "loss": 0.5824, + "step": 942 + }, + { + "epoch": 1.0548098434004474, + "grad_norm": 0.4528485834598541, + "learning_rate": 4.690220198137688e-06, + "loss": 0.6059, + "step": 943 + }, + { + "epoch": 1.0559284116331096, + "grad_norm": 0.4452129304409027, + "learning_rate": 4.689500430117005e-06, + "loss": 0.5792, + "step": 944 + }, + { + "epoch": 1.0570469798657718, + "grad_norm": 0.4420737028121948, + "learning_rate": 4.688779882242855e-06, + "loss": 0.5834, + "step": 945 + }, + { + "epoch": 1.058165548098434, + "grad_norm": 0.46812739968299866, + "learning_rate": 4.6880585547718845e-06, + "loss": 0.6169, + "step": 946 + }, + { + "epoch": 1.0592841163310962, + "grad_norm": 0.4635341465473175, + "learning_rate": 4.687336447961015e-06, + "loss": 0.5954, + "step": 947 + }, + { + "epoch": 1.0604026845637584, + "grad_norm": 0.45534729957580566, + "learning_rate": 4.686613562067444e-06, + "loss": 0.5906, + "step": 948 + }, + { + "epoch": 1.0615212527964206, + "grad_norm": 0.4358477294445038, + "learning_rate": 4.685889897348649e-06, + "loss": 0.5701, + "step": 949 + }, + { + "epoch": 1.0626398210290828, + "grad_norm": 0.4336836338043213, + "learning_rate": 4.685165454062385e-06, + "loss": 0.5552, + "step": 950 + }, + { + "epoch": 1.063758389261745, + "grad_norm": 0.4392858147621155, + "learning_rate": 4.684440232466682e-06, + "loss": 0.572, + "step": 951 + }, + { + "epoch": 1.0648769574944073, + "grad_norm": 0.4643237292766571, + "learning_rate": 4.683714232819848e-06, + "loss": 0.6037, + "step": 952 + }, + { + "epoch": 1.0659955257270695, + "grad_norm": 0.45478659868240356, + "learning_rate": 4.682987455380469e-06, + "loss": 0.6188, + "step": 953 + }, + { + "epoch": 1.0671140939597314, + "grad_norm": 0.4337981939315796, + "learning_rate": 4.682259900407409e-06, + "loss": 0.5715, + "step": 954 + }, + { + "epoch": 1.0682326621923937, + "grad_norm": 0.4659992456436157, + "learning_rate": 4.6815315681598065e-06, + "loss": 0.5939, + "step": 955 + }, + { + "epoch": 1.0693512304250559, + "grad_norm": 0.4571925103664398, + "learning_rate": 4.680802458897078e-06, + "loss": 0.605, + "step": 956 + }, + { + "epoch": 1.070469798657718, + "grad_norm": 0.4513433575630188, + "learning_rate": 4.6800725728789164e-06, + "loss": 0.5856, + "step": 957 + }, + { + "epoch": 1.0715883668903803, + "grad_norm": 0.45815905928611755, + "learning_rate": 4.6793419103652914e-06, + "loss": 0.6006, + "step": 958 + }, + { + "epoch": 1.0727069351230425, + "grad_norm": 0.4600979685783386, + "learning_rate": 4.678610471616451e-06, + "loss": 0.5863, + "step": 959 + }, + { + "epoch": 1.0738255033557047, + "grad_norm": 0.4419516623020172, + "learning_rate": 4.677878256892917e-06, + "loss": 0.5796, + "step": 960 + }, + { + "epoch": 1.074944071588367, + "grad_norm": 0.458377867937088, + "learning_rate": 4.677145266455489e-06, + "loss": 0.5719, + "step": 961 + }, + { + "epoch": 1.0760626398210291, + "grad_norm": 0.445962131023407, + "learning_rate": 4.676411500565241e-06, + "loss": 0.585, + "step": 962 + }, + { + "epoch": 1.0771812080536913, + "grad_norm": 0.4639750123023987, + "learning_rate": 4.675676959483528e-06, + "loss": 0.6117, + "step": 963 + }, + { + "epoch": 1.0782997762863535, + "grad_norm": 0.4576922059059143, + "learning_rate": 4.6749416434719755e-06, + "loss": 0.592, + "step": 964 + }, + { + "epoch": 1.0794183445190157, + "grad_norm": 0.45511624217033386, + "learning_rate": 4.674205552792487e-06, + "loss": 0.5888, + "step": 965 + }, + { + "epoch": 1.0805369127516777, + "grad_norm": 0.4598134756088257, + "learning_rate": 4.673468687707244e-06, + "loss": 0.5705, + "step": 966 + }, + { + "epoch": 1.08165548098434, + "grad_norm": 0.4522278606891632, + "learning_rate": 4.672731048478702e-06, + "loss": 0.5741, + "step": 967 + }, + { + "epoch": 1.0827740492170022, + "grad_norm": 0.4419742822647095, + "learning_rate": 4.671992635369592e-06, + "loss": 0.5832, + "step": 968 + }, + { + "epoch": 1.0838926174496644, + "grad_norm": 0.4407685399055481, + "learning_rate": 4.67125344864292e-06, + "loss": 0.5978, + "step": 969 + }, + { + "epoch": 1.0850111856823266, + "grad_norm": 0.4517025053501129, + "learning_rate": 4.67051348856197e-06, + "loss": 0.6115, + "step": 970 + }, + { + "epoch": 1.0861297539149888, + "grad_norm": 0.4652605354785919, + "learning_rate": 4.6697727553903e-06, + "loss": 0.5838, + "step": 971 + }, + { + "epoch": 1.087248322147651, + "grad_norm": 0.45211100578308105, + "learning_rate": 4.6690312493917424e-06, + "loss": 0.5905, + "step": 972 + }, + { + "epoch": 1.0883668903803132, + "grad_norm": 0.45999446511268616, + "learning_rate": 4.668288970830407e-06, + "loss": 0.589, + "step": 973 + }, + { + "epoch": 1.0894854586129754, + "grad_norm": 0.4458955228328705, + "learning_rate": 4.667545919970676e-06, + "loss": 0.5927, + "step": 974 + }, + { + "epoch": 1.0906040268456376, + "grad_norm": 0.46779292821884155, + "learning_rate": 4.666802097077211e-06, + "loss": 0.5955, + "step": 975 + }, + { + "epoch": 1.0917225950782998, + "grad_norm": 0.4617951810359955, + "learning_rate": 4.666057502414942e-06, + "loss": 0.6117, + "step": 976 + }, + { + "epoch": 1.092841163310962, + "grad_norm": 0.45327237248420715, + "learning_rate": 4.665312136249082e-06, + "loss": 0.5863, + "step": 977 + }, + { + "epoch": 1.0939597315436242, + "grad_norm": 0.4517632722854614, + "learning_rate": 4.664565998845112e-06, + "loss": 0.5832, + "step": 978 + }, + { + "epoch": 1.0950782997762865, + "grad_norm": 0.43310609459877014, + "learning_rate": 4.663819090468791e-06, + "loss": 0.5647, + "step": 979 + }, + { + "epoch": 1.0961968680089484, + "grad_norm": 0.47837111353874207, + "learning_rate": 4.6630714113861505e-06, + "loss": 0.6195, + "step": 980 + }, + { + "epoch": 1.0973154362416107, + "grad_norm": 0.4558872878551483, + "learning_rate": 4.662322961863501e-06, + "loss": 0.5933, + "step": 981 + }, + { + "epoch": 1.0984340044742729, + "grad_norm": 0.45069193840026855, + "learning_rate": 4.661573742167421e-06, + "loss": 0.6066, + "step": 982 + }, + { + "epoch": 1.099552572706935, + "grad_norm": 0.44924792647361755, + "learning_rate": 4.660823752564769e-06, + "loss": 0.5773, + "step": 983 + }, + { + "epoch": 1.1006711409395973, + "grad_norm": 0.44701021909713745, + "learning_rate": 4.660072993322674e-06, + "loss": 0.5663, + "step": 984 + }, + { + "epoch": 1.1017897091722595, + "grad_norm": 0.4506109952926636, + "learning_rate": 4.659321464708541e-06, + "loss": 0.593, + "step": 985 + }, + { + "epoch": 1.1029082774049217, + "grad_norm": 0.4586380422115326, + "learning_rate": 4.658569166990048e-06, + "loss": 0.6014, + "step": 986 + }, + { + "epoch": 1.104026845637584, + "grad_norm": 0.45552054047584534, + "learning_rate": 4.657816100435147e-06, + "loss": 0.6043, + "step": 987 + }, + { + "epoch": 1.1051454138702461, + "grad_norm": 0.4566228985786438, + "learning_rate": 4.657062265312065e-06, + "loss": 0.6088, + "step": 988 + }, + { + "epoch": 1.1062639821029083, + "grad_norm": 0.45366841554641724, + "learning_rate": 4.6563076618893045e-06, + "loss": 0.5918, + "step": 989 + }, + { + "epoch": 1.1073825503355705, + "grad_norm": 0.46025100350379944, + "learning_rate": 4.6555522904356344e-06, + "loss": 0.5876, + "step": 990 + }, + { + "epoch": 1.1085011185682327, + "grad_norm": 0.46731919050216675, + "learning_rate": 4.654796151220106e-06, + "loss": 0.5992, + "step": 991 + }, + { + "epoch": 1.109619686800895, + "grad_norm": 0.45841652154922485, + "learning_rate": 4.654039244512036e-06, + "loss": 0.5833, + "step": 992 + }, + { + "epoch": 1.110738255033557, + "grad_norm": 0.46612823009490967, + "learning_rate": 4.653281570581023e-06, + "loss": 0.6137, + "step": 993 + }, + { + "epoch": 1.1118568232662192, + "grad_norm": 0.4462451934814453, + "learning_rate": 4.6525231296969305e-06, + "loss": 0.545, + "step": 994 + }, + { + "epoch": 1.1129753914988814, + "grad_norm": 0.4724692404270172, + "learning_rate": 4.651763922129901e-06, + "loss": 0.6144, + "step": 995 + }, + { + "epoch": 1.1140939597315436, + "grad_norm": 0.47240084409713745, + "learning_rate": 4.651003948150349e-06, + "loss": 0.6037, + "step": 996 + }, + { + "epoch": 1.1152125279642058, + "grad_norm": 0.44295522570610046, + "learning_rate": 4.650243208028958e-06, + "loss": 0.5571, + "step": 997 + }, + { + "epoch": 1.116331096196868, + "grad_norm": 0.4727005958557129, + "learning_rate": 4.649481702036691e-06, + "loss": 0.5851, + "step": 998 + }, + { + "epoch": 1.1174496644295302, + "grad_norm": 0.4520561397075653, + "learning_rate": 4.648719430444777e-06, + "loss": 0.6016, + "step": 999 + }, + { + "epoch": 1.1185682326621924, + "grad_norm": 0.5022072792053223, + "learning_rate": 4.647956393524723e-06, + "loss": 0.5993, + "step": 1000 + }, + { + "epoch": 1.1196868008948546, + "grad_norm": 0.45666182041168213, + "learning_rate": 4.647192591548305e-06, + "loss": 0.6092, + "step": 1001 + }, + { + "epoch": 1.1208053691275168, + "grad_norm": 0.4551488161087036, + "learning_rate": 4.646428024787575e-06, + "loss": 0.5938, + "step": 1002 + }, + { + "epoch": 1.121923937360179, + "grad_norm": 0.4763842821121216, + "learning_rate": 4.645662693514853e-06, + "loss": 0.6016, + "step": 1003 + }, + { + "epoch": 1.1230425055928412, + "grad_norm": 0.4607378840446472, + "learning_rate": 4.644896598002736e-06, + "loss": 0.5785, + "step": 1004 + }, + { + "epoch": 1.1241610738255035, + "grad_norm": 0.4715465009212494, + "learning_rate": 4.64412973852409e-06, + "loss": 0.588, + "step": 1005 + }, + { + "epoch": 1.1252796420581657, + "grad_norm": 0.47461339831352234, + "learning_rate": 4.643362115352053e-06, + "loss": 0.6296, + "step": 1006 + }, + { + "epoch": 1.1263982102908277, + "grad_norm": 0.4571302831172943, + "learning_rate": 4.642593728760038e-06, + "loss": 0.5493, + "step": 1007 + }, + { + "epoch": 1.1275167785234899, + "grad_norm": 0.46194958686828613, + "learning_rate": 4.641824579021726e-06, + "loss": 0.5884, + "step": 1008 + }, + { + "epoch": 1.128635346756152, + "grad_norm": 0.4678124487400055, + "learning_rate": 4.6410546664110736e-06, + "loss": 0.5967, + "step": 1009 + }, + { + "epoch": 1.1297539149888143, + "grad_norm": 0.45504289865493774, + "learning_rate": 4.640283991202306e-06, + "loss": 0.5696, + "step": 1010 + }, + { + "epoch": 1.1308724832214765, + "grad_norm": 0.46907755732536316, + "learning_rate": 4.639512553669921e-06, + "loss": 0.6261, + "step": 1011 + }, + { + "epoch": 1.1319910514541387, + "grad_norm": 0.46805256605148315, + "learning_rate": 4.63874035408869e-06, + "loss": 0.5696, + "step": 1012 + }, + { + "epoch": 1.133109619686801, + "grad_norm": 0.4621639549732208, + "learning_rate": 4.637967392733652e-06, + "loss": 0.563, + "step": 1013 + }, + { + "epoch": 1.1342281879194631, + "grad_norm": 0.43852928280830383, + "learning_rate": 4.6371936698801215e-06, + "loss": 0.5434, + "step": 1014 + }, + { + "epoch": 1.1353467561521253, + "grad_norm": 0.46553272008895874, + "learning_rate": 4.636419185803681e-06, + "loss": 0.5825, + "step": 1015 + }, + { + "epoch": 1.1364653243847875, + "grad_norm": 0.46866393089294434, + "learning_rate": 4.635643940780184e-06, + "loss": 0.5723, + "step": 1016 + }, + { + "epoch": 1.1375838926174497, + "grad_norm": 0.46932780742645264, + "learning_rate": 4.634867935085758e-06, + "loss": 0.5972, + "step": 1017 + }, + { + "epoch": 1.138702460850112, + "grad_norm": 0.45771515369415283, + "learning_rate": 4.634091168996801e-06, + "loss": 0.5871, + "step": 1018 + }, + { + "epoch": 1.139821029082774, + "grad_norm": 0.461588978767395, + "learning_rate": 4.633313642789976e-06, + "loss": 0.6246, + "step": 1019 + }, + { + "epoch": 1.1409395973154361, + "grad_norm": 0.47002744674682617, + "learning_rate": 4.632535356742226e-06, + "loss": 0.581, + "step": 1020 + }, + { + "epoch": 1.1420581655480984, + "grad_norm": 0.46632397174835205, + "learning_rate": 4.631756311130757e-06, + "loss": 0.6203, + "step": 1021 + }, + { + "epoch": 1.1431767337807606, + "grad_norm": 0.46423229575157166, + "learning_rate": 4.6309765062330504e-06, + "loss": 0.6074, + "step": 1022 + }, + { + "epoch": 1.1442953020134228, + "grad_norm": 0.47234588861465454, + "learning_rate": 4.630195942326855e-06, + "loss": 0.6076, + "step": 1023 + }, + { + "epoch": 1.145413870246085, + "grad_norm": 0.46395570039749146, + "learning_rate": 4.62941461969019e-06, + "loss": 0.5775, + "step": 1024 + }, + { + "epoch": 1.1465324384787472, + "grad_norm": 0.456720232963562, + "learning_rate": 4.628632538601349e-06, + "loss": 0.5924, + "step": 1025 + }, + { + "epoch": 1.1476510067114094, + "grad_norm": 0.4563044309616089, + "learning_rate": 4.62784969933889e-06, + "loss": 0.5883, + "step": 1026 + }, + { + "epoch": 1.1487695749440716, + "grad_norm": 0.46938541531562805, + "learning_rate": 4.627066102181645e-06, + "loss": 0.5602, + "step": 1027 + }, + { + "epoch": 1.1498881431767338, + "grad_norm": 0.48113343119621277, + "learning_rate": 4.626281747408713e-06, + "loss": 0.609, + "step": 1028 + }, + { + "epoch": 1.151006711409396, + "grad_norm": 0.46784207224845886, + "learning_rate": 4.6254966352994666e-06, + "loss": 0.6018, + "step": 1029 + }, + { + "epoch": 1.1521252796420582, + "grad_norm": 0.46613121032714844, + "learning_rate": 4.624710766133544e-06, + "loss": 0.5579, + "step": 1030 + }, + { + "epoch": 1.1532438478747205, + "grad_norm": 0.4715366065502167, + "learning_rate": 4.6239241401908575e-06, + "loss": 0.5767, + "step": 1031 + }, + { + "epoch": 1.1543624161073827, + "grad_norm": 0.48206454515457153, + "learning_rate": 4.623136757751584e-06, + "loss": 0.6118, + "step": 1032 + }, + { + "epoch": 1.1554809843400446, + "grad_norm": 0.46678969264030457, + "learning_rate": 4.622348619096174e-06, + "loss": 0.5876, + "step": 1033 + }, + { + "epoch": 1.1565995525727069, + "grad_norm": 0.4533562958240509, + "learning_rate": 4.621559724505346e-06, + "loss": 0.6038, + "step": 1034 + }, + { + "epoch": 1.157718120805369, + "grad_norm": 0.473532497882843, + "learning_rate": 4.620770074260084e-06, + "loss": 0.5802, + "step": 1035 + }, + { + "epoch": 1.1588366890380313, + "grad_norm": 0.4734630882740021, + "learning_rate": 4.6199796686416505e-06, + "loss": 0.5906, + "step": 1036 + }, + { + "epoch": 1.1599552572706935, + "grad_norm": 0.48316991329193115, + "learning_rate": 4.6191885079315665e-06, + "loss": 0.561, + "step": 1037 + }, + { + "epoch": 1.1610738255033557, + "grad_norm": 0.4646005630493164, + "learning_rate": 4.618396592411628e-06, + "loss": 0.5901, + "step": 1038 + }, + { + "epoch": 1.162192393736018, + "grad_norm": 0.4684715270996094, + "learning_rate": 4.617603922363899e-06, + "loss": 0.5915, + "step": 1039 + }, + { + "epoch": 1.1633109619686801, + "grad_norm": 0.46257463097572327, + "learning_rate": 4.6168104980707105e-06, + "loss": 0.577, + "step": 1040 + }, + { + "epoch": 1.1644295302013423, + "grad_norm": 0.4703442454338074, + "learning_rate": 4.616016319814664e-06, + "loss": 0.564, + "step": 1041 + }, + { + "epoch": 1.1655480984340045, + "grad_norm": 0.4818289577960968, + "learning_rate": 4.615221387878631e-06, + "loss": 0.5955, + "step": 1042 + }, + { + "epoch": 1.1666666666666667, + "grad_norm": 0.4658922553062439, + "learning_rate": 4.614425702545745e-06, + "loss": 0.5944, + "step": 1043 + }, + { + "epoch": 1.167785234899329, + "grad_norm": 0.4584915041923523, + "learning_rate": 4.613629264099416e-06, + "loss": 0.5625, + "step": 1044 + }, + { + "epoch": 1.168903803131991, + "grad_norm": 0.4565418064594269, + "learning_rate": 4.6128320728233164e-06, + "loss": 0.5942, + "step": 1045 + }, + { + "epoch": 1.1700223713646531, + "grad_norm": 0.48082879185676575, + "learning_rate": 4.61203412900139e-06, + "loss": 0.6366, + "step": 1046 + }, + { + "epoch": 1.1711409395973154, + "grad_norm": 0.47267577052116394, + "learning_rate": 4.611235432917845e-06, + "loss": 0.5804, + "step": 1047 + }, + { + "epoch": 1.1722595078299776, + "grad_norm": 0.48486819863319397, + "learning_rate": 4.610435984857163e-06, + "loss": 0.5755, + "step": 1048 + }, + { + "epoch": 1.1733780760626398, + "grad_norm": 0.48791539669036865, + "learning_rate": 4.609635785104089e-06, + "loss": 0.6168, + "step": 1049 + }, + { + "epoch": 1.174496644295302, + "grad_norm": 0.45250606536865234, + "learning_rate": 4.6088348339436365e-06, + "loss": 0.5833, + "step": 1050 + }, + { + "epoch": 1.1756152125279642, + "grad_norm": 0.4777594804763794, + "learning_rate": 4.608033131661089e-06, + "loss": 0.6048, + "step": 1051 + }, + { + "epoch": 1.1767337807606264, + "grad_norm": 0.47242894768714905, + "learning_rate": 4.607230678541993e-06, + "loss": 0.5945, + "step": 1052 + }, + { + "epoch": 1.1778523489932886, + "grad_norm": 0.47580021619796753, + "learning_rate": 4.606427474872169e-06, + "loss": 0.593, + "step": 1053 + }, + { + "epoch": 1.1789709172259508, + "grad_norm": 0.45570215582847595, + "learning_rate": 4.605623520937698e-06, + "loss": 0.5617, + "step": 1054 + }, + { + "epoch": 1.180089485458613, + "grad_norm": 0.4642491936683655, + "learning_rate": 4.604818817024932e-06, + "loss": 0.604, + "step": 1055 + }, + { + "epoch": 1.1812080536912752, + "grad_norm": 0.46808069944381714, + "learning_rate": 4.60401336342049e-06, + "loss": 0.5829, + "step": 1056 + }, + { + "epoch": 1.1823266219239374, + "grad_norm": 0.482730507850647, + "learning_rate": 4.603207160411257e-06, + "loss": 0.6246, + "step": 1057 + }, + { + "epoch": 1.1834451901565997, + "grad_norm": 0.4554171562194824, + "learning_rate": 4.6024002082843855e-06, + "loss": 0.5797, + "step": 1058 + }, + { + "epoch": 1.1845637583892619, + "grad_norm": 0.4731457233428955, + "learning_rate": 4.6015925073272945e-06, + "loss": 0.5883, + "step": 1059 + }, + { + "epoch": 1.1856823266219239, + "grad_norm": 0.45422542095184326, + "learning_rate": 4.600784057827672e-06, + "loss": 0.5914, + "step": 1060 + }, + { + "epoch": 1.186800894854586, + "grad_norm": 0.4871719181537628, + "learning_rate": 4.599974860073466e-06, + "loss": 0.6146, + "step": 1061 + }, + { + "epoch": 1.1879194630872483, + "grad_norm": 0.47315844893455505, + "learning_rate": 4.5991649143529e-06, + "loss": 0.6044, + "step": 1062 + }, + { + "epoch": 1.1890380313199105, + "grad_norm": 0.45965805649757385, + "learning_rate": 4.5983542209544565e-06, + "loss": 0.5628, + "step": 1063 + }, + { + "epoch": 1.1901565995525727, + "grad_norm": 0.4573105573654175, + "learning_rate": 4.5975427801668886e-06, + "loss": 0.581, + "step": 1064 + }, + { + "epoch": 1.191275167785235, + "grad_norm": 0.45358559489250183, + "learning_rate": 4.596730592279214e-06, + "loss": 0.5851, + "step": 1065 + }, + { + "epoch": 1.192393736017897, + "grad_norm": 0.45561692118644714, + "learning_rate": 4.595917657580716e-06, + "loss": 0.5636, + "step": 1066 + }, + { + "epoch": 1.1935123042505593, + "grad_norm": 0.45746201276779175, + "learning_rate": 4.5951039763609464e-06, + "loss": 0.5943, + "step": 1067 + }, + { + "epoch": 1.1946308724832215, + "grad_norm": 0.4627397656440735, + "learning_rate": 4.594289548909719e-06, + "loss": 0.598, + "step": 1068 + }, + { + "epoch": 1.1957494407158837, + "grad_norm": 0.4608605206012726, + "learning_rate": 4.593474375517118e-06, + "loss": 0.5861, + "step": 1069 + }, + { + "epoch": 1.196868008948546, + "grad_norm": 0.45715343952178955, + "learning_rate": 4.592658456473489e-06, + "loss": 0.6024, + "step": 1070 + }, + { + "epoch": 1.197986577181208, + "grad_norm": 0.4674588143825531, + "learning_rate": 4.591841792069445e-06, + "loss": 0.5779, + "step": 1071 + }, + { + "epoch": 1.1991051454138701, + "grad_norm": 0.4705957770347595, + "learning_rate": 4.591024382595864e-06, + "loss": 0.588, + "step": 1072 + }, + { + "epoch": 1.2002237136465324, + "grad_norm": 0.47805118560791016, + "learning_rate": 4.590206228343892e-06, + "loss": 0.5796, + "step": 1073 + }, + { + "epoch": 1.2013422818791946, + "grad_norm": 0.45588305592536926, + "learning_rate": 4.589387329604936e-06, + "loss": 0.5653, + "step": 1074 + }, + { + "epoch": 1.2024608501118568, + "grad_norm": 0.47811266779899597, + "learning_rate": 4.588567686670672e-06, + "loss": 0.6207, + "step": 1075 + }, + { + "epoch": 1.203579418344519, + "grad_norm": 0.46762150526046753, + "learning_rate": 4.587747299833039e-06, + "loss": 0.5697, + "step": 1076 + }, + { + "epoch": 1.2046979865771812, + "grad_norm": 0.46018633246421814, + "learning_rate": 4.586926169384239e-06, + "loss": 0.5772, + "step": 1077 + }, + { + "epoch": 1.2058165548098434, + "grad_norm": 0.46897298097610474, + "learning_rate": 4.586104295616744e-06, + "loss": 0.5733, + "step": 1078 + }, + { + "epoch": 1.2069351230425056, + "grad_norm": 0.466189980506897, + "learning_rate": 4.585281678823288e-06, + "loss": 0.6027, + "step": 1079 + }, + { + "epoch": 1.2080536912751678, + "grad_norm": 0.4684281647205353, + "learning_rate": 4.584458319296868e-06, + "loss": 0.5766, + "step": 1080 + }, + { + "epoch": 1.20917225950783, + "grad_norm": 0.4513910710811615, + "learning_rate": 4.583634217330747e-06, + "loss": 0.5891, + "step": 1081 + }, + { + "epoch": 1.2102908277404922, + "grad_norm": 0.476510226726532, + "learning_rate": 4.5828093732184545e-06, + "loss": 0.601, + "step": 1082 + }, + { + "epoch": 1.2114093959731544, + "grad_norm": 0.47996675968170166, + "learning_rate": 4.581983787253781e-06, + "loss": 0.598, + "step": 1083 + }, + { + "epoch": 1.2125279642058167, + "grad_norm": 0.4757002890110016, + "learning_rate": 4.581157459730783e-06, + "loss": 0.5561, + "step": 1084 + }, + { + "epoch": 1.2136465324384789, + "grad_norm": 0.45650506019592285, + "learning_rate": 4.5803303909437805e-06, + "loss": 0.5702, + "step": 1085 + }, + { + "epoch": 1.2147651006711409, + "grad_norm": 0.4679136872291565, + "learning_rate": 4.579502581187358e-06, + "loss": 0.5854, + "step": 1086 + }, + { + "epoch": 1.215883668903803, + "grad_norm": 0.46434950828552246, + "learning_rate": 4.578674030756364e-06, + "loss": 0.5623, + "step": 1087 + }, + { + "epoch": 1.2170022371364653, + "grad_norm": 0.4812433123588562, + "learning_rate": 4.57784473994591e-06, + "loss": 0.6006, + "step": 1088 + }, + { + "epoch": 1.2181208053691275, + "grad_norm": 0.4599030017852783, + "learning_rate": 4.577014709051372e-06, + "loss": 0.5614, + "step": 1089 + }, + { + "epoch": 1.2192393736017897, + "grad_norm": 0.48703038692474365, + "learning_rate": 4.576183938368389e-06, + "loss": 0.5987, + "step": 1090 + }, + { + "epoch": 1.220357941834452, + "grad_norm": 0.48404622077941895, + "learning_rate": 4.575352428192865e-06, + "loss": 0.592, + "step": 1091 + }, + { + "epoch": 1.221476510067114, + "grad_norm": 0.47787612676620483, + "learning_rate": 4.574520178820965e-06, + "loss": 0.5733, + "step": 1092 + }, + { + "epoch": 1.2225950782997763, + "grad_norm": 0.4672394394874573, + "learning_rate": 4.573687190549119e-06, + "loss": 0.5736, + "step": 1093 + }, + { + "epoch": 1.2237136465324385, + "grad_norm": 0.47233250737190247, + "learning_rate": 4.57285346367402e-06, + "loss": 0.6045, + "step": 1094 + }, + { + "epoch": 1.2248322147651007, + "grad_norm": 0.4758702516555786, + "learning_rate": 4.572018998492623e-06, + "loss": 0.5734, + "step": 1095 + }, + { + "epoch": 1.225950782997763, + "grad_norm": 0.4715832769870758, + "learning_rate": 4.571183795302147e-06, + "loss": 0.599, + "step": 1096 + }, + { + "epoch": 1.227069351230425, + "grad_norm": 0.4783090651035309, + "learning_rate": 4.570347854400074e-06, + "loss": 0.608, + "step": 1097 + }, + { + "epoch": 1.2281879194630871, + "grad_norm": 0.4721231162548065, + "learning_rate": 4.569511176084148e-06, + "loss": 0.5964, + "step": 1098 + }, + { + "epoch": 1.2293064876957494, + "grad_norm": 0.47731295228004456, + "learning_rate": 4.568673760652377e-06, + "loss": 0.5968, + "step": 1099 + }, + { + "epoch": 1.2304250559284116, + "grad_norm": 0.4623177945613861, + "learning_rate": 4.5678356084030286e-06, + "loss": 0.5669, + "step": 1100 + }, + { + "epoch": 1.2315436241610738, + "grad_norm": 0.46821466088294983, + "learning_rate": 4.566996719634636e-06, + "loss": 0.5895, + "step": 1101 + }, + { + "epoch": 1.232662192393736, + "grad_norm": 0.47642552852630615, + "learning_rate": 4.566157094645994e-06, + "loss": 0.5645, + "step": 1102 + }, + { + "epoch": 1.2337807606263982, + "grad_norm": 0.46859392523765564, + "learning_rate": 4.565316733736159e-06, + "loss": 0.6204, + "step": 1103 + }, + { + "epoch": 1.2348993288590604, + "grad_norm": 0.47033944725990295, + "learning_rate": 4.564475637204449e-06, + "loss": 0.5884, + "step": 1104 + }, + { + "epoch": 1.2360178970917226, + "grad_norm": 0.500777006149292, + "learning_rate": 4.563633805350443e-06, + "loss": 0.5922, + "step": 1105 + }, + { + "epoch": 1.2371364653243848, + "grad_norm": 0.49536949396133423, + "learning_rate": 4.562791238473988e-06, + "loss": 0.5953, + "step": 1106 + }, + { + "epoch": 1.238255033557047, + "grad_norm": 0.47423887252807617, + "learning_rate": 4.5619479368751855e-06, + "loss": 0.5772, + "step": 1107 + }, + { + "epoch": 1.2393736017897092, + "grad_norm": 0.5001300573348999, + "learning_rate": 4.561103900854401e-06, + "loss": 0.6128, + "step": 1108 + }, + { + "epoch": 1.2404921700223714, + "grad_norm": 0.4864799380302429, + "learning_rate": 4.560259130712264e-06, + "loss": 0.5821, + "step": 1109 + }, + { + "epoch": 1.2416107382550337, + "grad_norm": 0.5224222540855408, + "learning_rate": 4.559413626749662e-06, + "loss": 0.6192, + "step": 1110 + }, + { + "epoch": 1.2427293064876959, + "grad_norm": 0.4829416275024414, + "learning_rate": 4.558567389267748e-06, + "loss": 0.5825, + "step": 1111 + }, + { + "epoch": 1.2438478747203578, + "grad_norm": 0.46932196617126465, + "learning_rate": 4.557720418567931e-06, + "loss": 0.5887, + "step": 1112 + }, + { + "epoch": 1.24496644295302, + "grad_norm": 0.47189292311668396, + "learning_rate": 4.556872714951886e-06, + "loss": 0.5777, + "step": 1113 + }, + { + "epoch": 1.2460850111856823, + "grad_norm": 0.47725993394851685, + "learning_rate": 4.5560242787215444e-06, + "loss": 0.5801, + "step": 1114 + }, + { + "epoch": 1.2472035794183445, + "grad_norm": 0.4747818112373352, + "learning_rate": 4.555175110179104e-06, + "loss": 0.5833, + "step": 1115 + }, + { + "epoch": 1.2483221476510067, + "grad_norm": 0.48167556524276733, + "learning_rate": 4.554325209627019e-06, + "loss": 0.5917, + "step": 1116 + }, + { + "epoch": 1.249440715883669, + "grad_norm": 0.4757511615753174, + "learning_rate": 4.553474577368006e-06, + "loss": 0.5955, + "step": 1117 + }, + { + "epoch": 1.250559284116331, + "grad_norm": 0.4677585959434509, + "learning_rate": 4.552623213705043e-06, + "loss": 0.5864, + "step": 1118 + }, + { + "epoch": 1.2516778523489933, + "grad_norm": 0.46574166417121887, + "learning_rate": 4.551771118941367e-06, + "loss": 0.562, + "step": 1119 + }, + { + "epoch": 1.2527964205816555, + "grad_norm": 0.46505433320999146, + "learning_rate": 4.5509182933804754e-06, + "loss": 0.5829, + "step": 1120 + }, + { + "epoch": 1.2539149888143177, + "grad_norm": 0.47223830223083496, + "learning_rate": 4.550064737326127e-06, + "loss": 0.5833, + "step": 1121 + }, + { + "epoch": 1.25503355704698, + "grad_norm": 0.4903958737850189, + "learning_rate": 4.549210451082342e-06, + "loss": 0.5612, + "step": 1122 + }, + { + "epoch": 1.256152125279642, + "grad_norm": 0.4613335132598877, + "learning_rate": 4.548355434953397e-06, + "loss": 0.5826, + "step": 1123 + }, + { + "epoch": 1.2572706935123041, + "grad_norm": 0.47294437885284424, + "learning_rate": 4.5474996892438296e-06, + "loss": 0.5834, + "step": 1124 + }, + { + "epoch": 1.2583892617449663, + "grad_norm": 0.4580760598182678, + "learning_rate": 4.546643214258441e-06, + "loss": 0.6002, + "step": 1125 + }, + { + "epoch": 1.2595078299776286, + "grad_norm": 0.4971761703491211, + "learning_rate": 4.545786010302287e-06, + "loss": 0.6082, + "step": 1126 + }, + { + "epoch": 1.2606263982102908, + "grad_norm": 0.47834956645965576, + "learning_rate": 4.544928077680687e-06, + "loss": 0.5697, + "step": 1127 + }, + { + "epoch": 1.261744966442953, + "grad_norm": 0.47802045941352844, + "learning_rate": 4.5440694166992175e-06, + "loss": 0.5709, + "step": 1128 + }, + { + "epoch": 1.2628635346756152, + "grad_norm": 0.46790894865989685, + "learning_rate": 4.5432100276637156e-06, + "loss": 0.574, + "step": 1129 + }, + { + "epoch": 1.2639821029082774, + "grad_norm": 0.4805370271205902, + "learning_rate": 4.542349910880277e-06, + "loss": 0.5665, + "step": 1130 + }, + { + "epoch": 1.2651006711409396, + "grad_norm": 0.4909112751483917, + "learning_rate": 4.5414890666552575e-06, + "loss": 0.5853, + "step": 1131 + }, + { + "epoch": 1.2662192393736018, + "grad_norm": 0.4830676317214966, + "learning_rate": 4.540627495295271e-06, + "loss": 0.5977, + "step": 1132 + }, + { + "epoch": 1.267337807606264, + "grad_norm": 0.48109954595565796, + "learning_rate": 4.539765197107191e-06, + "loss": 0.6083, + "step": 1133 + }, + { + "epoch": 1.2684563758389262, + "grad_norm": 0.48772624135017395, + "learning_rate": 4.538902172398151e-06, + "loss": 0.5858, + "step": 1134 + }, + { + "epoch": 1.2695749440715884, + "grad_norm": 0.4889027774333954, + "learning_rate": 4.53803842147554e-06, + "loss": 0.6107, + "step": 1135 + }, + { + "epoch": 1.2706935123042506, + "grad_norm": 0.4655384123325348, + "learning_rate": 4.5371739446470085e-06, + "loss": 0.575, + "step": 1136 + }, + { + "epoch": 1.2718120805369129, + "grad_norm": 0.5289780497550964, + "learning_rate": 4.536308742220466e-06, + "loss": 0.5875, + "step": 1137 + }, + { + "epoch": 1.272930648769575, + "grad_norm": 0.44917261600494385, + "learning_rate": 4.535442814504077e-06, + "loss": 0.5555, + "step": 1138 + }, + { + "epoch": 1.274049217002237, + "grad_norm": 0.45869240164756775, + "learning_rate": 4.534576161806269e-06, + "loss": 0.5673, + "step": 1139 + }, + { + "epoch": 1.2751677852348993, + "grad_norm": 0.47333377599716187, + "learning_rate": 4.533708784435722e-06, + "loss": 0.5843, + "step": 1140 + }, + { + "epoch": 1.2762863534675615, + "grad_norm": 0.4835321307182312, + "learning_rate": 4.5328406827013806e-06, + "loss": 0.6049, + "step": 1141 + }, + { + "epoch": 1.2774049217002237, + "grad_norm": 0.4710775315761566, + "learning_rate": 4.531971856912443e-06, + "loss": 0.5723, + "step": 1142 + }, + { + "epoch": 1.278523489932886, + "grad_norm": 0.48879000544548035, + "learning_rate": 4.531102307378366e-06, + "loss": 0.5847, + "step": 1143 + }, + { + "epoch": 1.279642058165548, + "grad_norm": 0.476485013961792, + "learning_rate": 4.530232034408864e-06, + "loss": 0.583, + "step": 1144 + }, + { + "epoch": 1.2807606263982103, + "grad_norm": 0.48762884736061096, + "learning_rate": 4.529361038313912e-06, + "loss": 0.6017, + "step": 1145 + }, + { + "epoch": 1.2818791946308725, + "grad_norm": 0.4625467360019684, + "learning_rate": 4.528489319403737e-06, + "loss": 0.5835, + "step": 1146 + }, + { + "epoch": 1.2829977628635347, + "grad_norm": 0.47126954793930054, + "learning_rate": 4.52761687798883e-06, + "loss": 0.5503, + "step": 1147 + }, + { + "epoch": 1.284116331096197, + "grad_norm": 0.46396777033805847, + "learning_rate": 4.526743714379934e-06, + "loss": 0.5669, + "step": 1148 + }, + { + "epoch": 1.285234899328859, + "grad_norm": 0.4775722920894623, + "learning_rate": 4.5258698288880535e-06, + "loss": 0.5727, + "step": 1149 + }, + { + "epoch": 1.2863534675615211, + "grad_norm": 0.5126634240150452, + "learning_rate": 4.524995221824445e-06, + "loss": 0.6249, + "step": 1150 + }, + { + "epoch": 1.2874720357941833, + "grad_norm": 0.47814854979515076, + "learning_rate": 4.524119893500627e-06, + "loss": 0.5881, + "step": 1151 + }, + { + "epoch": 1.2885906040268456, + "grad_norm": 0.47940847277641296, + "learning_rate": 4.523243844228372e-06, + "loss": 0.562, + "step": 1152 + }, + { + "epoch": 1.2897091722595078, + "grad_norm": 0.48836034536361694, + "learning_rate": 4.52236707431971e-06, + "loss": 0.5812, + "step": 1153 + }, + { + "epoch": 1.29082774049217, + "grad_norm": 0.47201383113861084, + "learning_rate": 4.521489584086929e-06, + "loss": 0.5998, + "step": 1154 + }, + { + "epoch": 1.2919463087248322, + "grad_norm": 0.4711892604827881, + "learning_rate": 4.52061137384257e-06, + "loss": 0.5868, + "step": 1155 + }, + { + "epoch": 1.2930648769574944, + "grad_norm": 0.49769148230552673, + "learning_rate": 4.519732443899435e-06, + "loss": 0.6405, + "step": 1156 + }, + { + "epoch": 1.2941834451901566, + "grad_norm": 0.4788062870502472, + "learning_rate": 4.51885279457058e-06, + "loss": 0.5838, + "step": 1157 + }, + { + "epoch": 1.2953020134228188, + "grad_norm": 0.47234484553337097, + "learning_rate": 4.5179724261693154e-06, + "loss": 0.5794, + "step": 1158 + }, + { + "epoch": 1.296420581655481, + "grad_norm": 0.464893639087677, + "learning_rate": 4.517091339009212e-06, + "loss": 0.5741, + "step": 1159 + }, + { + "epoch": 1.2975391498881432, + "grad_norm": 0.47124040126800537, + "learning_rate": 4.516209533404092e-06, + "loss": 0.5974, + "step": 1160 + }, + { + "epoch": 1.2986577181208054, + "grad_norm": 0.470726877450943, + "learning_rate": 4.5153270096680395e-06, + "loss": 0.5686, + "step": 1161 + }, + { + "epoch": 1.2997762863534676, + "grad_norm": 0.4629981815814972, + "learning_rate": 4.514443768115386e-06, + "loss": 0.5794, + "step": 1162 + }, + { + "epoch": 1.3008948545861299, + "grad_norm": 0.4796271026134491, + "learning_rate": 4.513559809060727e-06, + "loss": 0.5741, + "step": 1163 + }, + { + "epoch": 1.302013422818792, + "grad_norm": 0.4912780523300171, + "learning_rate": 4.512675132818908e-06, + "loss": 0.5726, + "step": 1164 + }, + { + "epoch": 1.3031319910514543, + "grad_norm": 0.4748340845108032, + "learning_rate": 4.511789739705033e-06, + "loss": 0.5788, + "step": 1165 + }, + { + "epoch": 1.3042505592841163, + "grad_norm": 0.47445476055145264, + "learning_rate": 4.510903630034458e-06, + "loss": 0.5924, + "step": 1166 + }, + { + "epoch": 1.3053691275167785, + "grad_norm": 0.46978822350502014, + "learning_rate": 4.510016804122799e-06, + "loss": 0.5525, + "step": 1167 + }, + { + "epoch": 1.3064876957494407, + "grad_norm": 0.48169562220573425, + "learning_rate": 4.509129262285924e-06, + "loss": 0.5944, + "step": 1168 + }, + { + "epoch": 1.307606263982103, + "grad_norm": 0.4853328764438629, + "learning_rate": 4.5082410048399555e-06, + "loss": 0.5774, + "step": 1169 + }, + { + "epoch": 1.308724832214765, + "grad_norm": 0.4685922861099243, + "learning_rate": 4.507352032101273e-06, + "loss": 0.583, + "step": 1170 + }, + { + "epoch": 1.3098434004474273, + "grad_norm": 0.466190904378891, + "learning_rate": 4.5064623443865085e-06, + "loss": 0.5676, + "step": 1171 + }, + { + "epoch": 1.3109619686800895, + "grad_norm": 0.4621398448944092, + "learning_rate": 4.50557194201255e-06, + "loss": 0.5674, + "step": 1172 + }, + { + "epoch": 1.3120805369127517, + "grad_norm": 0.48330026865005493, + "learning_rate": 4.504680825296542e-06, + "loss": 0.588, + "step": 1173 + }, + { + "epoch": 1.313199105145414, + "grad_norm": 0.48152461647987366, + "learning_rate": 4.503788994555878e-06, + "loss": 0.5975, + "step": 1174 + }, + { + "epoch": 1.3143176733780761, + "grad_norm": 0.47018247842788696, + "learning_rate": 4.502896450108211e-06, + "loss": 0.5707, + "step": 1175 + }, + { + "epoch": 1.3154362416107381, + "grad_norm": 0.48568496108055115, + "learning_rate": 4.502003192271447e-06, + "loss": 0.5923, + "step": 1176 + }, + { + "epoch": 1.3165548098434003, + "grad_norm": 0.5025852918624878, + "learning_rate": 4.501109221363744e-06, + "loss": 0.6094, + "step": 1177 + }, + { + "epoch": 1.3176733780760626, + "grad_norm": 0.46192529797554016, + "learning_rate": 4.500214537703515e-06, + "loss": 0.5815, + "step": 1178 + }, + { + "epoch": 1.3187919463087248, + "grad_norm": 0.4682995080947876, + "learning_rate": 4.499319141609429e-06, + "loss": 0.5526, + "step": 1179 + }, + { + "epoch": 1.319910514541387, + "grad_norm": 0.4849640130996704, + "learning_rate": 4.498423033400408e-06, + "loss": 0.6011, + "step": 1180 + }, + { + "epoch": 1.3210290827740492, + "grad_norm": 0.4680434465408325, + "learning_rate": 4.4975262133956235e-06, + "loss": 0.5763, + "step": 1181 + }, + { + "epoch": 1.3221476510067114, + "grad_norm": 0.47362959384918213, + "learning_rate": 4.496628681914505e-06, + "loss": 0.5793, + "step": 1182 + }, + { + "epoch": 1.3232662192393736, + "grad_norm": 0.48274803161621094, + "learning_rate": 4.495730439276734e-06, + "loss": 0.5806, + "step": 1183 + }, + { + "epoch": 1.3243847874720358, + "grad_norm": 0.5004247426986694, + "learning_rate": 4.4948314858022456e-06, + "loss": 0.5982, + "step": 1184 + }, + { + "epoch": 1.325503355704698, + "grad_norm": 0.4769462049007416, + "learning_rate": 4.4939318218112284e-06, + "loss": 0.593, + "step": 1185 + }, + { + "epoch": 1.3266219239373602, + "grad_norm": 0.4818820357322693, + "learning_rate": 4.493031447624125e-06, + "loss": 0.5999, + "step": 1186 + }, + { + "epoch": 1.3277404921700224, + "grad_norm": 0.4788765609264374, + "learning_rate": 4.492130363561625e-06, + "loss": 0.5978, + "step": 1187 + }, + { + "epoch": 1.3288590604026846, + "grad_norm": 0.4671361446380615, + "learning_rate": 4.491228569944679e-06, + "loss": 0.5715, + "step": 1188 + }, + { + "epoch": 1.3299776286353469, + "grad_norm": 0.4770296514034271, + "learning_rate": 4.4903260670944855e-06, + "loss": 0.5913, + "step": 1189 + }, + { + "epoch": 1.331096196868009, + "grad_norm": 0.47733429074287415, + "learning_rate": 4.489422855332497e-06, + "loss": 0.5914, + "step": 1190 + }, + { + "epoch": 1.3322147651006713, + "grad_norm": 0.49386340379714966, + "learning_rate": 4.488518934980419e-06, + "loss": 0.5866, + "step": 1191 + }, + { + "epoch": 1.3333333333333333, + "grad_norm": 0.4807666838169098, + "learning_rate": 4.487614306360208e-06, + "loss": 0.5769, + "step": 1192 + }, + { + "epoch": 1.3344519015659955, + "grad_norm": 0.46346330642700195, + "learning_rate": 4.4867089697940735e-06, + "loss": 0.5889, + "step": 1193 + }, + { + "epoch": 1.3355704697986577, + "grad_norm": 0.46165117621421814, + "learning_rate": 4.485802925604476e-06, + "loss": 0.5638, + "step": 1194 + }, + { + "epoch": 1.3366890380313199, + "grad_norm": 0.4789906442165375, + "learning_rate": 4.484896174114132e-06, + "loss": 0.5984, + "step": 1195 + }, + { + "epoch": 1.337807606263982, + "grad_norm": 0.47423994541168213, + "learning_rate": 4.4839887156460045e-06, + "loss": 0.6043, + "step": 1196 + }, + { + "epoch": 1.3389261744966443, + "grad_norm": 0.45627936720848083, + "learning_rate": 4.4830805505233125e-06, + "loss": 0.5781, + "step": 1197 + }, + { + "epoch": 1.3400447427293065, + "grad_norm": 0.49102070927619934, + "learning_rate": 4.482171679069524e-06, + "loss": 0.5854, + "step": 1198 + }, + { + "epoch": 1.3411633109619687, + "grad_norm": 0.48892414569854736, + "learning_rate": 4.48126210160836e-06, + "loss": 0.6194, + "step": 1199 + }, + { + "epoch": 1.342281879194631, + "grad_norm": 0.481579065322876, + "learning_rate": 4.480351818463793e-06, + "loss": 0.5901, + "step": 1200 + }, + { + "epoch": 1.3434004474272931, + "grad_norm": 0.4813998341560364, + "learning_rate": 4.479440829960045e-06, + "loss": 0.6095, + "step": 1201 + }, + { + "epoch": 1.3445190156599551, + "grad_norm": 0.466458261013031, + "learning_rate": 4.478529136421593e-06, + "loss": 0.5481, + "step": 1202 + }, + { + "epoch": 1.3456375838926173, + "grad_norm": 0.45974788069725037, + "learning_rate": 4.477616738173162e-06, + "loss": 0.5832, + "step": 1203 + }, + { + "epoch": 1.3467561521252795, + "grad_norm": 0.4749172329902649, + "learning_rate": 4.476703635539728e-06, + "loss": 0.6021, + "step": 1204 + }, + { + "epoch": 1.3478747203579418, + "grad_norm": 0.47382864356040955, + "learning_rate": 4.475789828846519e-06, + "loss": 0.5936, + "step": 1205 + }, + { + "epoch": 1.348993288590604, + "grad_norm": 0.4810637831687927, + "learning_rate": 4.474875318419015e-06, + "loss": 0.5914, + "step": 1206 + }, + { + "epoch": 1.3501118568232662, + "grad_norm": 0.46880605816841125, + "learning_rate": 4.473960104582943e-06, + "loss": 0.569, + "step": 1207 + }, + { + "epoch": 1.3512304250559284, + "grad_norm": 0.4803348183631897, + "learning_rate": 4.473044187664284e-06, + "loss": 0.5942, + "step": 1208 + }, + { + "epoch": 1.3523489932885906, + "grad_norm": 0.4822969436645508, + "learning_rate": 4.472127567989268e-06, + "loss": 0.5893, + "step": 1209 + }, + { + "epoch": 1.3534675615212528, + "grad_norm": 0.5106329321861267, + "learning_rate": 4.4712102458843755e-06, + "loss": 0.5954, + "step": 1210 + }, + { + "epoch": 1.354586129753915, + "grad_norm": 0.4909040629863739, + "learning_rate": 4.470292221676336e-06, + "loss": 0.5968, + "step": 1211 + }, + { + "epoch": 1.3557046979865772, + "grad_norm": 0.48394322395324707, + "learning_rate": 4.469373495692132e-06, + "loss": 0.5516, + "step": 1212 + }, + { + "epoch": 1.3568232662192394, + "grad_norm": 0.46607136726379395, + "learning_rate": 4.4684540682589925e-06, + "loss": 0.5588, + "step": 1213 + }, + { + "epoch": 1.3579418344519016, + "grad_norm": 0.48817023634910583, + "learning_rate": 4.467533939704398e-06, + "loss": 0.5877, + "step": 1214 + }, + { + "epoch": 1.3590604026845639, + "grad_norm": 0.4760921895503998, + "learning_rate": 4.466613110356081e-06, + "loss": 0.5482, + "step": 1215 + }, + { + "epoch": 1.360178970917226, + "grad_norm": 0.49914559721946716, + "learning_rate": 4.465691580542019e-06, + "loss": 0.6012, + "step": 1216 + }, + { + "epoch": 1.3612975391498883, + "grad_norm": 0.4916679561138153, + "learning_rate": 4.464769350590441e-06, + "loss": 0.6107, + "step": 1217 + }, + { + "epoch": 1.3624161073825503, + "grad_norm": 0.47448763251304626, + "learning_rate": 4.463846420829828e-06, + "loss": 0.5555, + "step": 1218 + }, + { + "epoch": 1.3635346756152125, + "grad_norm": 0.48127514123916626, + "learning_rate": 4.462922791588906e-06, + "loss": 0.5826, + "step": 1219 + }, + { + "epoch": 1.3646532438478747, + "grad_norm": 0.4912678003311157, + "learning_rate": 4.461998463196653e-06, + "loss": 0.5727, + "step": 1220 + }, + { + "epoch": 1.3657718120805369, + "grad_norm": 0.4831150472164154, + "learning_rate": 4.461073435982295e-06, + "loss": 0.6161, + "step": 1221 + }, + { + "epoch": 1.366890380313199, + "grad_norm": 0.4749965965747833, + "learning_rate": 4.460147710275306e-06, + "loss": 0.6009, + "step": 1222 + }, + { + "epoch": 1.3680089485458613, + "grad_norm": 0.5110952854156494, + "learning_rate": 4.459221286405411e-06, + "loss": 0.6029, + "step": 1223 + }, + { + "epoch": 1.3691275167785235, + "grad_norm": 0.4717468321323395, + "learning_rate": 4.458294164702582e-06, + "loss": 0.5699, + "step": 1224 + }, + { + "epoch": 1.3702460850111857, + "grad_norm": 0.4701862633228302, + "learning_rate": 4.457366345497041e-06, + "loss": 0.5575, + "step": 1225 + }, + { + "epoch": 1.371364653243848, + "grad_norm": 0.47513946890830994, + "learning_rate": 4.456437829119256e-06, + "loss": 0.565, + "step": 1226 + }, + { + "epoch": 1.3724832214765101, + "grad_norm": 0.4824097752571106, + "learning_rate": 4.455508615899945e-06, + "loss": 0.5883, + "step": 1227 + }, + { + "epoch": 1.3736017897091721, + "grad_norm": 0.4734891355037689, + "learning_rate": 4.454578706170075e-06, + "loss": 0.5682, + "step": 1228 + }, + { + "epoch": 1.3747203579418343, + "grad_norm": 0.4862266182899475, + "learning_rate": 4.453648100260859e-06, + "loss": 0.5886, + "step": 1229 + }, + { + "epoch": 1.3758389261744965, + "grad_norm": 0.4855569005012512, + "learning_rate": 4.452716798503759e-06, + "loss": 0.5971, + "step": 1230 + }, + { + "epoch": 1.3769574944071588, + "grad_norm": 0.4638337194919586, + "learning_rate": 4.451784801230487e-06, + "loss": 0.578, + "step": 1231 + }, + { + "epoch": 1.378076062639821, + "grad_norm": 0.47969067096710205, + "learning_rate": 4.4508521087729975e-06, + "loss": 0.6131, + "step": 1232 + }, + { + "epoch": 1.3791946308724832, + "grad_norm": 0.45764845609664917, + "learning_rate": 4.449918721463497e-06, + "loss": 0.5439, + "step": 1233 + }, + { + "epoch": 1.3803131991051454, + "grad_norm": 0.4725312292575836, + "learning_rate": 4.448984639634439e-06, + "loss": 0.5774, + "step": 1234 + }, + { + "epoch": 1.3814317673378076, + "grad_norm": 0.4911689758300781, + "learning_rate": 4.448049863618522e-06, + "loss": 0.5891, + "step": 1235 + }, + { + "epoch": 1.3825503355704698, + "grad_norm": 0.47672075033187866, + "learning_rate": 4.447114393748694e-06, + "loss": 0.5706, + "step": 1236 + }, + { + "epoch": 1.383668903803132, + "grad_norm": 0.4902832806110382, + "learning_rate": 4.446178230358151e-06, + "loss": 0.6043, + "step": 1237 + }, + { + "epoch": 1.3847874720357942, + "grad_norm": 0.496720552444458, + "learning_rate": 4.445241373780333e-06, + "loss": 0.5831, + "step": 1238 + }, + { + "epoch": 1.3859060402684564, + "grad_norm": 0.4805045425891876, + "learning_rate": 4.444303824348927e-06, + "loss": 0.5769, + "step": 1239 + }, + { + "epoch": 1.3870246085011186, + "grad_norm": 0.48103493452072144, + "learning_rate": 4.44336558239787e-06, + "loss": 0.5778, + "step": 1240 + }, + { + "epoch": 1.3881431767337808, + "grad_norm": 0.4719166159629822, + "learning_rate": 4.4424266482613445e-06, + "loss": 0.5666, + "step": 1241 + }, + { + "epoch": 1.389261744966443, + "grad_norm": 0.4719798266887665, + "learning_rate": 4.4414870222737775e-06, + "loss": 0.5579, + "step": 1242 + }, + { + "epoch": 1.3903803131991053, + "grad_norm": 0.49560073018074036, + "learning_rate": 4.440546704769845e-06, + "loss": 0.5858, + "step": 1243 + }, + { + "epoch": 1.3914988814317675, + "grad_norm": 0.4905194044113159, + "learning_rate": 4.439605696084465e-06, + "loss": 0.5747, + "step": 1244 + }, + { + "epoch": 1.3926174496644295, + "grad_norm": 0.502269446849823, + "learning_rate": 4.438663996552809e-06, + "loss": 0.5965, + "step": 1245 + }, + { + "epoch": 1.3937360178970917, + "grad_norm": 0.47396889328956604, + "learning_rate": 4.437721606510288e-06, + "loss": 0.6069, + "step": 1246 + }, + { + "epoch": 1.3948545861297539, + "grad_norm": 0.4756862223148346, + "learning_rate": 4.436778526292561e-06, + "loss": 0.5563, + "step": 1247 + }, + { + "epoch": 1.395973154362416, + "grad_norm": 0.47935912013053894, + "learning_rate": 4.435834756235534e-06, + "loss": 0.5811, + "step": 1248 + }, + { + "epoch": 1.3970917225950783, + "grad_norm": 0.47267550230026245, + "learning_rate": 4.434890296675358e-06, + "loss": 0.5477, + "step": 1249 + }, + { + "epoch": 1.3982102908277405, + "grad_norm": 0.4694240391254425, + "learning_rate": 4.433945147948428e-06, + "loss": 0.5586, + "step": 1250 + }, + { + "epoch": 1.3993288590604027, + "grad_norm": 0.47105926275253296, + "learning_rate": 4.432999310391387e-06, + "loss": 0.5684, + "step": 1251 + }, + { + "epoch": 1.400447427293065, + "grad_norm": 0.4700409173965454, + "learning_rate": 4.432052784341122e-06, + "loss": 0.5748, + "step": 1252 + }, + { + "epoch": 1.4015659955257271, + "grad_norm": 0.47937822341918945, + "learning_rate": 4.431105570134766e-06, + "loss": 0.5638, + "step": 1253 + }, + { + "epoch": 1.4026845637583891, + "grad_norm": 0.47045087814331055, + "learning_rate": 4.4301576681096956e-06, + "loss": 0.5597, + "step": 1254 + }, + { + "epoch": 1.4038031319910513, + "grad_norm": 0.48519641160964966, + "learning_rate": 4.429209078603534e-06, + "loss": 0.555, + "step": 1255 + }, + { + "epoch": 1.4049217002237135, + "grad_norm": 0.4835051894187927, + "learning_rate": 4.428259801954148e-06, + "loss": 0.6021, + "step": 1256 + }, + { + "epoch": 1.4060402684563758, + "grad_norm": 0.49831488728523254, + "learning_rate": 4.427309838499651e-06, + "loss": 0.5739, + "step": 1257 + }, + { + "epoch": 1.407158836689038, + "grad_norm": 0.5223401784896851, + "learning_rate": 4.4263591885783976e-06, + "loss": 0.5881, + "step": 1258 + }, + { + "epoch": 1.4082774049217002, + "grad_norm": 0.47920864820480347, + "learning_rate": 4.425407852528991e-06, + "loss": 0.5981, + "step": 1259 + }, + { + "epoch": 1.4093959731543624, + "grad_norm": 0.5153250694274902, + "learning_rate": 4.424455830690275e-06, + "loss": 0.5715, + "step": 1260 + }, + { + "epoch": 1.4105145413870246, + "grad_norm": 0.48909991979599, + "learning_rate": 4.4235031234013425e-06, + "loss": 0.602, + "step": 1261 + }, + { + "epoch": 1.4116331096196868, + "grad_norm": 0.4935949444770813, + "learning_rate": 4.422549731001524e-06, + "loss": 0.6008, + "step": 1262 + }, + { + "epoch": 1.412751677852349, + "grad_norm": 0.49626025557518005, + "learning_rate": 4.421595653830401e-06, + "loss": 0.6158, + "step": 1263 + }, + { + "epoch": 1.4138702460850112, + "grad_norm": 0.4757371246814728, + "learning_rate": 4.420640892227793e-06, + "loss": 0.573, + "step": 1264 + }, + { + "epoch": 1.4149888143176734, + "grad_norm": 0.4638062119483948, + "learning_rate": 4.4196854465337664e-06, + "loss": 0.5808, + "step": 1265 + }, + { + "epoch": 1.4161073825503356, + "grad_norm": 0.4798492193222046, + "learning_rate": 4.418729317088631e-06, + "loss": 0.5706, + "step": 1266 + }, + { + "epoch": 1.4172259507829978, + "grad_norm": 0.4708186388015747, + "learning_rate": 4.41777250423294e-06, + "loss": 0.576, + "step": 1267 + }, + { + "epoch": 1.41834451901566, + "grad_norm": 0.49962857365608215, + "learning_rate": 4.416815008307488e-06, + "loss": 0.596, + "step": 1268 + }, + { + "epoch": 1.4194630872483223, + "grad_norm": 0.4764310121536255, + "learning_rate": 4.415856829653318e-06, + "loss": 0.5903, + "step": 1269 + }, + { + "epoch": 1.4205816554809845, + "grad_norm": 0.4788293242454529, + "learning_rate": 4.4148979686117095e-06, + "loss": 0.5803, + "step": 1270 + }, + { + "epoch": 1.4217002237136465, + "grad_norm": 0.45909953117370605, + "learning_rate": 4.41393842552419e-06, + "loss": 0.5782, + "step": 1271 + }, + { + "epoch": 1.4228187919463087, + "grad_norm": 0.4760737717151642, + "learning_rate": 4.412978200732528e-06, + "loss": 0.6256, + "step": 1272 + }, + { + "epoch": 1.4239373601789709, + "grad_norm": 0.485414981842041, + "learning_rate": 4.412017294578737e-06, + "loss": 0.5895, + "step": 1273 + }, + { + "epoch": 1.425055928411633, + "grad_norm": 0.4804149568080902, + "learning_rate": 4.411055707405068e-06, + "loss": 0.5755, + "step": 1274 + }, + { + "epoch": 1.4261744966442953, + "grad_norm": 0.4825088679790497, + "learning_rate": 4.410093439554019e-06, + "loss": 0.5884, + "step": 1275 + }, + { + "epoch": 1.4272930648769575, + "grad_norm": 0.49632760882377625, + "learning_rate": 4.409130491368331e-06, + "loss": 0.6117, + "step": 1276 + }, + { + "epoch": 1.4284116331096197, + "grad_norm": 0.4739936888217926, + "learning_rate": 4.408166863190983e-06, + "loss": 0.5696, + "step": 1277 + }, + { + "epoch": 1.429530201342282, + "grad_norm": 0.5045065879821777, + "learning_rate": 4.407202555365202e-06, + "loss": 0.6204, + "step": 1278 + }, + { + "epoch": 1.4306487695749441, + "grad_norm": 0.4909072816371918, + "learning_rate": 4.406237568234451e-06, + "loss": 0.5891, + "step": 1279 + }, + { + "epoch": 1.4317673378076063, + "grad_norm": 0.4925367534160614, + "learning_rate": 4.4052719021424395e-06, + "loss": 0.5963, + "step": 1280 + }, + { + "epoch": 1.4328859060402683, + "grad_norm": 0.4968665838241577, + "learning_rate": 4.404305557433118e-06, + "loss": 0.5971, + "step": 1281 + }, + { + "epoch": 1.4340044742729305, + "grad_norm": 0.49153050780296326, + "learning_rate": 4.403338534450675e-06, + "loss": 0.6104, + "step": 1282 + }, + { + "epoch": 1.4351230425055927, + "grad_norm": 0.47348082065582275, + "learning_rate": 4.4023708335395455e-06, + "loss": 0.5746, + "step": 1283 + }, + { + "epoch": 1.436241610738255, + "grad_norm": 0.47999560832977295, + "learning_rate": 4.401402455044405e-06, + "loss": 0.6035, + "step": 1284 + }, + { + "epoch": 1.4373601789709172, + "grad_norm": 0.4812726080417633, + "learning_rate": 4.4004333993101665e-06, + "loss": 0.5946, + "step": 1285 + }, + { + "epoch": 1.4384787472035794, + "grad_norm": 0.4776507318019867, + "learning_rate": 4.39946366668199e-06, + "loss": 0.6048, + "step": 1286 + }, + { + "epoch": 1.4395973154362416, + "grad_norm": 0.4812582731246948, + "learning_rate": 4.398493257505271e-06, + "loss": 0.5949, + "step": 1287 + }, + { + "epoch": 1.4407158836689038, + "grad_norm": 0.4620709717273712, + "learning_rate": 4.397522172125651e-06, + "loss": 0.5566, + "step": 1288 + }, + { + "epoch": 1.441834451901566, + "grad_norm": 0.47055062651634216, + "learning_rate": 4.3965504108890075e-06, + "loss": 0.5507, + "step": 1289 + }, + { + "epoch": 1.4429530201342282, + "grad_norm": 0.4810755252838135, + "learning_rate": 4.395577974141464e-06, + "loss": 0.5789, + "step": 1290 + }, + { + "epoch": 1.4440715883668904, + "grad_norm": 0.4749661982059479, + "learning_rate": 4.394604862229379e-06, + "loss": 0.5686, + "step": 1291 + }, + { + "epoch": 1.4451901565995526, + "grad_norm": 0.4877249598503113, + "learning_rate": 4.393631075499356e-06, + "loss": 0.5726, + "step": 1292 + }, + { + "epoch": 1.4463087248322148, + "grad_norm": 0.4861728250980377, + "learning_rate": 4.392656614298236e-06, + "loss": 0.6094, + "step": 1293 + }, + { + "epoch": 1.447427293064877, + "grad_norm": 0.4913595914840698, + "learning_rate": 4.391681478973103e-06, + "loss": 0.5971, + "step": 1294 + }, + { + "epoch": 1.4485458612975393, + "grad_norm": 0.4858112037181854, + "learning_rate": 4.390705669871278e-06, + "loss": 0.5675, + "step": 1295 + }, + { + "epoch": 1.4496644295302015, + "grad_norm": 0.5002705454826355, + "learning_rate": 4.389729187340323e-06, + "loss": 0.5944, + "step": 1296 + }, + { + "epoch": 1.4507829977628635, + "grad_norm": 0.4777258038520813, + "learning_rate": 4.388752031728042e-06, + "loss": 0.6017, + "step": 1297 + }, + { + "epoch": 1.4519015659955257, + "grad_norm": 0.4859464466571808, + "learning_rate": 4.387774203382476e-06, + "loss": 0.5914, + "step": 1298 + }, + { + "epoch": 1.4530201342281879, + "grad_norm": 0.4924688935279846, + "learning_rate": 4.386795702651907e-06, + "loss": 0.5778, + "step": 1299 + }, + { + "epoch": 1.45413870246085, + "grad_norm": 0.4761926531791687, + "learning_rate": 4.385816529884856e-06, + "loss": 0.5977, + "step": 1300 + }, + { + "epoch": 1.4552572706935123, + "grad_norm": 0.4806959331035614, + "learning_rate": 4.3848366854300834e-06, + "loss": 0.5986, + "step": 1301 + }, + { + "epoch": 1.4563758389261745, + "grad_norm": 0.4975976049900055, + "learning_rate": 4.383856169636589e-06, + "loss": 0.5788, + "step": 1302 + }, + { + "epoch": 1.4574944071588367, + "grad_norm": 0.4763396382331848, + "learning_rate": 4.382874982853611e-06, + "loss": 0.5679, + "step": 1303 + }, + { + "epoch": 1.458612975391499, + "grad_norm": 0.47432610392570496, + "learning_rate": 4.381893125430629e-06, + "loss": 0.566, + "step": 1304 + }, + { + "epoch": 1.4597315436241611, + "grad_norm": 0.4684997498989105, + "learning_rate": 4.380910597717357e-06, + "loss": 0.578, + "step": 1305 + }, + { + "epoch": 1.4608501118568233, + "grad_norm": 0.4823257327079773, + "learning_rate": 4.379927400063754e-06, + "loss": 0.5848, + "step": 1306 + }, + { + "epoch": 1.4619686800894853, + "grad_norm": 0.47135570645332336, + "learning_rate": 4.378943532820011e-06, + "loss": 0.5781, + "step": 1307 + }, + { + "epoch": 1.4630872483221475, + "grad_norm": 0.4889175593852997, + "learning_rate": 4.377958996336563e-06, + "loss": 0.5307, + "step": 1308 + }, + { + "epoch": 1.4642058165548097, + "grad_norm": 0.49853044748306274, + "learning_rate": 4.376973790964078e-06, + "loss": 0.5791, + "step": 1309 + }, + { + "epoch": 1.465324384787472, + "grad_norm": 0.47457244992256165, + "learning_rate": 4.375987917053468e-06, + "loss": 0.5552, + "step": 1310 + }, + { + "epoch": 1.4664429530201342, + "grad_norm": 0.4792843163013458, + "learning_rate": 4.37500137495588e-06, + "loss": 0.5753, + "step": 1311 + }, + { + "epoch": 1.4675615212527964, + "grad_norm": 0.4924052059650421, + "learning_rate": 4.3740141650226975e-06, + "loss": 0.6032, + "step": 1312 + }, + { + "epoch": 1.4686800894854586, + "grad_norm": 0.485517680644989, + "learning_rate": 4.373026287605545e-06, + "loss": 0.5956, + "step": 1313 + }, + { + "epoch": 1.4697986577181208, + "grad_norm": 0.5009534358978271, + "learning_rate": 4.372037743056283e-06, + "loss": 0.587, + "step": 1314 + }, + { + "epoch": 1.470917225950783, + "grad_norm": 0.5067222118377686, + "learning_rate": 4.371048531727009e-06, + "loss": 0.5663, + "step": 1315 + }, + { + "epoch": 1.4720357941834452, + "grad_norm": 0.48260119557380676, + "learning_rate": 4.370058653970062e-06, + "loss": 0.5736, + "step": 1316 + }, + { + "epoch": 1.4731543624161074, + "grad_norm": 0.4996930658817291, + "learning_rate": 4.369068110138013e-06, + "loss": 0.5796, + "step": 1317 + }, + { + "epoch": 1.4742729306487696, + "grad_norm": 0.501705527305603, + "learning_rate": 4.368076900583673e-06, + "loss": 0.6125, + "step": 1318 + }, + { + "epoch": 1.4753914988814318, + "grad_norm": 0.4803929328918457, + "learning_rate": 4.36708502566009e-06, + "loss": 0.5737, + "step": 1319 + }, + { + "epoch": 1.476510067114094, + "grad_norm": 0.4732499122619629, + "learning_rate": 4.366092485720549e-06, + "loss": 0.597, + "step": 1320 + }, + { + "epoch": 1.4776286353467563, + "grad_norm": 0.4887276589870453, + "learning_rate": 4.365099281118571e-06, + "loss": 0.5893, + "step": 1321 + }, + { + "epoch": 1.4787472035794185, + "grad_norm": 0.48169007897377014, + "learning_rate": 4.364105412207914e-06, + "loss": 0.6035, + "step": 1322 + }, + { + "epoch": 1.4798657718120805, + "grad_norm": 0.4954248070716858, + "learning_rate": 4.363110879342575e-06, + "loss": 0.6149, + "step": 1323 + }, + { + "epoch": 1.4809843400447427, + "grad_norm": 0.47244513034820557, + "learning_rate": 4.362115682876783e-06, + "loss": 0.5794, + "step": 1324 + }, + { + "epoch": 1.4821029082774049, + "grad_norm": 0.48339948058128357, + "learning_rate": 4.361119823165007e-06, + "loss": 0.5821, + "step": 1325 + }, + { + "epoch": 1.483221476510067, + "grad_norm": 0.49129050970077515, + "learning_rate": 4.3601233005619515e-06, + "loss": 0.5851, + "step": 1326 + }, + { + "epoch": 1.4843400447427293, + "grad_norm": 0.4731924533843994, + "learning_rate": 4.359126115422555e-06, + "loss": 0.55, + "step": 1327 + }, + { + "epoch": 1.4854586129753915, + "grad_norm": 0.4816952347755432, + "learning_rate": 4.358128268101996e-06, + "loss": 0.5781, + "step": 1328 + }, + { + "epoch": 1.4865771812080537, + "grad_norm": 0.4917982518672943, + "learning_rate": 4.357129758955685e-06, + "loss": 0.6017, + "step": 1329 + }, + { + "epoch": 1.487695749440716, + "grad_norm": 0.5033697485923767, + "learning_rate": 4.356130588339269e-06, + "loss": 0.5783, + "step": 1330 + }, + { + "epoch": 1.4888143176733781, + "grad_norm": 0.48702743649482727, + "learning_rate": 4.355130756608632e-06, + "loss": 0.5757, + "step": 1331 + }, + { + "epoch": 1.4899328859060403, + "grad_norm": 0.49924343824386597, + "learning_rate": 4.354130264119894e-06, + "loss": 0.5985, + "step": 1332 + }, + { + "epoch": 1.4910514541387023, + "grad_norm": 0.4950959384441376, + "learning_rate": 4.353129111229408e-06, + "loss": 0.5688, + "step": 1333 + }, + { + "epoch": 1.4921700223713645, + "grad_norm": 0.47743096947669983, + "learning_rate": 4.352127298293764e-06, + "loss": 0.5655, + "step": 1334 + }, + { + "epoch": 1.4932885906040267, + "grad_norm": 0.4949822723865509, + "learning_rate": 4.351124825669785e-06, + "loss": 0.5915, + "step": 1335 + }, + { + "epoch": 1.494407158836689, + "grad_norm": 0.48831161856651306, + "learning_rate": 4.350121693714531e-06, + "loss": 0.5983, + "step": 1336 + }, + { + "epoch": 1.4955257270693512, + "grad_norm": 0.5018520355224609, + "learning_rate": 4.349117902785297e-06, + "loss": 0.5872, + "step": 1337 + }, + { + "epoch": 1.4966442953020134, + "grad_norm": 0.49204087257385254, + "learning_rate": 4.3481134532396116e-06, + "loss": 0.5895, + "step": 1338 + }, + { + "epoch": 1.4977628635346756, + "grad_norm": 0.47245603799819946, + "learning_rate": 4.347108345435238e-06, + "loss": 0.5585, + "step": 1339 + }, + { + "epoch": 1.4988814317673378, + "grad_norm": 0.4722137749195099, + "learning_rate": 4.3461025797301745e-06, + "loss": 0.5454, + "step": 1340 + }, + { + "epoch": 1.5, + "grad_norm": 0.4701415002346039, + "learning_rate": 4.345096156482655e-06, + "loss": 0.565, + "step": 1341 + }, + { + "epoch": 1.5011185682326622, + "grad_norm": 0.4957852065563202, + "learning_rate": 4.344089076051143e-06, + "loss": 0.6197, + "step": 1342 + }, + { + "epoch": 1.5022371364653244, + "grad_norm": 0.4819275140762329, + "learning_rate": 4.3430813387943405e-06, + "loss": 0.5635, + "step": 1343 + }, + { + "epoch": 1.5033557046979866, + "grad_norm": 0.5286908149719238, + "learning_rate": 4.342072945071183e-06, + "loss": 0.6067, + "step": 1344 + }, + { + "epoch": 1.5044742729306488, + "grad_norm": 0.4800012409687042, + "learning_rate": 4.3410638952408375e-06, + "loss": 0.58, + "step": 1345 + }, + { + "epoch": 1.505592841163311, + "grad_norm": 0.4997807443141937, + "learning_rate": 4.340054189662707e-06, + "loss": 0.5966, + "step": 1346 + }, + { + "epoch": 1.5067114093959733, + "grad_norm": 0.4921199083328247, + "learning_rate": 4.339043828696427e-06, + "loss": 0.5876, + "step": 1347 + }, + { + "epoch": 1.5078299776286355, + "grad_norm": 0.47939038276672363, + "learning_rate": 4.3380328127018666e-06, + "loss": 0.5758, + "step": 1348 + }, + { + "epoch": 1.5089485458612977, + "grad_norm": 0.5145376324653625, + "learning_rate": 4.337021142039127e-06, + "loss": 0.5815, + "step": 1349 + }, + { + "epoch": 1.5100671140939599, + "grad_norm": 0.4857310652732849, + "learning_rate": 4.336008817068546e-06, + "loss": 0.5686, + "step": 1350 + }, + { + "epoch": 1.5111856823266219, + "grad_norm": 0.4951683580875397, + "learning_rate": 4.33499583815069e-06, + "loss": 0.5872, + "step": 1351 + }, + { + "epoch": 1.512304250559284, + "grad_norm": 0.5085970163345337, + "learning_rate": 4.3339822056463624e-06, + "loss": 0.605, + "step": 1352 + }, + { + "epoch": 1.5134228187919463, + "grad_norm": 0.5023934841156006, + "learning_rate": 4.332967919916596e-06, + "loss": 0.5969, + "step": 1353 + }, + { + "epoch": 1.5145413870246085, + "grad_norm": 0.5029785633087158, + "learning_rate": 4.331952981322658e-06, + "loss": 0.5963, + "step": 1354 + }, + { + "epoch": 1.5156599552572707, + "grad_norm": 0.49423643946647644, + "learning_rate": 4.330937390226049e-06, + "loss": 0.5591, + "step": 1355 + }, + { + "epoch": 1.516778523489933, + "grad_norm": 0.48507627844810486, + "learning_rate": 4.3299211469885e-06, + "loss": 0.5816, + "step": 1356 + }, + { + "epoch": 1.5178970917225951, + "grad_norm": 0.496153861284256, + "learning_rate": 4.328904251971976e-06, + "loss": 0.5784, + "step": 1357 + }, + { + "epoch": 1.5190156599552571, + "grad_norm": 0.4916113018989563, + "learning_rate": 4.327886705538672e-06, + "loss": 0.5666, + "step": 1358 + }, + { + "epoch": 1.5201342281879193, + "grad_norm": 0.49055805802345276, + "learning_rate": 4.326868508051018e-06, + "loss": 0.5975, + "step": 1359 + }, + { + "epoch": 1.5212527964205815, + "grad_norm": 0.47937023639678955, + "learning_rate": 4.325849659871674e-06, + "loss": 0.5716, + "step": 1360 + }, + { + "epoch": 1.5223713646532437, + "grad_norm": 0.49050694704055786, + "learning_rate": 4.3248301613635306e-06, + "loss": 0.584, + "step": 1361 + }, + { + "epoch": 1.523489932885906, + "grad_norm": 0.4832146167755127, + "learning_rate": 4.323810012889713e-06, + "loss": 0.579, + "step": 1362 + }, + { + "epoch": 1.5246085011185682, + "grad_norm": 0.5009722709655762, + "learning_rate": 4.3227892148135755e-06, + "loss": 0.6014, + "step": 1363 + }, + { + "epoch": 1.5257270693512304, + "grad_norm": 0.4904753267765045, + "learning_rate": 4.321767767498705e-06, + "loss": 0.6009, + "step": 1364 + }, + { + "epoch": 1.5268456375838926, + "grad_norm": 0.5005730986595154, + "learning_rate": 4.32074567130892e-06, + "loss": 0.5975, + "step": 1365 + }, + { + "epoch": 1.5279642058165548, + "grad_norm": 0.4825516939163208, + "learning_rate": 4.319722926608268e-06, + "loss": 0.5726, + "step": 1366 + }, + { + "epoch": 1.529082774049217, + "grad_norm": 0.5070571899414062, + "learning_rate": 4.31869953376103e-06, + "loss": 0.6022, + "step": 1367 + }, + { + "epoch": 1.5302013422818792, + "grad_norm": 0.5236569046974182, + "learning_rate": 4.3176754931317154e-06, + "loss": 0.5973, + "step": 1368 + }, + { + "epoch": 1.5313199105145414, + "grad_norm": 0.49975070357322693, + "learning_rate": 4.316650805085068e-06, + "loss": 0.5881, + "step": 1369 + }, + { + "epoch": 1.5324384787472036, + "grad_norm": 0.49533969163894653, + "learning_rate": 4.315625469986058e-06, + "loss": 0.5748, + "step": 1370 + }, + { + "epoch": 1.5335570469798658, + "grad_norm": 0.4943258762359619, + "learning_rate": 4.314599488199889e-06, + "loss": 0.5832, + "step": 1371 + }, + { + "epoch": 1.534675615212528, + "grad_norm": 0.48157885670661926, + "learning_rate": 4.313572860091993e-06, + "loss": 0.5586, + "step": 1372 + }, + { + "epoch": 1.5357941834451903, + "grad_norm": 0.5046592354774475, + "learning_rate": 4.312545586028033e-06, + "loss": 0.5777, + "step": 1373 + }, + { + "epoch": 1.5369127516778525, + "grad_norm": 0.5014451146125793, + "learning_rate": 4.311517666373902e-06, + "loss": 0.5875, + "step": 1374 + }, + { + "epoch": 1.5380313199105147, + "grad_norm": 0.5050190687179565, + "learning_rate": 4.310489101495725e-06, + "loss": 0.6041, + "step": 1375 + }, + { + "epoch": 1.5391498881431769, + "grad_norm": 0.49115487933158875, + "learning_rate": 4.309459891759853e-06, + "loss": 0.5888, + "step": 1376 + }, + { + "epoch": 1.540268456375839, + "grad_norm": 0.5117396712303162, + "learning_rate": 4.308430037532871e-06, + "loss": 0.5689, + "step": 1377 + }, + { + "epoch": 1.541387024608501, + "grad_norm": 0.4883919656276703, + "learning_rate": 4.307399539181587e-06, + "loss": 0.5982, + "step": 1378 + }, + { + "epoch": 1.5425055928411633, + "grad_norm": 0.48665109276771545, + "learning_rate": 4.306368397073046e-06, + "loss": 0.5899, + "step": 1379 + }, + { + "epoch": 1.5436241610738255, + "grad_norm": 0.4853532910346985, + "learning_rate": 4.305336611574518e-06, + "loss": 0.5695, + "step": 1380 + }, + { + "epoch": 1.5447427293064877, + "grad_norm": 0.4923276901245117, + "learning_rate": 4.304304183053502e-06, + "loss": 0.5831, + "step": 1381 + }, + { + "epoch": 1.54586129753915, + "grad_norm": 0.5048404335975647, + "learning_rate": 4.303271111877729e-06, + "loss": 0.6238, + "step": 1382 + }, + { + "epoch": 1.5469798657718121, + "grad_norm": 0.49105462431907654, + "learning_rate": 4.302237398415156e-06, + "loss": 0.5976, + "step": 1383 + }, + { + "epoch": 1.548098434004474, + "grad_norm": 0.4881378412246704, + "learning_rate": 4.301203043033969e-06, + "loss": 0.5739, + "step": 1384 + }, + { + "epoch": 1.5492170022371363, + "grad_norm": 0.48858603835105896, + "learning_rate": 4.3001680461025844e-06, + "loss": 0.5792, + "step": 1385 + }, + { + "epoch": 1.5503355704697985, + "grad_norm": 0.48439061641693115, + "learning_rate": 4.299132407989646e-06, + "loss": 0.5742, + "step": 1386 + }, + { + "epoch": 1.5514541387024607, + "grad_norm": 0.47701379656791687, + "learning_rate": 4.298096129064026e-06, + "loss": 0.5844, + "step": 1387 + }, + { + "epoch": 1.552572706935123, + "grad_norm": 0.49393609166145325, + "learning_rate": 4.297059209694824e-06, + "loss": 0.5886, + "step": 1388 + }, + { + "epoch": 1.5536912751677852, + "grad_norm": 0.5089290738105774, + "learning_rate": 4.296021650251369e-06, + "loss": 0.5856, + "step": 1389 + }, + { + "epoch": 1.5548098434004474, + "grad_norm": 0.509134829044342, + "learning_rate": 4.294983451103219e-06, + "loss": 0.5939, + "step": 1390 + }, + { + "epoch": 1.5559284116331096, + "grad_norm": 0.5126070976257324, + "learning_rate": 4.293944612620157e-06, + "loss": 0.5692, + "step": 1391 + }, + { + "epoch": 1.5570469798657718, + "grad_norm": 0.5223609805107117, + "learning_rate": 4.2929051351721956e-06, + "loss": 0.599, + "step": 1392 + }, + { + "epoch": 1.558165548098434, + "grad_norm": 0.48266369104385376, + "learning_rate": 4.291865019129575e-06, + "loss": 0.5827, + "step": 1393 + }, + { + "epoch": 1.5592841163310962, + "grad_norm": 0.4978017210960388, + "learning_rate": 4.290824264862761e-06, + "loss": 0.5994, + "step": 1394 + }, + { + "epoch": 1.5604026845637584, + "grad_norm": 0.5075487494468689, + "learning_rate": 4.2897828727424495e-06, + "loss": 0.5641, + "step": 1395 + }, + { + "epoch": 1.5615212527964206, + "grad_norm": 0.5072858929634094, + "learning_rate": 4.2887408431395615e-06, + "loss": 0.5751, + "step": 1396 + }, + { + "epoch": 1.5626398210290828, + "grad_norm": 0.513059139251709, + "learning_rate": 4.287698176425246e-06, + "loss": 0.5909, + "step": 1397 + }, + { + "epoch": 1.563758389261745, + "grad_norm": 0.49780142307281494, + "learning_rate": 4.286654872970879e-06, + "loss": 0.5928, + "step": 1398 + }, + { + "epoch": 1.5648769574944073, + "grad_norm": 0.4992021918296814, + "learning_rate": 4.285610933148062e-06, + "loss": 0.583, + "step": 1399 + }, + { + "epoch": 1.5659955257270695, + "grad_norm": 0.4856356978416443, + "learning_rate": 4.284566357328625e-06, + "loss": 0.6213, + "step": 1400 + }, + { + "epoch": 1.5671140939597317, + "grad_norm": 0.48052698373794556, + "learning_rate": 4.283521145884625e-06, + "loss": 0.5753, + "step": 1401 + }, + { + "epoch": 1.5682326621923939, + "grad_norm": 0.4759695827960968, + "learning_rate": 4.2824752991883415e-06, + "loss": 0.5764, + "step": 1402 + }, + { + "epoch": 1.569351230425056, + "grad_norm": 0.49083444476127625, + "learning_rate": 4.2814288176122846e-06, + "loss": 0.5839, + "step": 1403 + }, + { + "epoch": 1.570469798657718, + "grad_norm": 0.4828498661518097, + "learning_rate": 4.280381701529187e-06, + "loss": 0.5734, + "step": 1404 + }, + { + "epoch": 1.5715883668903803, + "grad_norm": 0.4739566147327423, + "learning_rate": 4.27933395131201e-06, + "loss": 0.5958, + "step": 1405 + }, + { + "epoch": 1.5727069351230425, + "grad_norm": 0.4890132546424866, + "learning_rate": 4.278285567333942e-06, + "loss": 0.556, + "step": 1406 + }, + { + "epoch": 1.5738255033557047, + "grad_norm": 0.49774521589279175, + "learning_rate": 4.277236549968392e-06, + "loss": 0.5779, + "step": 1407 + }, + { + "epoch": 1.574944071588367, + "grad_norm": 0.5137577652931213, + "learning_rate": 4.276186899588999e-06, + "loss": 0.5947, + "step": 1408 + }, + { + "epoch": 1.5760626398210291, + "grad_norm": 0.5089177489280701, + "learning_rate": 4.275136616569626e-06, + "loss": 0.5825, + "step": 1409 + }, + { + "epoch": 1.5771812080536913, + "grad_norm": 0.49170440435409546, + "learning_rate": 4.2740857012843625e-06, + "loss": 0.5887, + "step": 1410 + }, + { + "epoch": 1.5782997762863533, + "grad_norm": 0.48788514733314514, + "learning_rate": 4.27303415410752e-06, + "loss": 0.565, + "step": 1411 + }, + { + "epoch": 1.5794183445190155, + "grad_norm": 0.4836443364620209, + "learning_rate": 4.2719819754136395e-06, + "loss": 0.5679, + "step": 1412 + }, + { + "epoch": 1.5805369127516777, + "grad_norm": 0.4887832999229431, + "learning_rate": 4.270929165577483e-06, + "loss": 0.5954, + "step": 1413 + }, + { + "epoch": 1.58165548098434, + "grad_norm": 0.5417852401733398, + "learning_rate": 4.26987572497404e-06, + "loss": 0.5747, + "step": 1414 + }, + { + "epoch": 1.5827740492170022, + "grad_norm": 0.4931381642818451, + "learning_rate": 4.268821653978522e-06, + "loss": 0.5964, + "step": 1415 + }, + { + "epoch": 1.5838926174496644, + "grad_norm": 0.48656824231147766, + "learning_rate": 4.267766952966369e-06, + "loss": 0.5919, + "step": 1416 + }, + { + "epoch": 1.5850111856823266, + "grad_norm": 0.5253188610076904, + "learning_rate": 4.266711622313242e-06, + "loss": 0.5917, + "step": 1417 + }, + { + "epoch": 1.5861297539149888, + "grad_norm": 0.5214728713035583, + "learning_rate": 4.2656556623950265e-06, + "loss": 0.5545, + "step": 1418 + }, + { + "epoch": 1.587248322147651, + "grad_norm": 0.4981340169906616, + "learning_rate": 4.264599073587834e-06, + "loss": 0.5737, + "step": 1419 + }, + { + "epoch": 1.5883668903803132, + "grad_norm": 0.49596238136291504, + "learning_rate": 4.263541856267999e-06, + "loss": 0.5841, + "step": 1420 + }, + { + "epoch": 1.5894854586129754, + "grad_norm": 0.47956663370132446, + "learning_rate": 4.262484010812079e-06, + "loss": 0.5566, + "step": 1421 + }, + { + "epoch": 1.5906040268456376, + "grad_norm": 0.4902280569076538, + "learning_rate": 4.261425537596857e-06, + "loss": 0.5907, + "step": 1422 + }, + { + "epoch": 1.5917225950782998, + "grad_norm": 0.5098991990089417, + "learning_rate": 4.260366436999338e-06, + "loss": 0.5997, + "step": 1423 + }, + { + "epoch": 1.592841163310962, + "grad_norm": 0.48146405816078186, + "learning_rate": 4.259306709396751e-06, + "loss": 0.5693, + "step": 1424 + }, + { + "epoch": 1.5939597315436242, + "grad_norm": 0.4839426577091217, + "learning_rate": 4.258246355166548e-06, + "loss": 0.5734, + "step": 1425 + }, + { + "epoch": 1.5950782997762865, + "grad_norm": 0.4717272222042084, + "learning_rate": 4.257185374686405e-06, + "loss": 0.5716, + "step": 1426 + }, + { + "epoch": 1.5961968680089487, + "grad_norm": 0.4848939776420593, + "learning_rate": 4.256123768334223e-06, + "loss": 0.5592, + "step": 1427 + }, + { + "epoch": 1.5973154362416109, + "grad_norm": 0.505172073841095, + "learning_rate": 4.25506153648812e-06, + "loss": 0.5822, + "step": 1428 + }, + { + "epoch": 1.598434004474273, + "grad_norm": 0.5064011216163635, + "learning_rate": 4.253998679526442e-06, + "loss": 0.583, + "step": 1429 + }, + { + "epoch": 1.599552572706935, + "grad_norm": 0.5151503086090088, + "learning_rate": 4.252935197827756e-06, + "loss": 0.6022, + "step": 1430 + }, + { + "epoch": 1.6006711409395973, + "grad_norm": 0.48397284746170044, + "learning_rate": 4.251871091770852e-06, + "loss": 0.5859, + "step": 1431 + }, + { + "epoch": 1.6017897091722595, + "grad_norm": 0.48410117626190186, + "learning_rate": 4.2508063617347415e-06, + "loss": 0.5724, + "step": 1432 + }, + { + "epoch": 1.6029082774049217, + "grad_norm": 0.4976502060890198, + "learning_rate": 4.249741008098658e-06, + "loss": 0.5968, + "step": 1433 + }, + { + "epoch": 1.604026845637584, + "grad_norm": 0.5036817193031311, + "learning_rate": 4.2486750312420585e-06, + "loss": 0.5922, + "step": 1434 + }, + { + "epoch": 1.6051454138702461, + "grad_norm": 0.5002356171607971, + "learning_rate": 4.247608431544622e-06, + "loss": 0.5829, + "step": 1435 + }, + { + "epoch": 1.6062639821029083, + "grad_norm": 0.4937509000301361, + "learning_rate": 4.246541209386247e-06, + "loss": 0.578, + "step": 1436 + }, + { + "epoch": 1.6073825503355703, + "grad_norm": 0.489439457654953, + "learning_rate": 4.245473365147056e-06, + "loss": 0.5793, + "step": 1437 + }, + { + "epoch": 1.6085011185682325, + "grad_norm": 0.48611974716186523, + "learning_rate": 4.244404899207393e-06, + "loss": 0.5952, + "step": 1438 + }, + { + "epoch": 1.6096196868008947, + "grad_norm": 0.48233819007873535, + "learning_rate": 4.2433358119478215e-06, + "loss": 0.586, + "step": 1439 + }, + { + "epoch": 1.610738255033557, + "grad_norm": 0.4946385324001312, + "learning_rate": 4.24226610374913e-06, + "loss": 0.5832, + "step": 1440 + }, + { + "epoch": 1.6118568232662192, + "grad_norm": 0.49822622537612915, + "learning_rate": 4.241195774992323e-06, + "loss": 0.5929, + "step": 1441 + }, + { + "epoch": 1.6129753914988814, + "grad_norm": 0.4824593663215637, + "learning_rate": 4.240124826058631e-06, + "loss": 0.5603, + "step": 1442 + }, + { + "epoch": 1.6140939597315436, + "grad_norm": 0.4935947358608246, + "learning_rate": 4.239053257329502e-06, + "loss": 0.5708, + "step": 1443 + }, + { + "epoch": 1.6152125279642058, + "grad_norm": 0.48947572708129883, + "learning_rate": 4.237981069186606e-06, + "loss": 0.5576, + "step": 1444 + }, + { + "epoch": 1.616331096196868, + "grad_norm": 0.4911848306655884, + "learning_rate": 4.236908262011834e-06, + "loss": 0.6014, + "step": 1445 + }, + { + "epoch": 1.6174496644295302, + "grad_norm": 0.5052220225334167, + "learning_rate": 4.2358348361872975e-06, + "loss": 0.6186, + "step": 1446 + }, + { + "epoch": 1.6185682326621924, + "grad_norm": 0.47427546977996826, + "learning_rate": 4.234760792095327e-06, + "loss": 0.554, + "step": 1447 + }, + { + "epoch": 1.6196868008948546, + "grad_norm": 0.495304673910141, + "learning_rate": 4.2336861301184754e-06, + "loss": 0.6066, + "step": 1448 + }, + { + "epoch": 1.6208053691275168, + "grad_norm": 0.49640989303588867, + "learning_rate": 4.2326108506395125e-06, + "loss": 0.5935, + "step": 1449 + }, + { + "epoch": 1.621923937360179, + "grad_norm": 0.4849659204483032, + "learning_rate": 4.231534954041432e-06, + "loss": 0.5707, + "step": 1450 + }, + { + "epoch": 1.6230425055928412, + "grad_norm": 0.48401832580566406, + "learning_rate": 4.230458440707443e-06, + "loss": 0.6059, + "step": 1451 + }, + { + "epoch": 1.6241610738255035, + "grad_norm": 0.48183032870292664, + "learning_rate": 4.2293813110209795e-06, + "loss": 0.5926, + "step": 1452 + }, + { + "epoch": 1.6252796420581657, + "grad_norm": 0.47466862201690674, + "learning_rate": 4.22830356536569e-06, + "loss": 0.5472, + "step": 1453 + }, + { + "epoch": 1.6263982102908279, + "grad_norm": 0.4778653383255005, + "learning_rate": 4.227225204125447e-06, + "loss": 0.5331, + "step": 1454 + }, + { + "epoch": 1.62751677852349, + "grad_norm": 0.49169987440109253, + "learning_rate": 4.226146227684337e-06, + "loss": 0.5892, + "step": 1455 + }, + { + "epoch": 1.6286353467561523, + "grad_norm": 0.4861750304698944, + "learning_rate": 4.225066636426669e-06, + "loss": 0.5902, + "step": 1456 + }, + { + "epoch": 1.6297539149888143, + "grad_norm": 0.5020110607147217, + "learning_rate": 4.223986430736972e-06, + "loss": 0.5878, + "step": 1457 + }, + { + "epoch": 1.6308724832214765, + "grad_norm": 0.5259606242179871, + "learning_rate": 4.2229056109999915e-06, + "loss": 0.5855, + "step": 1458 + }, + { + "epoch": 1.6319910514541387, + "grad_norm": 0.5020241737365723, + "learning_rate": 4.221824177600692e-06, + "loss": 0.5728, + "step": 1459 + }, + { + "epoch": 1.633109619686801, + "grad_norm": 0.4941030740737915, + "learning_rate": 4.220742130924257e-06, + "loss": 0.5834, + "step": 1460 + }, + { + "epoch": 1.6342281879194631, + "grad_norm": 0.49204495549201965, + "learning_rate": 4.21965947135609e-06, + "loss": 0.6003, + "step": 1461 + }, + { + "epoch": 1.6353467561521253, + "grad_norm": 0.5063990950584412, + "learning_rate": 4.218576199281809e-06, + "loss": 0.5886, + "step": 1462 + }, + { + "epoch": 1.6364653243847873, + "grad_norm": 0.49553683400154114, + "learning_rate": 4.217492315087255e-06, + "loss": 0.5917, + "step": 1463 + }, + { + "epoch": 1.6375838926174495, + "grad_norm": 0.48495718836784363, + "learning_rate": 4.216407819158482e-06, + "loss": 0.5649, + "step": 1464 + }, + { + "epoch": 1.6387024608501117, + "grad_norm": 0.4799686074256897, + "learning_rate": 4.215322711881766e-06, + "loss": 0.5572, + "step": 1465 + }, + { + "epoch": 1.639821029082774, + "grad_norm": 0.5015504956245422, + "learning_rate": 4.2142369936435986e-06, + "loss": 0.5829, + "step": 1466 + }, + { + "epoch": 1.6409395973154361, + "grad_norm": 0.5071560144424438, + "learning_rate": 4.21315066483069e-06, + "loss": 0.6074, + "step": 1467 + }, + { + "epoch": 1.6420581655480984, + "grad_norm": 0.5041237473487854, + "learning_rate": 4.212063725829966e-06, + "loss": 0.5996, + "step": 1468 + }, + { + "epoch": 1.6431767337807606, + "grad_norm": 0.5061616897583008, + "learning_rate": 4.210976177028573e-06, + "loss": 0.6098, + "step": 1469 + }, + { + "epoch": 1.6442953020134228, + "grad_norm": 0.488351047039032, + "learning_rate": 4.209888018813872e-06, + "loss": 0.5902, + "step": 1470 + }, + { + "epoch": 1.645413870246085, + "grad_norm": 0.494767427444458, + "learning_rate": 4.208799251573441e-06, + "loss": 0.5632, + "step": 1471 + }, + { + "epoch": 1.6465324384787472, + "grad_norm": 0.5081776976585388, + "learning_rate": 4.207709875695078e-06, + "loss": 0.6086, + "step": 1472 + }, + { + "epoch": 1.6476510067114094, + "grad_norm": 0.5012889504432678, + "learning_rate": 4.206619891566792e-06, + "loss": 0.5879, + "step": 1473 + }, + { + "epoch": 1.6487695749440716, + "grad_norm": 0.5006359219551086, + "learning_rate": 4.2055292995768145e-06, + "loss": 0.5648, + "step": 1474 + }, + { + "epoch": 1.6498881431767338, + "grad_norm": 0.4885180592536926, + "learning_rate": 4.204438100113592e-06, + "loss": 0.5629, + "step": 1475 + }, + { + "epoch": 1.651006711409396, + "grad_norm": 0.4856667220592499, + "learning_rate": 4.203346293565784e-06, + "loss": 0.598, + "step": 1476 + }, + { + "epoch": 1.6521252796420582, + "grad_norm": 0.484739750623703, + "learning_rate": 4.2022538803222714e-06, + "loss": 0.5862, + "step": 1477 + }, + { + "epoch": 1.6532438478747205, + "grad_norm": 0.4918213188648224, + "learning_rate": 4.2011608607721455e-06, + "loss": 0.5699, + "step": 1478 + }, + { + "epoch": 1.6543624161073827, + "grad_norm": 0.5036900639533997, + "learning_rate": 4.200067235304719e-06, + "loss": 0.6143, + "step": 1479 + }, + { + "epoch": 1.6554809843400449, + "grad_norm": 0.4958629310131073, + "learning_rate": 4.1989730043095175e-06, + "loss": 0.5423, + "step": 1480 + }, + { + "epoch": 1.656599552572707, + "grad_norm": 0.49514490365982056, + "learning_rate": 4.1978781681762825e-06, + "loss": 0.6092, + "step": 1481 + }, + { + "epoch": 1.6577181208053693, + "grad_norm": 0.5146138668060303, + "learning_rate": 4.1967827272949715e-06, + "loss": 0.5907, + "step": 1482 + }, + { + "epoch": 1.6588366890380313, + "grad_norm": 0.5063082575798035, + "learning_rate": 4.195686682055758e-06, + "loss": 0.5779, + "step": 1483 + }, + { + "epoch": 1.6599552572706935, + "grad_norm": 0.48410534858703613, + "learning_rate": 4.194590032849028e-06, + "loss": 0.5864, + "step": 1484 + }, + { + "epoch": 1.6610738255033557, + "grad_norm": 0.5001199841499329, + "learning_rate": 4.193492780065386e-06, + "loss": 0.5479, + "step": 1485 + }, + { + "epoch": 1.662192393736018, + "grad_norm": 0.49552416801452637, + "learning_rate": 4.19239492409565e-06, + "loss": 0.5646, + "step": 1486 + }, + { + "epoch": 1.6633109619686801, + "grad_norm": 0.4871010482311249, + "learning_rate": 4.191296465330853e-06, + "loss": 0.5601, + "step": 1487 + }, + { + "epoch": 1.6644295302013423, + "grad_norm": 0.5030876398086548, + "learning_rate": 4.190197404162242e-06, + "loss": 0.5892, + "step": 1488 + }, + { + "epoch": 1.6655480984340043, + "grad_norm": 0.4873735010623932, + "learning_rate": 4.18909774098128e-06, + "loss": 0.5981, + "step": 1489 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 0.5151247978210449, + "learning_rate": 4.187997476179643e-06, + "loss": 0.5813, + "step": 1490 + }, + { + "epoch": 1.6677852348993287, + "grad_norm": 0.4993012249469757, + "learning_rate": 4.1868966101492225e-06, + "loss": 0.5718, + "step": 1491 + }, + { + "epoch": 1.668903803131991, + "grad_norm": 0.48291707038879395, + "learning_rate": 4.1857951432821235e-06, + "loss": 0.566, + "step": 1492 + }, + { + "epoch": 1.6700223713646531, + "grad_norm": 0.49533194303512573, + "learning_rate": 4.184693075970665e-06, + "loss": 0.5956, + "step": 1493 + }, + { + "epoch": 1.6711409395973154, + "grad_norm": 0.5048168897628784, + "learning_rate": 4.183590408607379e-06, + "loss": 0.5735, + "step": 1494 + }, + { + "epoch": 1.6722595078299776, + "grad_norm": 0.4690997004508972, + "learning_rate": 4.1824871415850135e-06, + "loss": 0.5551, + "step": 1495 + }, + { + "epoch": 1.6733780760626398, + "grad_norm": 0.4947203993797302, + "learning_rate": 4.1813832752965275e-06, + "loss": 0.5877, + "step": 1496 + }, + { + "epoch": 1.674496644295302, + "grad_norm": 0.508201539516449, + "learning_rate": 4.180278810135096e-06, + "loss": 0.6059, + "step": 1497 + }, + { + "epoch": 1.6756152125279642, + "grad_norm": 0.4906938672065735, + "learning_rate": 4.179173746494105e-06, + "loss": 0.575, + "step": 1498 + }, + { + "epoch": 1.6767337807606264, + "grad_norm": 0.48704689741134644, + "learning_rate": 4.178068084767155e-06, + "loss": 0.5701, + "step": 1499 + }, + { + "epoch": 1.6778523489932886, + "grad_norm": 0.5050029754638672, + "learning_rate": 4.176961825348059e-06, + "loss": 0.5727, + "step": 1500 + }, + { + "epoch": 1.6789709172259508, + "grad_norm": 0.4842473566532135, + "learning_rate": 4.175854968630843e-06, + "loss": 0.5682, + "step": 1501 + }, + { + "epoch": 1.680089485458613, + "grad_norm": 0.5032848119735718, + "learning_rate": 4.1747475150097465e-06, + "loss": 0.5873, + "step": 1502 + }, + { + "epoch": 1.6812080536912752, + "grad_norm": 0.497781902551651, + "learning_rate": 4.17363946487922e-06, + "loss": 0.5914, + "step": 1503 + }, + { + "epoch": 1.6823266219239374, + "grad_norm": 0.5035775303840637, + "learning_rate": 4.172530818633929e-06, + "loss": 0.5519, + "step": 1504 + }, + { + "epoch": 1.6834451901565997, + "grad_norm": 0.47141513228416443, + "learning_rate": 4.171421576668747e-06, + "loss": 0.5913, + "step": 1505 + }, + { + "epoch": 1.6845637583892619, + "grad_norm": 0.49259573221206665, + "learning_rate": 4.170311739378765e-06, + "loss": 0.5769, + "step": 1506 + }, + { + "epoch": 1.685682326621924, + "grad_norm": 0.5033203363418579, + "learning_rate": 4.169201307159282e-06, + "loss": 0.5851, + "step": 1507 + }, + { + "epoch": 1.6868008948545863, + "grad_norm": 0.48197343945503235, + "learning_rate": 4.1680902804058095e-06, + "loss": 0.5502, + "step": 1508 + }, + { + "epoch": 1.6879194630872483, + "grad_norm": 0.5090881586074829, + "learning_rate": 4.166978659514075e-06, + "loss": 0.5684, + "step": 1509 + }, + { + "epoch": 1.6890380313199105, + "grad_norm": 0.4990635812282562, + "learning_rate": 4.1658664448800105e-06, + "loss": 0.5961, + "step": 1510 + }, + { + "epoch": 1.6901565995525727, + "grad_norm": 0.522132396697998, + "learning_rate": 4.164753636899765e-06, + "loss": 0.5941, + "step": 1511 + }, + { + "epoch": 1.691275167785235, + "grad_norm": 0.4907796382904053, + "learning_rate": 4.163640235969696e-06, + "loss": 0.5514, + "step": 1512 + }, + { + "epoch": 1.692393736017897, + "grad_norm": 0.49310702085494995, + "learning_rate": 4.1625262424863744e-06, + "loss": 0.5723, + "step": 1513 + }, + { + "epoch": 1.6935123042505593, + "grad_norm": 0.5064623951911926, + "learning_rate": 4.161411656846581e-06, + "loss": 0.5847, + "step": 1514 + }, + { + "epoch": 1.6946308724832215, + "grad_norm": 0.47868281602859497, + "learning_rate": 4.1602964794473065e-06, + "loss": 0.5757, + "step": 1515 + }, + { + "epoch": 1.6957494407158835, + "grad_norm": 0.5030828714370728, + "learning_rate": 4.159180710685753e-06, + "loss": 0.6001, + "step": 1516 + }, + { + "epoch": 1.6968680089485457, + "grad_norm": 0.4966081976890564, + "learning_rate": 4.158064350959336e-06, + "loss": 0.5643, + "step": 1517 + }, + { + "epoch": 1.697986577181208, + "grad_norm": 0.4922417104244232, + "learning_rate": 4.156947400665677e-06, + "loss": 0.5866, + "step": 1518 + }, + { + "epoch": 1.6991051454138701, + "grad_norm": 0.4993835985660553, + "learning_rate": 4.15582986020261e-06, + "loss": 0.5784, + "step": 1519 + }, + { + "epoch": 1.7002237136465324, + "grad_norm": 0.4877612292766571, + "learning_rate": 4.15471172996818e-06, + "loss": 0.5736, + "step": 1520 + }, + { + "epoch": 1.7013422818791946, + "grad_norm": 0.5129001140594482, + "learning_rate": 4.1535930103606406e-06, + "loss": 0.5798, + "step": 1521 + }, + { + "epoch": 1.7024608501118568, + "grad_norm": 0.48489999771118164, + "learning_rate": 4.152473701778455e-06, + "loss": 0.556, + "step": 1522 + }, + { + "epoch": 1.703579418344519, + "grad_norm": 0.4901261627674103, + "learning_rate": 4.1513538046202995e-06, + "loss": 0.578, + "step": 1523 + }, + { + "epoch": 1.7046979865771812, + "grad_norm": 0.49826791882514954, + "learning_rate": 4.150233319285055e-06, + "loss": 0.5671, + "step": 1524 + }, + { + "epoch": 1.7058165548098434, + "grad_norm": 0.49227479100227356, + "learning_rate": 4.149112246171817e-06, + "loss": 0.5561, + "step": 1525 + }, + { + "epoch": 1.7069351230425056, + "grad_norm": 0.49544984102249146, + "learning_rate": 4.147990585679886e-06, + "loss": 0.5682, + "step": 1526 + }, + { + "epoch": 1.7080536912751678, + "grad_norm": 0.49339771270751953, + "learning_rate": 4.146868338208775e-06, + "loss": 0.5607, + "step": 1527 + }, + { + "epoch": 1.70917225950783, + "grad_norm": 0.506488561630249, + "learning_rate": 4.1457455041582044e-06, + "loss": 0.5778, + "step": 1528 + }, + { + "epoch": 1.7102908277404922, + "grad_norm": 0.5283234715461731, + "learning_rate": 4.144622083928102e-06, + "loss": 0.5777, + "step": 1529 + }, + { + "epoch": 1.7114093959731544, + "grad_norm": 0.5093042254447937, + "learning_rate": 4.14349807791861e-06, + "loss": 0.5832, + "step": 1530 + }, + { + "epoch": 1.7125279642058167, + "grad_norm": 0.49732184410095215, + "learning_rate": 4.142373486530071e-06, + "loss": 0.5965, + "step": 1531 + }, + { + "epoch": 1.7136465324384789, + "grad_norm": 0.5034535527229309, + "learning_rate": 4.141248310163042e-06, + "loss": 0.5896, + "step": 1532 + }, + { + "epoch": 1.714765100671141, + "grad_norm": 0.484789103269577, + "learning_rate": 4.140122549218289e-06, + "loss": 0.5636, + "step": 1533 + }, + { + "epoch": 1.7158836689038033, + "grad_norm": 0.49596208333969116, + "learning_rate": 4.138996204096781e-06, + "loss": 0.5794, + "step": 1534 + }, + { + "epoch": 1.7170022371364653, + "grad_norm": 0.4908808767795563, + "learning_rate": 4.137869275199701e-06, + "loss": 0.5846, + "step": 1535 + }, + { + "epoch": 1.7181208053691275, + "grad_norm": 0.4921574890613556, + "learning_rate": 4.1367417629284356e-06, + "loss": 0.5834, + "step": 1536 + }, + { + "epoch": 1.7192393736017897, + "grad_norm": 0.48451635241508484, + "learning_rate": 4.13561366768458e-06, + "loss": 0.5624, + "step": 1537 + }, + { + "epoch": 1.720357941834452, + "grad_norm": 0.5193755626678467, + "learning_rate": 4.134484989869939e-06, + "loss": 0.5908, + "step": 1538 + }, + { + "epoch": 1.721476510067114, + "grad_norm": 0.497798889875412, + "learning_rate": 4.133355729886523e-06, + "loss": 0.5668, + "step": 1539 + }, + { + "epoch": 1.7225950782997763, + "grad_norm": 0.5026369690895081, + "learning_rate": 4.132225888136552e-06, + "loss": 0.5932, + "step": 1540 + }, + { + "epoch": 1.7237136465324385, + "grad_norm": 0.5131746530532837, + "learning_rate": 4.131095465022449e-06, + "loss": 0.5965, + "step": 1541 + }, + { + "epoch": 1.7248322147651005, + "grad_norm": 0.4941963851451874, + "learning_rate": 4.129964460946847e-06, + "loss": 0.5797, + "step": 1542 + }, + { + "epoch": 1.7259507829977627, + "grad_norm": 0.5222018957138062, + "learning_rate": 4.128832876312587e-06, + "loss": 0.5948, + "step": 1543 + }, + { + "epoch": 1.727069351230425, + "grad_norm": 0.5069666504859924, + "learning_rate": 4.127700711522715e-06, + "loss": 0.5855, + "step": 1544 + }, + { + "epoch": 1.7281879194630871, + "grad_norm": 0.5024001002311707, + "learning_rate": 4.126567966980484e-06, + "loss": 0.6114, + "step": 1545 + }, + { + "epoch": 1.7293064876957494, + "grad_norm": 0.49587997794151306, + "learning_rate": 4.125434643089353e-06, + "loss": 0.568, + "step": 1546 + }, + { + "epoch": 1.7304250559284116, + "grad_norm": 0.48950231075286865, + "learning_rate": 4.124300740252989e-06, + "loss": 0.5772, + "step": 1547 + }, + { + "epoch": 1.7315436241610738, + "grad_norm": 0.5003415942192078, + "learning_rate": 4.123166258875262e-06, + "loss": 0.5942, + "step": 1548 + }, + { + "epoch": 1.732662192393736, + "grad_norm": 0.5020825266838074, + "learning_rate": 4.1220311993602515e-06, + "loss": 0.5826, + "step": 1549 + }, + { + "epoch": 1.7337807606263982, + "grad_norm": 0.5122042894363403, + "learning_rate": 4.120895562112242e-06, + "loss": 0.6112, + "step": 1550 + }, + { + "epoch": 1.7348993288590604, + "grad_norm": 0.5022247433662415, + "learning_rate": 4.119759347535722e-06, + "loss": 0.556, + "step": 1551 + }, + { + "epoch": 1.7360178970917226, + "grad_norm": 0.5053969025611877, + "learning_rate": 4.118622556035387e-06, + "loss": 0.6139, + "step": 1552 + }, + { + "epoch": 1.7371364653243848, + "grad_norm": 0.5022400617599487, + "learning_rate": 4.11748518801614e-06, + "loss": 0.5976, + "step": 1553 + }, + { + "epoch": 1.738255033557047, + "grad_norm": 0.49503523111343384, + "learning_rate": 4.116347243883086e-06, + "loss": 0.5742, + "step": 1554 + }, + { + "epoch": 1.7393736017897092, + "grad_norm": 0.5141612887382507, + "learning_rate": 4.115208724041536e-06, + "loss": 0.599, + "step": 1555 + }, + { + "epoch": 1.7404921700223714, + "grad_norm": 0.484074205160141, + "learning_rate": 4.114069628897006e-06, + "loss": 0.5675, + "step": 1556 + }, + { + "epoch": 1.7416107382550337, + "grad_norm": 0.5334796905517578, + "learning_rate": 4.11292995885522e-06, + "loss": 0.6095, + "step": 1557 + }, + { + "epoch": 1.7427293064876959, + "grad_norm": 0.48460689187049866, + "learning_rate": 4.111789714322101e-06, + "loss": 0.5477, + "step": 1558 + }, + { + "epoch": 1.743847874720358, + "grad_norm": 0.5085021257400513, + "learning_rate": 4.110648895703782e-06, + "loss": 0.5941, + "step": 1559 + }, + { + "epoch": 1.7449664429530203, + "grad_norm": 0.5090224146842957, + "learning_rate": 4.109507503406599e-06, + "loss": 0.5753, + "step": 1560 + }, + { + "epoch": 1.7460850111856825, + "grad_norm": 0.5055748224258423, + "learning_rate": 4.108365537837088e-06, + "loss": 0.5914, + "step": 1561 + }, + { + "epoch": 1.7472035794183445, + "grad_norm": 0.48406150937080383, + "learning_rate": 4.107222999401997e-06, + "loss": 0.5805, + "step": 1562 + }, + { + "epoch": 1.7483221476510067, + "grad_norm": 0.48651930689811707, + "learning_rate": 4.106079888508272e-06, + "loss": 0.5848, + "step": 1563 + }, + { + "epoch": 1.749440715883669, + "grad_norm": 0.4852994978427887, + "learning_rate": 4.104936205563064e-06, + "loss": 0.5796, + "step": 1564 + }, + { + "epoch": 1.750559284116331, + "grad_norm": 0.4920569658279419, + "learning_rate": 4.10379195097373e-06, + "loss": 0.5981, + "step": 1565 + }, + { + "epoch": 1.7516778523489933, + "grad_norm": 0.5077376961708069, + "learning_rate": 4.1026471251478285e-06, + "loss": 0.6071, + "step": 1566 + }, + { + "epoch": 1.7527964205816555, + "grad_norm": 0.4932427406311035, + "learning_rate": 4.101501728493121e-06, + "loss": 0.5917, + "step": 1567 + }, + { + "epoch": 1.7539149888143175, + "grad_norm": 0.49199509620666504, + "learning_rate": 4.100355761417577e-06, + "loss": 0.5814, + "step": 1568 + }, + { + "epoch": 1.7550335570469797, + "grad_norm": 0.4920978546142578, + "learning_rate": 4.099209224329361e-06, + "loss": 0.5931, + "step": 1569 + }, + { + "epoch": 1.756152125279642, + "grad_norm": 0.5020889043807983, + "learning_rate": 4.098062117636849e-06, + "loss": 0.5669, + "step": 1570 + }, + { + "epoch": 1.7572706935123041, + "grad_norm": 0.5099507570266724, + "learning_rate": 4.096914441748613e-06, + "loss": 0.5758, + "step": 1571 + }, + { + "epoch": 1.7583892617449663, + "grad_norm": 0.49736326932907104, + "learning_rate": 4.095766197073433e-06, + "loss": 0.5624, + "step": 1572 + }, + { + "epoch": 1.7595078299776286, + "grad_norm": 0.494096577167511, + "learning_rate": 4.094617384020287e-06, + "loss": 0.5902, + "step": 1573 + }, + { + "epoch": 1.7606263982102908, + "grad_norm": 0.5103378295898438, + "learning_rate": 4.09346800299836e-06, + "loss": 0.5903, + "step": 1574 + }, + { + "epoch": 1.761744966442953, + "grad_norm": 0.49033430218696594, + "learning_rate": 4.092318054417036e-06, + "loss": 0.5703, + "step": 1575 + }, + { + "epoch": 1.7628635346756152, + "grad_norm": 0.5065776705741882, + "learning_rate": 4.0911675386859015e-06, + "loss": 0.5648, + "step": 1576 + }, + { + "epoch": 1.7639821029082774, + "grad_norm": 0.5001108050346375, + "learning_rate": 4.0900164562147485e-06, + "loss": 0.5857, + "step": 1577 + }, + { + "epoch": 1.7651006711409396, + "grad_norm": 0.5082598924636841, + "learning_rate": 4.0888648074135645e-06, + "loss": 0.5488, + "step": 1578 + }, + { + "epoch": 1.7662192393736018, + "grad_norm": 0.4972035586833954, + "learning_rate": 4.087712592692544e-06, + "loss": 0.594, + "step": 1579 + }, + { + "epoch": 1.767337807606264, + "grad_norm": 0.4995720088481903, + "learning_rate": 4.086559812462082e-06, + "loss": 0.5974, + "step": 1580 + }, + { + "epoch": 1.7684563758389262, + "grad_norm": 0.5213293433189392, + "learning_rate": 4.085406467132774e-06, + "loss": 0.5956, + "step": 1581 + }, + { + "epoch": 1.7695749440715884, + "grad_norm": 0.4914456903934479, + "learning_rate": 4.0842525571154165e-06, + "loss": 0.5804, + "step": 1582 + }, + { + "epoch": 1.7706935123042506, + "grad_norm": 0.4833807945251465, + "learning_rate": 4.083098082821007e-06, + "loss": 0.5468, + "step": 1583 + }, + { + "epoch": 1.7718120805369129, + "grad_norm": 0.4896129071712494, + "learning_rate": 4.081943044660746e-06, + "loss": 0.5949, + "step": 1584 + }, + { + "epoch": 1.772930648769575, + "grad_norm": 0.4938727021217346, + "learning_rate": 4.080787443046034e-06, + "loss": 0.576, + "step": 1585 + }, + { + "epoch": 1.7740492170022373, + "grad_norm": 0.49900153279304504, + "learning_rate": 4.07963127838847e-06, + "loss": 0.5788, + "step": 1586 + }, + { + "epoch": 1.7751677852348995, + "grad_norm": 0.5096091628074646, + "learning_rate": 4.0784745510998556e-06, + "loss": 0.6012, + "step": 1587 + }, + { + "epoch": 1.7762863534675615, + "grad_norm": 0.4790339469909668, + "learning_rate": 4.077317261592194e-06, + "loss": 0.572, + "step": 1588 + }, + { + "epoch": 1.7774049217002237, + "grad_norm": 0.49272850155830383, + "learning_rate": 4.076159410277685e-06, + "loss": 0.559, + "step": 1589 + }, + { + "epoch": 1.778523489932886, + "grad_norm": 0.46964558959007263, + "learning_rate": 4.075000997568732e-06, + "loss": 0.5498, + "step": 1590 + }, + { + "epoch": 1.779642058165548, + "grad_norm": 0.4938308298587799, + "learning_rate": 4.0738420238779365e-06, + "loss": 0.5915, + "step": 1591 + }, + { + "epoch": 1.7807606263982103, + "grad_norm": 0.5084967017173767, + "learning_rate": 4.072682489618101e-06, + "loss": 0.5918, + "step": 1592 + }, + { + "epoch": 1.7818791946308725, + "grad_norm": 0.49874448776245117, + "learning_rate": 4.071522395202226e-06, + "loss": 0.5622, + "step": 1593 + }, + { + "epoch": 1.7829977628635347, + "grad_norm": 0.49470290541648865, + "learning_rate": 4.070361741043511e-06, + "loss": 0.5466, + "step": 1594 + }, + { + "epoch": 1.7841163310961967, + "grad_norm": 0.5095946192741394, + "learning_rate": 4.0692005275553595e-06, + "loss": 0.5731, + "step": 1595 + }, + { + "epoch": 1.785234899328859, + "grad_norm": 0.5301234722137451, + "learning_rate": 4.06803875515137e-06, + "loss": 0.5867, + "step": 1596 + }, + { + "epoch": 1.7863534675615211, + "grad_norm": 0.514214813709259, + "learning_rate": 4.06687642424534e-06, + "loss": 0.6005, + "step": 1597 + }, + { + "epoch": 1.7874720357941833, + "grad_norm": 0.5104812383651733, + "learning_rate": 4.065713535251268e-06, + "loss": 0.5935, + "step": 1598 + }, + { + "epoch": 1.7885906040268456, + "grad_norm": 0.5091798901557922, + "learning_rate": 4.064550088583349e-06, + "loss": 0.5944, + "step": 1599 + }, + { + "epoch": 1.7897091722595078, + "grad_norm": 0.4886806607246399, + "learning_rate": 4.0633860846559794e-06, + "loss": 0.551, + "step": 1600 + }, + { + "epoch": 1.79082774049217, + "grad_norm": 0.49812769889831543, + "learning_rate": 4.062221523883753e-06, + "loss": 0.5491, + "step": 1601 + }, + { + "epoch": 1.7919463087248322, + "grad_norm": 0.5070154070854187, + "learning_rate": 4.06105640668146e-06, + "loss": 0.5942, + "step": 1602 + }, + { + "epoch": 1.7930648769574944, + "grad_norm": 0.5108977556228638, + "learning_rate": 4.059890733464091e-06, + "loss": 0.5943, + "step": 1603 + }, + { + "epoch": 1.7941834451901566, + "grad_norm": 0.5012465119361877, + "learning_rate": 4.058724504646834e-06, + "loss": 0.5642, + "step": 1604 + }, + { + "epoch": 1.7953020134228188, + "grad_norm": 0.49801158905029297, + "learning_rate": 4.057557720645076e-06, + "loss": 0.5709, + "step": 1605 + }, + { + "epoch": 1.796420581655481, + "grad_norm": 0.4962095320224762, + "learning_rate": 4.0563903818743984e-06, + "loss": 0.5965, + "step": 1606 + }, + { + "epoch": 1.7975391498881432, + "grad_norm": 0.4680939018726349, + "learning_rate": 4.055222488750584e-06, + "loss": 0.5476, + "step": 1607 + }, + { + "epoch": 1.7986577181208054, + "grad_norm": 0.48658618330955505, + "learning_rate": 4.054054041689612e-06, + "loss": 0.5959, + "step": 1608 + }, + { + "epoch": 1.7997762863534676, + "grad_norm": 0.4988051652908325, + "learning_rate": 4.052885041107656e-06, + "loss": 0.5821, + "step": 1609 + }, + { + "epoch": 1.8008948545861299, + "grad_norm": 0.49107739329338074, + "learning_rate": 4.051715487421092e-06, + "loss": 0.6017, + "step": 1610 + }, + { + "epoch": 1.802013422818792, + "grad_norm": 0.5018040537834167, + "learning_rate": 4.050545381046488e-06, + "loss": 0.5873, + "step": 1611 + }, + { + "epoch": 1.8031319910514543, + "grad_norm": 0.49129587411880493, + "learning_rate": 4.049374722400613e-06, + "loss": 0.5474, + "step": 1612 + }, + { + "epoch": 1.8042505592841165, + "grad_norm": 0.49445709586143494, + "learning_rate": 4.048203511900428e-06, + "loss": 0.5787, + "step": 1613 + }, + { + "epoch": 1.8053691275167785, + "grad_norm": 0.4987739026546478, + "learning_rate": 4.047031749963095e-06, + "loss": 0.5673, + "step": 1614 + }, + { + "epoch": 1.8064876957494407, + "grad_norm": 0.4809098243713379, + "learning_rate": 4.045859437005971e-06, + "loss": 0.5701, + "step": 1615 + }, + { + "epoch": 1.807606263982103, + "grad_norm": 0.5122459530830383, + "learning_rate": 4.044686573446608e-06, + "loss": 0.5996, + "step": 1616 + }, + { + "epoch": 1.808724832214765, + "grad_norm": 0.49991941452026367, + "learning_rate": 4.0435131597027564e-06, + "loss": 0.5981, + "step": 1617 + }, + { + "epoch": 1.8098434004474273, + "grad_norm": 0.4867013990879059, + "learning_rate": 4.042339196192358e-06, + "loss": 0.5762, + "step": 1618 + }, + { + "epoch": 1.8109619686800895, + "grad_norm": 0.4812503159046173, + "learning_rate": 4.041164683333558e-06, + "loss": 0.5484, + "step": 1619 + }, + { + "epoch": 1.8120805369127517, + "grad_norm": 0.4890724718570709, + "learning_rate": 4.03998962154469e-06, + "loss": 0.6001, + "step": 1620 + }, + { + "epoch": 1.8131991051454137, + "grad_norm": 0.5026843547821045, + "learning_rate": 4.038814011244286e-06, + "loss": 0.5838, + "step": 1621 + }, + { + "epoch": 1.814317673378076, + "grad_norm": 0.49154821038246155, + "learning_rate": 4.037637852851075e-06, + "loss": 0.5607, + "step": 1622 + }, + { + "epoch": 1.8154362416107381, + "grad_norm": 0.4974023699760437, + "learning_rate": 4.036461146783979e-06, + "loss": 0.5528, + "step": 1623 + }, + { + "epoch": 1.8165548098434003, + "grad_norm": 0.5217268466949463, + "learning_rate": 4.035283893462114e-06, + "loss": 0.5978, + "step": 1624 + }, + { + "epoch": 1.8176733780760626, + "grad_norm": 0.5096696019172668, + "learning_rate": 4.034106093304795e-06, + "loss": 0.6219, + "step": 1625 + }, + { + "epoch": 1.8187919463087248, + "grad_norm": 0.49499067664146423, + "learning_rate": 4.032927746731528e-06, + "loss": 0.6066, + "step": 1626 + }, + { + "epoch": 1.819910514541387, + "grad_norm": 0.48196083307266235, + "learning_rate": 4.031748854162014e-06, + "loss": 0.562, + "step": 1627 + }, + { + "epoch": 1.8210290827740492, + "grad_norm": 0.5044178366661072, + "learning_rate": 4.030569416016152e-06, + "loss": 0.6103, + "step": 1628 + }, + { + "epoch": 1.8221476510067114, + "grad_norm": 0.48558661341667175, + "learning_rate": 4.0293894327140315e-06, + "loss": 0.5558, + "step": 1629 + }, + { + "epoch": 1.8232662192393736, + "grad_norm": 0.492214173078537, + "learning_rate": 4.0282089046759365e-06, + "loss": 0.5665, + "step": 1630 + }, + { + "epoch": 1.8243847874720358, + "grad_norm": 0.4871746599674225, + "learning_rate": 4.027027832322348e-06, + "loss": 0.5497, + "step": 1631 + }, + { + "epoch": 1.825503355704698, + "grad_norm": 0.4865874648094177, + "learning_rate": 4.025846216073938e-06, + "loss": 0.5558, + "step": 1632 + }, + { + "epoch": 1.8266219239373602, + "grad_norm": 0.4919774532318115, + "learning_rate": 4.024664056351572e-06, + "loss": 0.5804, + "step": 1633 + }, + { + "epoch": 1.8277404921700224, + "grad_norm": 0.49305927753448486, + "learning_rate": 4.02348135357631e-06, + "loss": 0.5869, + "step": 1634 + }, + { + "epoch": 1.8288590604026846, + "grad_norm": 0.523074209690094, + "learning_rate": 4.022298108169408e-06, + "loss": 0.5828, + "step": 1635 + }, + { + "epoch": 1.8299776286353469, + "grad_norm": 0.4927634298801422, + "learning_rate": 4.021114320552311e-06, + "loss": 0.5614, + "step": 1636 + }, + { + "epoch": 1.831096196868009, + "grad_norm": 0.49153605103492737, + "learning_rate": 4.019929991146659e-06, + "loss": 0.5589, + "step": 1637 + }, + { + "epoch": 1.8322147651006713, + "grad_norm": 0.5093767046928406, + "learning_rate": 4.018745120374286e-06, + "loss": 0.5923, + "step": 1638 + }, + { + "epoch": 1.8333333333333335, + "grad_norm": 0.5137583017349243, + "learning_rate": 4.017559708657216e-06, + "loss": 0.6154, + "step": 1639 + }, + { + "epoch": 1.8344519015659957, + "grad_norm": 0.5136058926582336, + "learning_rate": 4.016373756417669e-06, + "loss": 0.5634, + "step": 1640 + }, + { + "epoch": 1.8355704697986577, + "grad_norm": 0.4985203146934509, + "learning_rate": 4.0151872640780554e-06, + "loss": 0.5906, + "step": 1641 + }, + { + "epoch": 1.8366890380313199, + "grad_norm": 0.4897896349430084, + "learning_rate": 4.014000232060978e-06, + "loss": 0.5707, + "step": 1642 + }, + { + "epoch": 1.837807606263982, + "grad_norm": 0.49876436591148376, + "learning_rate": 4.012812660789233e-06, + "loss": 0.5998, + "step": 1643 + }, + { + "epoch": 1.8389261744966443, + "grad_norm": 0.5061901807785034, + "learning_rate": 4.011624550685808e-06, + "loss": 0.602, + "step": 1644 + }, + { + "epoch": 1.8400447427293065, + "grad_norm": 0.505077600479126, + "learning_rate": 4.010435902173883e-06, + "loss": 0.5753, + "step": 1645 + }, + { + "epoch": 1.8411633109619687, + "grad_norm": 0.4808630049228668, + "learning_rate": 4.0092467156768274e-06, + "loss": 0.5737, + "step": 1646 + }, + { + "epoch": 1.8422818791946307, + "grad_norm": 0.5141550898551941, + "learning_rate": 4.008056991618206e-06, + "loss": 0.5963, + "step": 1647 + }, + { + "epoch": 1.843400447427293, + "grad_norm": 0.4977302551269531, + "learning_rate": 4.006866730421773e-06, + "loss": 0.5772, + "step": 1648 + }, + { + "epoch": 1.8445190156599551, + "grad_norm": 0.48998597264289856, + "learning_rate": 4.005675932511474e-06, + "loss": 0.5539, + "step": 1649 + }, + { + "epoch": 1.8456375838926173, + "grad_norm": 0.5025562644004822, + "learning_rate": 4.004484598311445e-06, + "loss": 0.5811, + "step": 1650 + }, + { + "epoch": 1.8467561521252795, + "grad_norm": 0.49174991250038147, + "learning_rate": 4.003292728246015e-06, + "loss": 0.5752, + "step": 1651 + }, + { + "epoch": 1.8478747203579418, + "grad_norm": 0.49403971433639526, + "learning_rate": 4.0021003227397015e-06, + "loss": 0.6112, + "step": 1652 + }, + { + "epoch": 1.848993288590604, + "grad_norm": 0.5124534368515015, + "learning_rate": 4.000907382217215e-06, + "loss": 0.5871, + "step": 1653 + }, + { + "epoch": 1.8501118568232662, + "grad_norm": 0.5068091750144958, + "learning_rate": 3.9997139071034555e-06, + "loss": 0.6084, + "step": 1654 + }, + { + "epoch": 1.8512304250559284, + "grad_norm": 0.5051630139350891, + "learning_rate": 3.998519897823514e-06, + "loss": 0.5964, + "step": 1655 + }, + { + "epoch": 1.8523489932885906, + "grad_norm": 0.49604538083076477, + "learning_rate": 3.997325354802669e-06, + "loss": 0.5655, + "step": 1656 + }, + { + "epoch": 1.8534675615212528, + "grad_norm": 0.5094951391220093, + "learning_rate": 3.996130278466394e-06, + "loss": 0.5824, + "step": 1657 + }, + { + "epoch": 1.854586129753915, + "grad_norm": 0.4972209930419922, + "learning_rate": 3.994934669240347e-06, + "loss": 0.5644, + "step": 1658 + }, + { + "epoch": 1.8557046979865772, + "grad_norm": 0.49230918288230896, + "learning_rate": 3.993738527550382e-06, + "loss": 0.5801, + "step": 1659 + }, + { + "epoch": 1.8568232662192394, + "grad_norm": 0.5058436989784241, + "learning_rate": 3.9925418538225355e-06, + "loss": 0.5917, + "step": 1660 + }, + { + "epoch": 1.8579418344519016, + "grad_norm": 0.5173357129096985, + "learning_rate": 3.9913446484830396e-06, + "loss": 0.5809, + "step": 1661 + }, + { + "epoch": 1.8590604026845639, + "grad_norm": 0.48803800344467163, + "learning_rate": 3.9901469119583125e-06, + "loss": 0.5721, + "step": 1662 + }, + { + "epoch": 1.860178970917226, + "grad_norm": 0.5094105005264282, + "learning_rate": 3.988948644674962e-06, + "loss": 0.5742, + "step": 1663 + }, + { + "epoch": 1.8612975391498883, + "grad_norm": 0.49270302057266235, + "learning_rate": 3.987749847059788e-06, + "loss": 0.5864, + "step": 1664 + }, + { + "epoch": 1.8624161073825505, + "grad_norm": 0.49852290749549866, + "learning_rate": 3.986550519539773e-06, + "loss": 0.5875, + "step": 1665 + }, + { + "epoch": 1.8635346756152127, + "grad_norm": 0.4975663423538208, + "learning_rate": 3.985350662542096e-06, + "loss": 0.5649, + "step": 1666 + }, + { + "epoch": 1.8646532438478747, + "grad_norm": 0.5100948214530945, + "learning_rate": 3.984150276494116e-06, + "loss": 0.564, + "step": 1667 + }, + { + "epoch": 1.8657718120805369, + "grad_norm": 0.5079052448272705, + "learning_rate": 3.982949361823388e-06, + "loss": 0.5662, + "step": 1668 + }, + { + "epoch": 1.866890380313199, + "grad_norm": 0.49659186601638794, + "learning_rate": 3.981747918957653e-06, + "loss": 0.55, + "step": 1669 + }, + { + "epoch": 1.8680089485458613, + "grad_norm": 0.48802655935287476, + "learning_rate": 3.980545948324835e-06, + "loss": 0.5676, + "step": 1670 + }, + { + "epoch": 1.8691275167785235, + "grad_norm": 0.4884412884712219, + "learning_rate": 3.979343450353056e-06, + "loss": 0.5771, + "step": 1671 + }, + { + "epoch": 1.8702460850111857, + "grad_norm": 0.49414676427841187, + "learning_rate": 3.978140425470615e-06, + "loss": 0.5606, + "step": 1672 + }, + { + "epoch": 1.8713646532438477, + "grad_norm": 0.4894448220729828, + "learning_rate": 3.976936874106008e-06, + "loss": 0.5428, + "step": 1673 + }, + { + "epoch": 1.87248322147651, + "grad_norm": 0.48966702818870544, + "learning_rate": 3.9757327966879134e-06, + "loss": 0.5601, + "step": 1674 + }, + { + "epoch": 1.8736017897091721, + "grad_norm": 0.513053297996521, + "learning_rate": 3.974528193645196e-06, + "loss": 0.5993, + "step": 1675 + }, + { + "epoch": 1.8747203579418343, + "grad_norm": 0.5094168782234192, + "learning_rate": 3.973323065406911e-06, + "loss": 0.59, + "step": 1676 + }, + { + "epoch": 1.8758389261744965, + "grad_norm": 0.513595461845398, + "learning_rate": 3.9721174124023e-06, + "loss": 0.5842, + "step": 1677 + }, + { + "epoch": 1.8769574944071588, + "grad_norm": 0.5007264614105225, + "learning_rate": 3.9709112350607904e-06, + "loss": 0.5798, + "step": 1678 + }, + { + "epoch": 1.878076062639821, + "grad_norm": 0.48793676495552063, + "learning_rate": 3.969704533811997e-06, + "loss": 0.5724, + "step": 1679 + }, + { + "epoch": 1.8791946308724832, + "grad_norm": 0.5160763263702393, + "learning_rate": 3.96849730908572e-06, + "loss": 0.5812, + "step": 1680 + }, + { + "epoch": 1.8803131991051454, + "grad_norm": 0.5015366077423096, + "learning_rate": 3.967289561311949e-06, + "loss": 0.562, + "step": 1681 + }, + { + "epoch": 1.8814317673378076, + "grad_norm": 0.4753899574279785, + "learning_rate": 3.9660812909208575e-06, + "loss": 0.548, + "step": 1682 + }, + { + "epoch": 1.8825503355704698, + "grad_norm": 0.5044482350349426, + "learning_rate": 3.964872498342805e-06, + "loss": 0.606, + "step": 1683 + }, + { + "epoch": 1.883668903803132, + "grad_norm": 0.4928360879421234, + "learning_rate": 3.963663184008338e-06, + "loss": 0.5852, + "step": 1684 + }, + { + "epoch": 1.8847874720357942, + "grad_norm": 0.4930068254470825, + "learning_rate": 3.962453348348189e-06, + "loss": 0.5818, + "step": 1685 + }, + { + "epoch": 1.8859060402684564, + "grad_norm": 0.5112072229385376, + "learning_rate": 3.961242991793276e-06, + "loss": 0.6012, + "step": 1686 + }, + { + "epoch": 1.8870246085011186, + "grad_norm": 0.5157861113548279, + "learning_rate": 3.960032114774702e-06, + "loss": 0.6044, + "step": 1687 + }, + { + "epoch": 1.8881431767337808, + "grad_norm": 0.4951840937137604, + "learning_rate": 3.958820717723754e-06, + "loss": 0.5944, + "step": 1688 + }, + { + "epoch": 1.889261744966443, + "grad_norm": 0.5044351816177368, + "learning_rate": 3.957608801071907e-06, + "loss": 0.5814, + "step": 1689 + }, + { + "epoch": 1.8903803131991053, + "grad_norm": 0.498749315738678, + "learning_rate": 3.956396365250821e-06, + "loss": 0.5821, + "step": 1690 + }, + { + "epoch": 1.8914988814317675, + "grad_norm": 0.5246557593345642, + "learning_rate": 3.955183410692338e-06, + "loss": 0.5937, + "step": 1691 + }, + { + "epoch": 1.8926174496644297, + "grad_norm": 0.49238988757133484, + "learning_rate": 3.953969937828489e-06, + "loss": 0.5567, + "step": 1692 + }, + { + "epoch": 1.8937360178970917, + "grad_norm": 0.49119988083839417, + "learning_rate": 3.952755947091485e-06, + "loss": 0.553, + "step": 1693 + }, + { + "epoch": 1.8948545861297539, + "grad_norm": 0.5058138370513916, + "learning_rate": 3.951541438913723e-06, + "loss": 0.5559, + "step": 1694 + }, + { + "epoch": 1.895973154362416, + "grad_norm": 0.49434196949005127, + "learning_rate": 3.950326413727788e-06, + "loss": 0.5723, + "step": 1695 + }, + { + "epoch": 1.8970917225950783, + "grad_norm": 0.49539026618003845, + "learning_rate": 3.949110871966444e-06, + "loss": 0.5426, + "step": 1696 + }, + { + "epoch": 1.8982102908277405, + "grad_norm": 0.5126526355743408, + "learning_rate": 3.9478948140626414e-06, + "loss": 0.5969, + "step": 1697 + }, + { + "epoch": 1.8993288590604027, + "grad_norm": 0.5030180811882019, + "learning_rate": 3.946678240449515e-06, + "loss": 0.5667, + "step": 1698 + }, + { + "epoch": 1.900447427293065, + "grad_norm": 0.5048319101333618, + "learning_rate": 3.945461151560382e-06, + "loss": 0.5755, + "step": 1699 + }, + { + "epoch": 1.901565995525727, + "grad_norm": 0.49721550941467285, + "learning_rate": 3.944243547828742e-06, + "loss": 0.5752, + "step": 1700 + }, + { + "epoch": 1.9026845637583891, + "grad_norm": 0.5135868191719055, + "learning_rate": 3.943025429688281e-06, + "loss": 0.6088, + "step": 1701 + }, + { + "epoch": 1.9038031319910513, + "grad_norm": 0.48717236518859863, + "learning_rate": 3.941806797572867e-06, + "loss": 0.5583, + "step": 1702 + }, + { + "epoch": 1.9049217002237135, + "grad_norm": 0.4741756319999695, + "learning_rate": 3.940587651916551e-06, + "loss": 0.5203, + "step": 1703 + }, + { + "epoch": 1.9060402684563758, + "grad_norm": 0.49962809681892395, + "learning_rate": 3.939367993153566e-06, + "loss": 0.5598, + "step": 1704 + }, + { + "epoch": 1.907158836689038, + "grad_norm": 0.48792213201522827, + "learning_rate": 3.938147821718328e-06, + "loss": 0.5504, + "step": 1705 + }, + { + "epoch": 1.9082774049217002, + "grad_norm": 0.5173740386962891, + "learning_rate": 3.936927138045438e-06, + "loss": 0.5623, + "step": 1706 + }, + { + "epoch": 1.9093959731543624, + "grad_norm": 0.5256320834159851, + "learning_rate": 3.935705942569675e-06, + "loss": 0.5873, + "step": 1707 + }, + { + "epoch": 1.9105145413870246, + "grad_norm": 0.5090080499649048, + "learning_rate": 3.934484235726006e-06, + "loss": 0.6075, + "step": 1708 + }, + { + "epoch": 1.9116331096196868, + "grad_norm": 0.4803988039493561, + "learning_rate": 3.933262017949574e-06, + "loss": 0.5692, + "step": 1709 + }, + { + "epoch": 1.912751677852349, + "grad_norm": 0.4928509294986725, + "learning_rate": 3.932039289675709e-06, + "loss": 0.565, + "step": 1710 + }, + { + "epoch": 1.9138702460850112, + "grad_norm": 0.5102552771568298, + "learning_rate": 3.930816051339918e-06, + "loss": 0.5732, + "step": 1711 + }, + { + "epoch": 1.9149888143176734, + "grad_norm": 0.49371102452278137, + "learning_rate": 3.929592303377896e-06, + "loss": 0.5702, + "step": 1712 + }, + { + "epoch": 1.9161073825503356, + "grad_norm": 0.48887091875076294, + "learning_rate": 3.928368046225515e-06, + "loss": 0.5743, + "step": 1713 + }, + { + "epoch": 1.9172259507829978, + "grad_norm": 0.5040315985679626, + "learning_rate": 3.927143280318828e-06, + "loss": 0.5998, + "step": 1714 + }, + { + "epoch": 1.91834451901566, + "grad_norm": 0.5192286372184753, + "learning_rate": 3.92591800609407e-06, + "loss": 0.5694, + "step": 1715 + }, + { + "epoch": 1.9194630872483223, + "grad_norm": 0.5107104778289795, + "learning_rate": 3.924692223987661e-06, + "loss": 0.6087, + "step": 1716 + }, + { + "epoch": 1.9205816554809845, + "grad_norm": 0.5028025507926941, + "learning_rate": 3.923465934436195e-06, + "loss": 0.5696, + "step": 1717 + }, + { + "epoch": 1.9217002237136467, + "grad_norm": 0.5084637403488159, + "learning_rate": 3.922239137876452e-06, + "loss": 0.6032, + "step": 1718 + }, + { + "epoch": 1.9228187919463087, + "grad_norm": 0.5004264116287231, + "learning_rate": 3.921011834745392e-06, + "loss": 0.5559, + "step": 1719 + }, + { + "epoch": 1.9239373601789709, + "grad_norm": 0.522179365158081, + "learning_rate": 3.919784025480151e-06, + "loss": 0.612, + "step": 1720 + }, + { + "epoch": 1.925055928411633, + "grad_norm": 0.508663535118103, + "learning_rate": 3.918555710518051e-06, + "loss": 0.5731, + "step": 1721 + }, + { + "epoch": 1.9261744966442953, + "grad_norm": 0.5001826286315918, + "learning_rate": 3.917326890296591e-06, + "loss": 0.5906, + "step": 1722 + }, + { + "epoch": 1.9272930648769575, + "grad_norm": 0.4933156967163086, + "learning_rate": 3.91609756525345e-06, + "loss": 0.5816, + "step": 1723 + }, + { + "epoch": 1.9284116331096197, + "grad_norm": 0.48916542530059814, + "learning_rate": 3.914867735826489e-06, + "loss": 0.6014, + "step": 1724 + }, + { + "epoch": 1.929530201342282, + "grad_norm": 0.5224503874778748, + "learning_rate": 3.913637402453745e-06, + "loss": 0.5787, + "step": 1725 + }, + { + "epoch": 1.930648769574944, + "grad_norm": 0.4952391982078552, + "learning_rate": 3.912406565573438e-06, + "loss": 0.5606, + "step": 1726 + }, + { + "epoch": 1.9317673378076061, + "grad_norm": 0.5111702680587769, + "learning_rate": 3.911175225623965e-06, + "loss": 0.5668, + "step": 1727 + }, + { + "epoch": 1.9328859060402683, + "grad_norm": 0.4997490346431732, + "learning_rate": 3.909943383043904e-06, + "loss": 0.5695, + "step": 1728 + }, + { + "epoch": 1.9340044742729305, + "grad_norm": 0.5036013126373291, + "learning_rate": 3.908711038272011e-06, + "loss": 0.5928, + "step": 1729 + }, + { + "epoch": 1.9351230425055927, + "grad_norm": 0.5162931084632874, + "learning_rate": 3.907478191747221e-06, + "loss": 0.5749, + "step": 1730 + }, + { + "epoch": 1.936241610738255, + "grad_norm": 0.48660942912101746, + "learning_rate": 3.906244843908647e-06, + "loss": 0.5539, + "step": 1731 + }, + { + "epoch": 1.9373601789709172, + "grad_norm": 0.4941866993904114, + "learning_rate": 3.905010995195582e-06, + "loss": 0.5854, + "step": 1732 + }, + { + "epoch": 1.9384787472035794, + "grad_norm": 0.48834383487701416, + "learning_rate": 3.903776646047496e-06, + "loss": 0.6049, + "step": 1733 + }, + { + "epoch": 1.9395973154362416, + "grad_norm": 0.5004755258560181, + "learning_rate": 3.902541796904038e-06, + "loss": 0.5544, + "step": 1734 + }, + { + "epoch": 1.9407158836689038, + "grad_norm": 0.508080005645752, + "learning_rate": 3.901306448205035e-06, + "loss": 0.5804, + "step": 1735 + }, + { + "epoch": 1.941834451901566, + "grad_norm": 0.49556127190589905, + "learning_rate": 3.9000706003904934e-06, + "loss": 0.5871, + "step": 1736 + }, + { + "epoch": 1.9429530201342282, + "grad_norm": 0.49995389580726624, + "learning_rate": 3.898834253900594e-06, + "loss": 0.59, + "step": 1737 + }, + { + "epoch": 1.9440715883668904, + "grad_norm": 0.4921482503414154, + "learning_rate": 3.8975974091756975e-06, + "loss": 0.5611, + "step": 1738 + }, + { + "epoch": 1.9451901565995526, + "grad_norm": 0.5095739364624023, + "learning_rate": 3.896360066656342e-06, + "loss": 0.5636, + "step": 1739 + }, + { + "epoch": 1.9463087248322148, + "grad_norm": 0.4894288182258606, + "learning_rate": 3.895122226783243e-06, + "loss": 0.5896, + "step": 1740 + }, + { + "epoch": 1.947427293064877, + "grad_norm": 0.516846776008606, + "learning_rate": 3.893883889997292e-06, + "loss": 0.5938, + "step": 1741 + }, + { + "epoch": 1.9485458612975393, + "grad_norm": 0.4975781738758087, + "learning_rate": 3.892645056739559e-06, + "loss": 0.5635, + "step": 1742 + }, + { + "epoch": 1.9496644295302015, + "grad_norm": 0.49723535776138306, + "learning_rate": 3.891405727451289e-06, + "loss": 0.588, + "step": 1743 + }, + { + "epoch": 1.9507829977628637, + "grad_norm": 0.5051411390304565, + "learning_rate": 3.8901659025739044e-06, + "loss": 0.5574, + "step": 1744 + }, + { + "epoch": 1.951901565995526, + "grad_norm": 0.4824368953704834, + "learning_rate": 3.888925582549006e-06, + "loss": 0.5679, + "step": 1745 + }, + { + "epoch": 1.9530201342281879, + "grad_norm": 0.49928563833236694, + "learning_rate": 3.887684767818369e-06, + "loss": 0.5664, + "step": 1746 + }, + { + "epoch": 1.95413870246085, + "grad_norm": 0.5037205219268799, + "learning_rate": 3.886443458823944e-06, + "loss": 0.5685, + "step": 1747 + }, + { + "epoch": 1.9552572706935123, + "grad_norm": 0.49731746315956116, + "learning_rate": 3.88520165600786e-06, + "loss": 0.563, + "step": 1748 + }, + { + "epoch": 1.9563758389261745, + "grad_norm": 0.5214453935623169, + "learning_rate": 3.883959359812421e-06, + "loss": 0.5882, + "step": 1749 + }, + { + "epoch": 1.9574944071588367, + "grad_norm": 0.49750804901123047, + "learning_rate": 3.8827165706801055e-06, + "loss": 0.5872, + "step": 1750 + }, + { + "epoch": 1.958612975391499, + "grad_norm": 0.5092785358428955, + "learning_rate": 3.881473289053569e-06, + "loss": 0.5948, + "step": 1751 + }, + { + "epoch": 1.959731543624161, + "grad_norm": 0.5069229602813721, + "learning_rate": 3.880229515375642e-06, + "loss": 0.5836, + "step": 1752 + }, + { + "epoch": 1.9608501118568231, + "grad_norm": 0.48659929633140564, + "learning_rate": 3.878985250089331e-06, + "loss": 0.5723, + "step": 1753 + }, + { + "epoch": 1.9619686800894853, + "grad_norm": 0.4850320518016815, + "learning_rate": 3.8777404936378145e-06, + "loss": 0.567, + "step": 1754 + }, + { + "epoch": 1.9630872483221475, + "grad_norm": 0.49184632301330566, + "learning_rate": 3.87649524646445e-06, + "loss": 0.5594, + "step": 1755 + }, + { + "epoch": 1.9642058165548097, + "grad_norm": 0.5033354163169861, + "learning_rate": 3.875249509012769e-06, + "loss": 0.5774, + "step": 1756 + }, + { + "epoch": 1.965324384787472, + "grad_norm": 0.4867788851261139, + "learning_rate": 3.874003281726474e-06, + "loss": 0.5656, + "step": 1757 + }, + { + "epoch": 1.9664429530201342, + "grad_norm": 0.5024017095565796, + "learning_rate": 3.872756565049447e-06, + "loss": 0.542, + "step": 1758 + }, + { + "epoch": 1.9675615212527964, + "grad_norm": 0.4939708709716797, + "learning_rate": 3.871509359425741e-06, + "loss": 0.5554, + "step": 1759 + }, + { + "epoch": 1.9686800894854586, + "grad_norm": 0.48120421171188354, + "learning_rate": 3.870261665299583e-06, + "loss": 0.5724, + "step": 1760 + }, + { + "epoch": 1.9697986577181208, + "grad_norm": 0.49949952960014343, + "learning_rate": 3.869013483115378e-06, + "loss": 0.5732, + "step": 1761 + }, + { + "epoch": 1.970917225950783, + "grad_norm": 0.5079313516616821, + "learning_rate": 3.867764813317699e-06, + "loss": 0.5969, + "step": 1762 + }, + { + "epoch": 1.9720357941834452, + "grad_norm": 0.5027442574501038, + "learning_rate": 3.866515656351297e-06, + "loss": 0.5742, + "step": 1763 + }, + { + "epoch": 1.9731543624161074, + "grad_norm": 0.4810044765472412, + "learning_rate": 3.865266012661095e-06, + "loss": 0.5704, + "step": 1764 + }, + { + "epoch": 1.9742729306487696, + "grad_norm": 0.49268779158592224, + "learning_rate": 3.86401588269219e-06, + "loss": 0.5459, + "step": 1765 + }, + { + "epoch": 1.9753914988814318, + "grad_norm": 0.4957699179649353, + "learning_rate": 3.86276526688985e-06, + "loss": 0.5722, + "step": 1766 + }, + { + "epoch": 1.976510067114094, + "grad_norm": 0.4951817989349365, + "learning_rate": 3.8615141656995194e-06, + "loss": 0.5564, + "step": 1767 + }, + { + "epoch": 1.9776286353467563, + "grad_norm": 0.5150087475776672, + "learning_rate": 3.860262579566813e-06, + "loss": 0.5721, + "step": 1768 + }, + { + "epoch": 1.9787472035794185, + "grad_norm": 0.5071924328804016, + "learning_rate": 3.859010508937521e-06, + "loss": 0.5685, + "step": 1769 + }, + { + "epoch": 1.9798657718120807, + "grad_norm": 0.5006820559501648, + "learning_rate": 3.857757954257601e-06, + "loss": 0.5701, + "step": 1770 + }, + { + "epoch": 1.9809843400447429, + "grad_norm": 0.5228837132453918, + "learning_rate": 3.856504915973188e-06, + "loss": 0.5691, + "step": 1771 + }, + { + "epoch": 1.9821029082774049, + "grad_norm": 0.4966356158256531, + "learning_rate": 3.855251394530589e-06, + "loss": 0.5598, + "step": 1772 + }, + { + "epoch": 1.983221476510067, + "grad_norm": 0.5056618452072144, + "learning_rate": 3.85399739037628e-06, + "loss": 0.5656, + "step": 1773 + }, + { + "epoch": 1.9843400447427293, + "grad_norm": 0.49823620915412903, + "learning_rate": 3.85274290395691e-06, + "loss": 0.5685, + "step": 1774 + }, + { + "epoch": 1.9854586129753915, + "grad_norm": 0.5027350187301636, + "learning_rate": 3.851487935719302e-06, + "loss": 0.5697, + "step": 1775 + }, + { + "epoch": 1.9865771812080537, + "grad_norm": 0.5098000764846802, + "learning_rate": 3.8502324861104475e-06, + "loss": 0.5658, + "step": 1776 + }, + { + "epoch": 1.987695749440716, + "grad_norm": 0.5115095376968384, + "learning_rate": 3.848976555577513e-06, + "loss": 0.5695, + "step": 1777 + }, + { + "epoch": 1.9888143176733781, + "grad_norm": 0.485042005777359, + "learning_rate": 3.847720144567832e-06, + "loss": 0.5505, + "step": 1778 + }, + { + "epoch": 1.9899328859060401, + "grad_norm": 0.5096463561058044, + "learning_rate": 3.846463253528913e-06, + "loss": 0.576, + "step": 1779 + }, + { + "epoch": 1.9910514541387023, + "grad_norm": 0.508958101272583, + "learning_rate": 3.845205882908432e-06, + "loss": 0.58, + "step": 1780 + }, + { + "epoch": 1.9921700223713645, + "grad_norm": 0.5001410245895386, + "learning_rate": 3.8439480331542394e-06, + "loss": 0.5778, + "step": 1781 + }, + { + "epoch": 1.9932885906040267, + "grad_norm": 0.49567386507987976, + "learning_rate": 3.842689704714354e-06, + "loss": 0.5688, + "step": 1782 + }, + { + "epoch": 1.994407158836689, + "grad_norm": 0.5215677618980408, + "learning_rate": 3.841430898036966e-06, + "loss": 0.5879, + "step": 1783 + }, + { + "epoch": 1.9955257270693512, + "grad_norm": 0.5176389217376709, + "learning_rate": 3.840171613570435e-06, + "loss": 0.5788, + "step": 1784 + }, + { + "epoch": 1.9966442953020134, + "grad_norm": 0.5101540088653564, + "learning_rate": 3.838911851763291e-06, + "loss": 0.563, + "step": 1785 + }, + { + "epoch": 1.9977628635346756, + "grad_norm": 0.5037233233451843, + "learning_rate": 3.837651613064234e-06, + "loss": 0.582, + "step": 1786 + }, + { + "epoch": 1.9988814317673378, + "grad_norm": 0.5072547197341919, + "learning_rate": 3.836390897922136e-06, + "loss": 0.5856, + "step": 1787 + }, + { + "epoch": 2.0, + "grad_norm": 0.49261927604675293, + "learning_rate": 3.835129706786036e-06, + "loss": 0.5879, + "step": 1788 + }, + { + "epoch": 2.001118568232662, + "grad_norm": 0.4979532063007355, + "learning_rate": 3.833868040105143e-06, + "loss": 0.537, + "step": 1789 + }, + { + "epoch": 2.0022371364653244, + "grad_norm": 0.5097135305404663, + "learning_rate": 3.8326058983288355e-06, + "loss": 0.5378, + "step": 1790 + }, + { + "epoch": 2.0033557046979866, + "grad_norm": 0.5016512870788574, + "learning_rate": 3.831343281906663e-06, + "loss": 0.5606, + "step": 1791 + }, + { + "epoch": 2.004474272930649, + "grad_norm": 0.5033982992172241, + "learning_rate": 3.830080191288342e-06, + "loss": 0.5779, + "step": 1792 + }, + { + "epoch": 2.005592841163311, + "grad_norm": 0.49050572514533997, + "learning_rate": 3.8288166269237585e-06, + "loss": 0.5639, + "step": 1793 + }, + { + "epoch": 2.0067114093959733, + "grad_norm": 0.49373260140419006, + "learning_rate": 3.827552589262966e-06, + "loss": 0.5647, + "step": 1794 + }, + { + "epoch": 2.0078299776286355, + "grad_norm": 0.5228650569915771, + "learning_rate": 3.82628807875619e-06, + "loss": 0.56, + "step": 1795 + }, + { + "epoch": 2.0089485458612977, + "grad_norm": 0.5270240902900696, + "learning_rate": 3.8250230958538205e-06, + "loss": 0.5804, + "step": 1796 + }, + { + "epoch": 2.01006711409396, + "grad_norm": 0.5110482573509216, + "learning_rate": 3.8237576410064185e-06, + "loss": 0.5483, + "step": 1797 + }, + { + "epoch": 2.011185682326622, + "grad_norm": 0.5088019967079163, + "learning_rate": 3.8224917146647104e-06, + "loss": 0.5526, + "step": 1798 + }, + { + "epoch": 2.0123042505592843, + "grad_norm": 0.5138302445411682, + "learning_rate": 3.821225317279595e-06, + "loss": 0.5579, + "step": 1799 + }, + { + "epoch": 2.0134228187919465, + "grad_norm": 0.4998088479042053, + "learning_rate": 3.8199584493021335e-06, + "loss": 0.5713, + "step": 1800 + }, + { + "epoch": 2.0145413870246087, + "grad_norm": 0.5129007697105408, + "learning_rate": 3.818691111183559e-06, + "loss": 0.5301, + "step": 1801 + }, + { + "epoch": 2.0156599552572705, + "grad_norm": 0.5121548175811768, + "learning_rate": 3.81742330337527e-06, + "loss": 0.5561, + "step": 1802 + }, + { + "epoch": 2.0167785234899327, + "grad_norm": 0.5094550251960754, + "learning_rate": 3.816155026328832e-06, + "loss": 0.5483, + "step": 1803 + }, + { + "epoch": 2.017897091722595, + "grad_norm": 0.5191040635108948, + "learning_rate": 3.81488628049598e-06, + "loss": 0.568, + "step": 1804 + }, + { + "epoch": 2.019015659955257, + "grad_norm": 0.5283321142196655, + "learning_rate": 3.813617066328612e-06, + "loss": 0.5759, + "step": 1805 + }, + { + "epoch": 2.0201342281879193, + "grad_norm": 0.5108759999275208, + "learning_rate": 3.812347384278796e-06, + "loss": 0.531, + "step": 1806 + }, + { + "epoch": 2.0212527964205815, + "grad_norm": 0.5010626316070557, + "learning_rate": 3.811077234798766e-06, + "loss": 0.5455, + "step": 1807 + }, + { + "epoch": 2.0223713646532437, + "grad_norm": 0.5164163708686829, + "learning_rate": 3.8098066183409223e-06, + "loss": 0.5558, + "step": 1808 + }, + { + "epoch": 2.023489932885906, + "grad_norm": 0.501948893070221, + "learning_rate": 3.808535535357831e-06, + "loss": 0.5393, + "step": 1809 + }, + { + "epoch": 2.024608501118568, + "grad_norm": 0.516548752784729, + "learning_rate": 3.8072639863022256e-06, + "loss": 0.5604, + "step": 1810 + }, + { + "epoch": 2.0257270693512304, + "grad_norm": 0.5083953738212585, + "learning_rate": 3.8059919716270033e-06, + "loss": 0.5773, + "step": 1811 + }, + { + "epoch": 2.0268456375838926, + "grad_norm": 0.5123399496078491, + "learning_rate": 3.8047194917852303e-06, + "loss": 0.5429, + "step": 1812 + }, + { + "epoch": 2.027964205816555, + "grad_norm": 0.5018823742866516, + "learning_rate": 3.803446547230136e-06, + "loss": 0.5492, + "step": 1813 + }, + { + "epoch": 2.029082774049217, + "grad_norm": 0.5036315321922302, + "learning_rate": 3.802173138415117e-06, + "loss": 0.5493, + "step": 1814 + }, + { + "epoch": 2.030201342281879, + "grad_norm": 0.5142338275909424, + "learning_rate": 3.8008992657937326e-06, + "loss": 0.5531, + "step": 1815 + }, + { + "epoch": 2.0313199105145414, + "grad_norm": 0.5331456065177917, + "learning_rate": 3.7996249298197108e-06, + "loss": 0.5543, + "step": 1816 + }, + { + "epoch": 2.0324384787472036, + "grad_norm": 0.51402747631073, + "learning_rate": 3.7983501309469422e-06, + "loss": 0.5828, + "step": 1817 + }, + { + "epoch": 2.033557046979866, + "grad_norm": 0.5249538421630859, + "learning_rate": 3.7970748696294834e-06, + "loss": 0.5662, + "step": 1818 + }, + { + "epoch": 2.034675615212528, + "grad_norm": 0.520852267742157, + "learning_rate": 3.7957991463215547e-06, + "loss": 0.549, + "step": 1819 + }, + { + "epoch": 2.0357941834451903, + "grad_norm": 0.5112542510032654, + "learning_rate": 3.7945229614775424e-06, + "loss": 0.5821, + "step": 1820 + }, + { + "epoch": 2.0369127516778525, + "grad_norm": 0.5227968096733093, + "learning_rate": 3.7932463155519973e-06, + "loss": 0.5731, + "step": 1821 + }, + { + "epoch": 2.0380313199105147, + "grad_norm": 0.5266485810279846, + "learning_rate": 3.7919692089996306e-06, + "loss": 0.5697, + "step": 1822 + }, + { + "epoch": 2.039149888143177, + "grad_norm": 0.5138500928878784, + "learning_rate": 3.7906916422753225e-06, + "loss": 0.5305, + "step": 1823 + }, + { + "epoch": 2.040268456375839, + "grad_norm": 0.5250424742698669, + "learning_rate": 3.7894136158341153e-06, + "loss": 0.5247, + "step": 1824 + }, + { + "epoch": 2.0413870246085013, + "grad_norm": 0.5152517557144165, + "learning_rate": 3.788135130131215e-06, + "loss": 0.5781, + "step": 1825 + }, + { + "epoch": 2.0425055928411635, + "grad_norm": 0.5056542158126831, + "learning_rate": 3.7868561856219893e-06, + "loss": 0.5259, + "step": 1826 + }, + { + "epoch": 2.0436241610738257, + "grad_norm": 0.5131957530975342, + "learning_rate": 3.7855767827619714e-06, + "loss": 0.5567, + "step": 1827 + }, + { + "epoch": 2.0447427293064875, + "grad_norm": 0.5123169422149658, + "learning_rate": 3.7842969220068592e-06, + "loss": 0.5584, + "step": 1828 + }, + { + "epoch": 2.0458612975391497, + "grad_norm": 0.5096324682235718, + "learning_rate": 3.7830166038125107e-06, + "loss": 0.537, + "step": 1829 + }, + { + "epoch": 2.046979865771812, + "grad_norm": 0.5322879552841187, + "learning_rate": 3.781735828634947e-06, + "loss": 0.5619, + "step": 1830 + }, + { + "epoch": 2.048098434004474, + "grad_norm": 0.5452521443367004, + "learning_rate": 3.780454596930354e-06, + "loss": 0.5658, + "step": 1831 + }, + { + "epoch": 2.0492170022371363, + "grad_norm": 0.5220131874084473, + "learning_rate": 3.7791729091550795e-06, + "loss": 0.549, + "step": 1832 + }, + { + "epoch": 2.0503355704697985, + "grad_norm": 0.5197979211807251, + "learning_rate": 3.7778907657656332e-06, + "loss": 0.577, + "step": 1833 + }, + { + "epoch": 2.0514541387024607, + "grad_norm": 0.5078215003013611, + "learning_rate": 3.7766081672186866e-06, + "loss": 0.5694, + "step": 1834 + }, + { + "epoch": 2.052572706935123, + "grad_norm": 0.5158956050872803, + "learning_rate": 3.7753251139710746e-06, + "loss": 0.5425, + "step": 1835 + }, + { + "epoch": 2.053691275167785, + "grad_norm": 0.5045775771141052, + "learning_rate": 3.774041606479794e-06, + "loss": 0.546, + "step": 1836 + }, + { + "epoch": 2.0548098434004474, + "grad_norm": 0.5353016257286072, + "learning_rate": 3.7727576452020022e-06, + "loss": 0.5593, + "step": 1837 + }, + { + "epoch": 2.0559284116331096, + "grad_norm": 0.5330696702003479, + "learning_rate": 3.7714732305950185e-06, + "loss": 0.5346, + "step": 1838 + }, + { + "epoch": 2.057046979865772, + "grad_norm": 0.518613338470459, + "learning_rate": 3.770188363116324e-06, + "loss": 0.5674, + "step": 1839 + }, + { + "epoch": 2.058165548098434, + "grad_norm": 0.5091762542724609, + "learning_rate": 3.768903043223562e-06, + "loss": 0.5635, + "step": 1840 + }, + { + "epoch": 2.059284116331096, + "grad_norm": 0.5355960130691528, + "learning_rate": 3.767617271374536e-06, + "loss": 0.5471, + "step": 1841 + }, + { + "epoch": 2.0604026845637584, + "grad_norm": 0.5394028425216675, + "learning_rate": 3.76633104802721e-06, + "loss": 0.5626, + "step": 1842 + }, + { + "epoch": 2.0615212527964206, + "grad_norm": 0.5025933384895325, + "learning_rate": 3.76504437363971e-06, + "loss": 0.5442, + "step": 1843 + }, + { + "epoch": 2.062639821029083, + "grad_norm": 0.5353500247001648, + "learning_rate": 3.763757248670321e-06, + "loss": 0.556, + "step": 1844 + }, + { + "epoch": 2.063758389261745, + "grad_norm": 0.5422487854957581, + "learning_rate": 3.7624696735774914e-06, + "loss": 0.5556, + "step": 1845 + }, + { + "epoch": 2.0648769574944073, + "grad_norm": 0.5265703201293945, + "learning_rate": 3.7611816488198267e-06, + "loss": 0.5437, + "step": 1846 + }, + { + "epoch": 2.0659955257270695, + "grad_norm": 0.5145528316497803, + "learning_rate": 3.7598931748560935e-06, + "loss": 0.543, + "step": 1847 + }, + { + "epoch": 2.0671140939597317, + "grad_norm": 0.5063956379890442, + "learning_rate": 3.758604252145219e-06, + "loss": 0.5444, + "step": 1848 + }, + { + "epoch": 2.068232662192394, + "grad_norm": 0.5059778094291687, + "learning_rate": 3.7573148811462924e-06, + "loss": 0.5427, + "step": 1849 + }, + { + "epoch": 2.069351230425056, + "grad_norm": 0.5186747908592224, + "learning_rate": 3.756025062318557e-06, + "loss": 0.5534, + "step": 1850 + }, + { + "epoch": 2.0704697986577183, + "grad_norm": 0.5233258605003357, + "learning_rate": 3.75473479612142e-06, + "loss": 0.5641, + "step": 1851 + }, + { + "epoch": 2.0715883668903805, + "grad_norm": 0.5194051265716553, + "learning_rate": 3.7534440830144464e-06, + "loss": 0.5593, + "step": 1852 + }, + { + "epoch": 2.0727069351230427, + "grad_norm": 0.5253271460533142, + "learning_rate": 3.7521529234573616e-06, + "loss": 0.5654, + "step": 1853 + }, + { + "epoch": 2.073825503355705, + "grad_norm": 0.500721275806427, + "learning_rate": 3.7508613179100494e-06, + "loss": 0.5528, + "step": 1854 + }, + { + "epoch": 2.0749440715883667, + "grad_norm": 0.5255014896392822, + "learning_rate": 3.749569266832551e-06, + "loss": 0.57, + "step": 1855 + }, + { + "epoch": 2.076062639821029, + "grad_norm": 0.5196225643157959, + "learning_rate": 3.748276770685068e-06, + "loss": 0.5559, + "step": 1856 + }, + { + "epoch": 2.077181208053691, + "grad_norm": 0.5191462635993958, + "learning_rate": 3.746983829927961e-06, + "loss": 0.5437, + "step": 1857 + }, + { + "epoch": 2.0782997762863533, + "grad_norm": 0.5193758606910706, + "learning_rate": 3.7456904450217467e-06, + "loss": 0.555, + "step": 1858 + }, + { + "epoch": 2.0794183445190155, + "grad_norm": 0.5171102285385132, + "learning_rate": 3.7443966164271016e-06, + "loss": 0.5613, + "step": 1859 + }, + { + "epoch": 2.0805369127516777, + "grad_norm": 0.5180017352104187, + "learning_rate": 3.7431023446048597e-06, + "loss": 0.5824, + "step": 1860 + }, + { + "epoch": 2.08165548098434, + "grad_norm": 0.513680100440979, + "learning_rate": 3.741807630016014e-06, + "loss": 0.566, + "step": 1861 + }, + { + "epoch": 2.082774049217002, + "grad_norm": 0.5289089679718018, + "learning_rate": 3.740512473121713e-06, + "loss": 0.5533, + "step": 1862 + }, + { + "epoch": 2.0838926174496644, + "grad_norm": 0.531887412071228, + "learning_rate": 3.7392168743832648e-06, + "loss": 0.5507, + "step": 1863 + }, + { + "epoch": 2.0850111856823266, + "grad_norm": 0.5146557688713074, + "learning_rate": 3.737920834262134e-06, + "loss": 0.5706, + "step": 1864 + }, + { + "epoch": 2.086129753914989, + "grad_norm": 0.526422917842865, + "learning_rate": 3.736624353219943e-06, + "loss": 0.5469, + "step": 1865 + }, + { + "epoch": 2.087248322147651, + "grad_norm": 0.5136563777923584, + "learning_rate": 3.73532743171847e-06, + "loss": 0.5557, + "step": 1866 + }, + { + "epoch": 2.088366890380313, + "grad_norm": 0.5312618017196655, + "learning_rate": 3.7340300702196515e-06, + "loss": 0.5711, + "step": 1867 + }, + { + "epoch": 2.0894854586129754, + "grad_norm": 0.5255826711654663, + "learning_rate": 3.7327322691855793e-06, + "loss": 0.5598, + "step": 1868 + }, + { + "epoch": 2.0906040268456376, + "grad_norm": 0.5231844186782837, + "learning_rate": 3.7314340290785026e-06, + "loss": 0.5514, + "step": 1869 + }, + { + "epoch": 2.0917225950783, + "grad_norm": 0.5380609035491943, + "learning_rate": 3.7301353503608286e-06, + "loss": 0.5366, + "step": 1870 + }, + { + "epoch": 2.092841163310962, + "grad_norm": 0.5260589122772217, + "learning_rate": 3.7288362334951156e-06, + "loss": 0.5447, + "step": 1871 + }, + { + "epoch": 2.0939597315436242, + "grad_norm": 0.5319775938987732, + "learning_rate": 3.727536678944085e-06, + "loss": 0.557, + "step": 1872 + }, + { + "epoch": 2.0950782997762865, + "grad_norm": 0.509621262550354, + "learning_rate": 3.726236687170608e-06, + "loss": 0.549, + "step": 1873 + }, + { + "epoch": 2.0961968680089487, + "grad_norm": 0.520440399646759, + "learning_rate": 3.724936258637715e-06, + "loss": 0.5445, + "step": 1874 + }, + { + "epoch": 2.097315436241611, + "grad_norm": 0.5256479978561401, + "learning_rate": 3.7236353938085914e-06, + "loss": 0.5609, + "step": 1875 + }, + { + "epoch": 2.098434004474273, + "grad_norm": 0.5247079133987427, + "learning_rate": 3.7223340931465763e-06, + "loss": 0.5546, + "step": 1876 + }, + { + "epoch": 2.0995525727069353, + "grad_norm": 0.5369859933853149, + "learning_rate": 3.7210323571151664e-06, + "loss": 0.5669, + "step": 1877 + }, + { + "epoch": 2.1006711409395975, + "grad_norm": 0.5226863026618958, + "learning_rate": 3.7197301861780126e-06, + "loss": 0.5652, + "step": 1878 + }, + { + "epoch": 2.1017897091722597, + "grad_norm": 0.5196619033813477, + "learning_rate": 3.718427580798919e-06, + "loss": 0.5509, + "step": 1879 + }, + { + "epoch": 2.1029082774049215, + "grad_norm": 0.5064836144447327, + "learning_rate": 3.717124541441847e-06, + "loss": 0.5613, + "step": 1880 + }, + { + "epoch": 2.1040268456375837, + "grad_norm": 0.5177644491195679, + "learning_rate": 3.715821068570912e-06, + "loss": 0.5628, + "step": 1881 + }, + { + "epoch": 2.105145413870246, + "grad_norm": 0.5370467901229858, + "learning_rate": 3.7145171626503824e-06, + "loss": 0.5445, + "step": 1882 + }, + { + "epoch": 2.106263982102908, + "grad_norm": 0.5503098368644714, + "learning_rate": 3.7132128241446815e-06, + "loss": 0.5667, + "step": 1883 + }, + { + "epoch": 2.1073825503355703, + "grad_norm": 0.5312926173210144, + "learning_rate": 3.711908053518387e-06, + "loss": 0.5636, + "step": 1884 + }, + { + "epoch": 2.1085011185682325, + "grad_norm": 0.5327373743057251, + "learning_rate": 3.710602851236232e-06, + "loss": 0.5341, + "step": 1885 + }, + { + "epoch": 2.1096196868008947, + "grad_norm": 0.5360662341117859, + "learning_rate": 3.7092972177630998e-06, + "loss": 0.5842, + "step": 1886 + }, + { + "epoch": 2.110738255033557, + "grad_norm": 0.532006561756134, + "learning_rate": 3.7079911535640302e-06, + "loss": 0.5852, + "step": 1887 + }, + { + "epoch": 2.111856823266219, + "grad_norm": 0.5109444260597229, + "learning_rate": 3.7066846591042155e-06, + "loss": 0.5655, + "step": 1888 + }, + { + "epoch": 2.1129753914988814, + "grad_norm": 0.5301034450531006, + "learning_rate": 3.705377734849001e-06, + "loss": 0.5482, + "step": 1889 + }, + { + "epoch": 2.1140939597315436, + "grad_norm": 0.5499679446220398, + "learning_rate": 3.7040703812638858e-06, + "loss": 0.5833, + "step": 1890 + }, + { + "epoch": 2.115212527964206, + "grad_norm": 0.5118054151535034, + "learning_rate": 3.7027625988145204e-06, + "loss": 0.527, + "step": 1891 + }, + { + "epoch": 2.116331096196868, + "grad_norm": 0.5172855257987976, + "learning_rate": 3.7014543879667097e-06, + "loss": 0.5533, + "step": 1892 + }, + { + "epoch": 2.11744966442953, + "grad_norm": 0.5329854488372803, + "learning_rate": 3.7001457491864106e-06, + "loss": 0.5519, + "step": 1893 + }, + { + "epoch": 2.1185682326621924, + "grad_norm": 0.522705078125, + "learning_rate": 3.6988366829397326e-06, + "loss": 0.5723, + "step": 1894 + }, + { + "epoch": 2.1196868008948546, + "grad_norm": 0.5207564234733582, + "learning_rate": 3.697527189692937e-06, + "loss": 0.5397, + "step": 1895 + }, + { + "epoch": 2.120805369127517, + "grad_norm": 0.5084646344184875, + "learning_rate": 3.696217269912437e-06, + "loss": 0.5288, + "step": 1896 + }, + { + "epoch": 2.121923937360179, + "grad_norm": 0.533870279788971, + "learning_rate": 3.694906924064799e-06, + "loss": 0.5575, + "step": 1897 + }, + { + "epoch": 2.1230425055928412, + "grad_norm": 0.5326732993125916, + "learning_rate": 3.69359615261674e-06, + "loss": 0.5667, + "step": 1898 + }, + { + "epoch": 2.1241610738255035, + "grad_norm": 0.5208526253700256, + "learning_rate": 3.6922849560351293e-06, + "loss": 0.548, + "step": 1899 + }, + { + "epoch": 2.1252796420581657, + "grad_norm": 0.5319230556488037, + "learning_rate": 3.6909733347869855e-06, + "loss": 0.5455, + "step": 1900 + }, + { + "epoch": 2.126398210290828, + "grad_norm": 0.5171891450881958, + "learning_rate": 3.6896612893394816e-06, + "loss": 0.5201, + "step": 1901 + }, + { + "epoch": 2.12751677852349, + "grad_norm": 0.4919440746307373, + "learning_rate": 3.6883488201599405e-06, + "loss": 0.5407, + "step": 1902 + }, + { + "epoch": 2.1286353467561523, + "grad_norm": 0.4933460056781769, + "learning_rate": 3.687035927715835e-06, + "loss": 0.5519, + "step": 1903 + }, + { + "epoch": 2.1297539149888145, + "grad_norm": 0.5424522161483765, + "learning_rate": 3.6857226124747903e-06, + "loss": 0.5531, + "step": 1904 + }, + { + "epoch": 2.1308724832214767, + "grad_norm": 0.5328218340873718, + "learning_rate": 3.6844088749045796e-06, + "loss": 0.5419, + "step": 1905 + }, + { + "epoch": 2.131991051454139, + "grad_norm": 0.5593510270118713, + "learning_rate": 3.683094715473131e-06, + "loss": 0.5798, + "step": 1906 + }, + { + "epoch": 2.1331096196868007, + "grad_norm": 0.523350179195404, + "learning_rate": 3.6817801346485183e-06, + "loss": 0.5616, + "step": 1907 + }, + { + "epoch": 2.134228187919463, + "grad_norm": 0.523203432559967, + "learning_rate": 3.680465132898967e-06, + "loss": 0.557, + "step": 1908 + }, + { + "epoch": 2.135346756152125, + "grad_norm": 0.5441296696662903, + "learning_rate": 3.6791497106928532e-06, + "loss": 0.5602, + "step": 1909 + }, + { + "epoch": 2.1364653243847873, + "grad_norm": 0.5297915935516357, + "learning_rate": 3.6778338684987036e-06, + "loss": 0.5703, + "step": 1910 + }, + { + "epoch": 2.1375838926174495, + "grad_norm": 0.5416028499603271, + "learning_rate": 3.6765176067851915e-06, + "loss": 0.5547, + "step": 1911 + }, + { + "epoch": 2.1387024608501117, + "grad_norm": 0.5388940572738647, + "learning_rate": 3.6752009260211413e-06, + "loss": 0.5625, + "step": 1912 + }, + { + "epoch": 2.139821029082774, + "grad_norm": 0.5242265462875366, + "learning_rate": 3.673883826675527e-06, + "loss": 0.551, + "step": 1913 + }, + { + "epoch": 2.140939597315436, + "grad_norm": 0.5342205166816711, + "learning_rate": 3.672566309217472e-06, + "loss": 0.5404, + "step": 1914 + }, + { + "epoch": 2.1420581655480984, + "grad_norm": 0.511896550655365, + "learning_rate": 3.6712483741162473e-06, + "loss": 0.5424, + "step": 1915 + }, + { + "epoch": 2.1431767337807606, + "grad_norm": 0.5412023067474365, + "learning_rate": 3.669930021841274e-06, + "loss": 0.5656, + "step": 1916 + }, + { + "epoch": 2.1442953020134228, + "grad_norm": 0.523468554019928, + "learning_rate": 3.6686112528621204e-06, + "loss": 0.5497, + "step": 1917 + }, + { + "epoch": 2.145413870246085, + "grad_norm": 0.5094099044799805, + "learning_rate": 3.6672920676485047e-06, + "loss": 0.5231, + "step": 1918 + }, + { + "epoch": 2.146532438478747, + "grad_norm": 0.5130200386047363, + "learning_rate": 3.665972466670291e-06, + "loss": 0.574, + "step": 1919 + }, + { + "epoch": 2.1476510067114094, + "grad_norm": 0.5246685743331909, + "learning_rate": 3.6646524503974955e-06, + "loss": 0.555, + "step": 1920 + }, + { + "epoch": 2.1487695749440716, + "grad_norm": 0.5176163911819458, + "learning_rate": 3.6633320193002786e-06, + "loss": 0.5587, + "step": 1921 + }, + { + "epoch": 2.149888143176734, + "grad_norm": 0.5470594167709351, + "learning_rate": 3.6620111738489494e-06, + "loss": 0.585, + "step": 1922 + }, + { + "epoch": 2.151006711409396, + "grad_norm": 0.5229683518409729, + "learning_rate": 3.660689914513965e-06, + "loss": 0.573, + "step": 1923 + }, + { + "epoch": 2.1521252796420582, + "grad_norm": 0.5269148945808411, + "learning_rate": 3.6593682417659303e-06, + "loss": 0.553, + "step": 1924 + }, + { + "epoch": 2.1532438478747205, + "grad_norm": 0.5332608819007874, + "learning_rate": 3.6580461560755974e-06, + "loss": 0.587, + "step": 1925 + }, + { + "epoch": 2.1543624161073827, + "grad_norm": 0.5306010842323303, + "learning_rate": 3.6567236579138645e-06, + "loss": 0.5373, + "step": 1926 + }, + { + "epoch": 2.155480984340045, + "grad_norm": 0.5315900444984436, + "learning_rate": 3.6554007477517774e-06, + "loss": 0.554, + "step": 1927 + }, + { + "epoch": 2.156599552572707, + "grad_norm": 0.5413175225257874, + "learning_rate": 3.6540774260605285e-06, + "loss": 0.566, + "step": 1928 + }, + { + "epoch": 2.1577181208053693, + "grad_norm": 0.511902928352356, + "learning_rate": 3.652753693311456e-06, + "loss": 0.5681, + "step": 1929 + }, + { + "epoch": 2.1588366890380315, + "grad_norm": 0.5407286882400513, + "learning_rate": 3.6514295499760477e-06, + "loss": 0.5844, + "step": 1930 + }, + { + "epoch": 2.1599552572706937, + "grad_norm": 0.5205231308937073, + "learning_rate": 3.6501049965259337e-06, + "loss": 0.5647, + "step": 1931 + }, + { + "epoch": 2.1610738255033555, + "grad_norm": 0.5314143300056458, + "learning_rate": 3.648780033432891e-06, + "loss": 0.5489, + "step": 1932 + }, + { + "epoch": 2.162192393736018, + "grad_norm": 0.5291325449943542, + "learning_rate": 3.6474546611688446e-06, + "loss": 0.5717, + "step": 1933 + }, + { + "epoch": 2.16331096196868, + "grad_norm": 0.5457761883735657, + "learning_rate": 3.646128880205863e-06, + "loss": 0.5589, + "step": 1934 + }, + { + "epoch": 2.164429530201342, + "grad_norm": 0.5354388952255249, + "learning_rate": 3.6448026910161626e-06, + "loss": 0.5663, + "step": 1935 + }, + { + "epoch": 2.1655480984340043, + "grad_norm": 0.5253117084503174, + "learning_rate": 3.643476094072102e-06, + "loss": 0.5479, + "step": 1936 + }, + { + "epoch": 2.1666666666666665, + "grad_norm": 0.5337569713592529, + "learning_rate": 3.6421490898461875e-06, + "loss": 0.5772, + "step": 1937 + }, + { + "epoch": 2.1677852348993287, + "grad_norm": 0.5295342206954956, + "learning_rate": 3.6408216788110705e-06, + "loss": 0.5415, + "step": 1938 + }, + { + "epoch": 2.168903803131991, + "grad_norm": 0.5360909700393677, + "learning_rate": 3.6394938614395454e-06, + "loss": 0.5488, + "step": 1939 + }, + { + "epoch": 2.170022371364653, + "grad_norm": 0.5327028036117554, + "learning_rate": 3.6381656382045528e-06, + "loss": 0.5406, + "step": 1940 + }, + { + "epoch": 2.1711409395973154, + "grad_norm": 0.5560749173164368, + "learning_rate": 3.636837009579178e-06, + "loss": 0.6032, + "step": 1941 + }, + { + "epoch": 2.1722595078299776, + "grad_norm": 0.5210057497024536, + "learning_rate": 3.6355079760366506e-06, + "loss": 0.5574, + "step": 1942 + }, + { + "epoch": 2.1733780760626398, + "grad_norm": 0.5295266509056091, + "learning_rate": 3.6341785380503436e-06, + "loss": 0.562, + "step": 1943 + }, + { + "epoch": 2.174496644295302, + "grad_norm": 0.50819993019104, + "learning_rate": 3.6328486960937735e-06, + "loss": 0.5407, + "step": 1944 + }, + { + "epoch": 2.175615212527964, + "grad_norm": 0.5325652360916138, + "learning_rate": 3.6315184506406026e-06, + "loss": 0.5554, + "step": 1945 + }, + { + "epoch": 2.1767337807606264, + "grad_norm": 0.5164484977722168, + "learning_rate": 3.6301878021646365e-06, + "loss": 0.5323, + "step": 1946 + }, + { + "epoch": 2.1778523489932886, + "grad_norm": 0.5184676647186279, + "learning_rate": 3.628856751139823e-06, + "loss": 0.5249, + "step": 1947 + }, + { + "epoch": 2.178970917225951, + "grad_norm": 0.5510220527648926, + "learning_rate": 3.627525298040255e-06, + "loss": 0.586, + "step": 1948 + }, + { + "epoch": 2.180089485458613, + "grad_norm": 0.5402721166610718, + "learning_rate": 3.626193443340167e-06, + "loss": 0.5785, + "step": 1949 + }, + { + "epoch": 2.1812080536912752, + "grad_norm": 0.5245886445045471, + "learning_rate": 3.6248611875139382e-06, + "loss": 0.5552, + "step": 1950 + }, + { + "epoch": 2.1823266219239374, + "grad_norm": 0.562502384185791, + "learning_rate": 3.6235285310360886e-06, + "loss": 0.5869, + "step": 1951 + }, + { + "epoch": 2.1834451901565997, + "grad_norm": 0.5293618440628052, + "learning_rate": 3.622195474381283e-06, + "loss": 0.5646, + "step": 1952 + }, + { + "epoch": 2.184563758389262, + "grad_norm": 0.5357851386070251, + "learning_rate": 3.620862018024327e-06, + "loss": 0.5731, + "step": 1953 + }, + { + "epoch": 2.185682326621924, + "grad_norm": 0.5279757976531982, + "learning_rate": 3.61952816244017e-06, + "loss": 0.5465, + "step": 1954 + }, + { + "epoch": 2.1868008948545863, + "grad_norm": 0.5184484124183655, + "learning_rate": 3.618193908103904e-06, + "loss": 0.5658, + "step": 1955 + }, + { + "epoch": 2.1879194630872485, + "grad_norm": 0.5314368605613708, + "learning_rate": 3.61685925549076e-06, + "loss": 0.5664, + "step": 1956 + }, + { + "epoch": 2.1890380313199107, + "grad_norm": 0.5167467594146729, + "learning_rate": 3.6155242050761135e-06, + "loss": 0.5649, + "step": 1957 + }, + { + "epoch": 2.190156599552573, + "grad_norm": 0.5202130079269409, + "learning_rate": 3.614188757335481e-06, + "loss": 0.5309, + "step": 1958 + }, + { + "epoch": 2.1912751677852347, + "grad_norm": 0.5223071575164795, + "learning_rate": 3.612852912744522e-06, + "loss": 0.5547, + "step": 1959 + }, + { + "epoch": 2.192393736017897, + "grad_norm": 0.51512610912323, + "learning_rate": 3.611516671779034e-06, + "loss": 0.5482, + "step": 1960 + }, + { + "epoch": 2.193512304250559, + "grad_norm": 0.5242097973823547, + "learning_rate": 3.6101800349149586e-06, + "loss": 0.5581, + "step": 1961 + }, + { + "epoch": 2.1946308724832213, + "grad_norm": 0.5404880046844482, + "learning_rate": 3.6088430026283773e-06, + "loss": 0.565, + "step": 1962 + }, + { + "epoch": 2.1957494407158835, + "grad_norm": 0.5668287873268127, + "learning_rate": 3.6075055753955125e-06, + "loss": 0.5784, + "step": 1963 + }, + { + "epoch": 2.1968680089485457, + "grad_norm": 0.5423756837844849, + "learning_rate": 3.6061677536927265e-06, + "loss": 0.5676, + "step": 1964 + }, + { + "epoch": 2.197986577181208, + "grad_norm": 0.5335581302642822, + "learning_rate": 3.604829537996524e-06, + "loss": 0.5362, + "step": 1965 + }, + { + "epoch": 2.19910514541387, + "grad_norm": 0.5309687852859497, + "learning_rate": 3.603490928783549e-06, + "loss": 0.5242, + "step": 1966 + }, + { + "epoch": 2.2002237136465324, + "grad_norm": 0.5442765951156616, + "learning_rate": 3.6021519265305847e-06, + "loss": 0.581, + "step": 1967 + }, + { + "epoch": 2.2013422818791946, + "grad_norm": 0.5234119892120361, + "learning_rate": 3.6008125317145563e-06, + "loss": 0.5361, + "step": 1968 + }, + { + "epoch": 2.2024608501118568, + "grad_norm": 0.5330463647842407, + "learning_rate": 3.599472744812527e-06, + "loss": 0.5527, + "step": 1969 + }, + { + "epoch": 2.203579418344519, + "grad_norm": 0.5369210243225098, + "learning_rate": 3.5981325663017e-06, + "loss": 0.5831, + "step": 1970 + }, + { + "epoch": 2.204697986577181, + "grad_norm": 0.5206089019775391, + "learning_rate": 3.59679199665942e-06, + "loss": 0.5518, + "step": 1971 + }, + { + "epoch": 2.2058165548098434, + "grad_norm": 0.5332524180412292, + "learning_rate": 3.595451036363168e-06, + "loss": 0.5518, + "step": 1972 + }, + { + "epoch": 2.2069351230425056, + "grad_norm": 0.5313246250152588, + "learning_rate": 3.5941096858905657e-06, + "loss": 0.5152, + "step": 1973 + }, + { + "epoch": 2.208053691275168, + "grad_norm": 0.5441534519195557, + "learning_rate": 3.5927679457193743e-06, + "loss": 0.5664, + "step": 1974 + }, + { + "epoch": 2.20917225950783, + "grad_norm": 0.547796905040741, + "learning_rate": 3.5914258163274922e-06, + "loss": 0.5442, + "step": 1975 + }, + { + "epoch": 2.2102908277404922, + "grad_norm": 0.5382126569747925, + "learning_rate": 3.5900832981929574e-06, + "loss": 0.5346, + "step": 1976 + }, + { + "epoch": 2.2114093959731544, + "grad_norm": 0.519952654838562, + "learning_rate": 3.5887403917939466e-06, + "loss": 0.549, + "step": 1977 + }, + { + "epoch": 2.2125279642058167, + "grad_norm": 0.5248762965202332, + "learning_rate": 3.5873970976087746e-06, + "loss": 0.5412, + "step": 1978 + }, + { + "epoch": 2.213646532438479, + "grad_norm": 0.5430974364280701, + "learning_rate": 3.5860534161158943e-06, + "loss": 0.5803, + "step": 1979 + }, + { + "epoch": 2.214765100671141, + "grad_norm": 0.5381439924240112, + "learning_rate": 3.5847093477938955e-06, + "loss": 0.5718, + "step": 1980 + }, + { + "epoch": 2.2158836689038033, + "grad_norm": 0.5410338044166565, + "learning_rate": 3.583364893121508e-06, + "loss": 0.546, + "step": 1981 + }, + { + "epoch": 2.2170022371364655, + "grad_norm": 0.5335426330566406, + "learning_rate": 3.5820200525775973e-06, + "loss": 0.5344, + "step": 1982 + }, + { + "epoch": 2.2181208053691277, + "grad_norm": 0.5379711985588074, + "learning_rate": 3.5806748266411673e-06, + "loss": 0.5749, + "step": 1983 + }, + { + "epoch": 2.21923937360179, + "grad_norm": 0.5368325710296631, + "learning_rate": 3.579329215791359e-06, + "loss": 0.5566, + "step": 1984 + }, + { + "epoch": 2.220357941834452, + "grad_norm": 0.5387017130851746, + "learning_rate": 3.5779832205074494e-06, + "loss": 0.5741, + "step": 1985 + }, + { + "epoch": 2.221476510067114, + "grad_norm": 0.5216672420501709, + "learning_rate": 3.576636841268854e-06, + "loss": 0.55, + "step": 1986 + }, + { + "epoch": 2.222595078299776, + "grad_norm": 0.5340707302093506, + "learning_rate": 3.5752900785551247e-06, + "loss": 0.5306, + "step": 1987 + }, + { + "epoch": 2.2237136465324383, + "grad_norm": 0.5536656975746155, + "learning_rate": 3.5739429328459497e-06, + "loss": 0.5777, + "step": 1988 + }, + { + "epoch": 2.2248322147651005, + "grad_norm": 0.5179775953292847, + "learning_rate": 3.5725954046211527e-06, + "loss": 0.5504, + "step": 1989 + }, + { + "epoch": 2.2259507829977627, + "grad_norm": 0.5268764495849609, + "learning_rate": 3.571247494360695e-06, + "loss": 0.5344, + "step": 1990 + }, + { + "epoch": 2.227069351230425, + "grad_norm": 0.5509704947471619, + "learning_rate": 3.569899202544675e-06, + "loss": 0.5738, + "step": 1991 + }, + { + "epoch": 2.228187919463087, + "grad_norm": 0.5285952091217041, + "learning_rate": 3.5685505296533233e-06, + "loss": 0.5422, + "step": 1992 + }, + { + "epoch": 2.2293064876957494, + "grad_norm": 0.5342767834663391, + "learning_rate": 3.5672014761670097e-06, + "loss": 0.569, + "step": 1993 + }, + { + "epoch": 2.2304250559284116, + "grad_norm": 0.5381338000297546, + "learning_rate": 3.5658520425662385e-06, + "loss": 0.5989, + "step": 1994 + }, + { + "epoch": 2.2315436241610738, + "grad_norm": 0.5339351892471313, + "learning_rate": 3.5645022293316497e-06, + "loss": 0.5486, + "step": 1995 + }, + { + "epoch": 2.232662192393736, + "grad_norm": 0.5398667454719543, + "learning_rate": 3.5631520369440166e-06, + "loss": 0.5271, + "step": 1996 + }, + { + "epoch": 2.233780760626398, + "grad_norm": 0.5417992472648621, + "learning_rate": 3.561801465884249e-06, + "loss": 0.5676, + "step": 1997 + }, + { + "epoch": 2.2348993288590604, + "grad_norm": 0.5108304619789124, + "learning_rate": 3.560450516633393e-06, + "loss": 0.5547, + "step": 1998 + }, + { + "epoch": 2.2360178970917226, + "grad_norm": 0.5282533168792725, + "learning_rate": 3.559099189672628e-06, + "loss": 0.5352, + "step": 1999 + }, + { + "epoch": 2.237136465324385, + "grad_norm": 0.5304929614067078, + "learning_rate": 3.557747485483266e-06, + "loss": 0.5854, + "step": 2000 + }, + { + "epoch": 2.238255033557047, + "grad_norm": 0.5358375310897827, + "learning_rate": 3.556395404546756e-06, + "loss": 0.5491, + "step": 2001 + }, + { + "epoch": 2.2393736017897092, + "grad_norm": 0.5124779939651489, + "learning_rate": 3.5550429473446817e-06, + "loss": 0.5354, + "step": 2002 + }, + { + "epoch": 2.2404921700223714, + "grad_norm": 0.5421090126037598, + "learning_rate": 3.5536901143587594e-06, + "loss": 0.5665, + "step": 2003 + }, + { + "epoch": 2.2416107382550337, + "grad_norm": 0.527865469455719, + "learning_rate": 3.552336906070838e-06, + "loss": 0.559, + "step": 2004 + }, + { + "epoch": 2.242729306487696, + "grad_norm": 0.5450758337974548, + "learning_rate": 3.5509833229629022e-06, + "loss": 0.5807, + "step": 2005 + }, + { + "epoch": 2.243847874720358, + "grad_norm": 0.5631154179573059, + "learning_rate": 3.54962936551707e-06, + "loss": 0.5922, + "step": 2006 + }, + { + "epoch": 2.2449664429530203, + "grad_norm": 0.5289211273193359, + "learning_rate": 3.5482750342155913e-06, + "loss": 0.571, + "step": 2007 + }, + { + "epoch": 2.2460850111856825, + "grad_norm": 0.5309174656867981, + "learning_rate": 3.546920329540851e-06, + "loss": 0.5545, + "step": 2008 + }, + { + "epoch": 2.2472035794183447, + "grad_norm": 0.5320987701416016, + "learning_rate": 3.545565251975366e-06, + "loss": 0.5708, + "step": 2009 + }, + { + "epoch": 2.248322147651007, + "grad_norm": 0.5375826954841614, + "learning_rate": 3.5442098020017843e-06, + "loss": 0.5656, + "step": 2010 + }, + { + "epoch": 2.2494407158836687, + "grad_norm": 0.5207297801971436, + "learning_rate": 3.5428539801028916e-06, + "loss": 0.5585, + "step": 2011 + }, + { + "epoch": 2.2505592841163313, + "grad_norm": 0.5171449184417725, + "learning_rate": 3.541497786761601e-06, + "loss": 0.5545, + "step": 2012 + }, + { + "epoch": 2.251677852348993, + "grad_norm": 0.5213256478309631, + "learning_rate": 3.5401412224609583e-06, + "loss": 0.5507, + "step": 2013 + }, + { + "epoch": 2.2527964205816553, + "grad_norm": 0.5352752804756165, + "learning_rate": 3.538784287684145e-06, + "loss": 0.5366, + "step": 2014 + }, + { + "epoch": 2.2539149888143175, + "grad_norm": 0.5403833389282227, + "learning_rate": 3.537426982914472e-06, + "loss": 0.574, + "step": 2015 + }, + { + "epoch": 2.2550335570469797, + "grad_norm": 0.5127537250518799, + "learning_rate": 3.5360693086353827e-06, + "loss": 0.5559, + "step": 2016 + }, + { + "epoch": 2.256152125279642, + "grad_norm": 0.5236245393753052, + "learning_rate": 3.5347112653304495e-06, + "loss": 0.5595, + "step": 2017 + }, + { + "epoch": 2.257270693512304, + "grad_norm": 0.5527048110961914, + "learning_rate": 3.5333528534833813e-06, + "loss": 0.5692, + "step": 2018 + }, + { + "epoch": 2.2583892617449663, + "grad_norm": 0.5271632075309753, + "learning_rate": 3.5319940735780134e-06, + "loss": 0.5444, + "step": 2019 + }, + { + "epoch": 2.2595078299776286, + "grad_norm": 0.5247520208358765, + "learning_rate": 3.5306349260983164e-06, + "loss": 0.5668, + "step": 2020 + }, + { + "epoch": 2.2606263982102908, + "grad_norm": 0.5510424375534058, + "learning_rate": 3.5292754115283877e-06, + "loss": 0.5548, + "step": 2021 + }, + { + "epoch": 2.261744966442953, + "grad_norm": 0.5288413166999817, + "learning_rate": 3.527915530352459e-06, + "loss": 0.5313, + "step": 2022 + }, + { + "epoch": 2.262863534675615, + "grad_norm": 0.5299348831176758, + "learning_rate": 3.5265552830548898e-06, + "loss": 0.5689, + "step": 2023 + }, + { + "epoch": 2.2639821029082774, + "grad_norm": 0.5311738848686218, + "learning_rate": 3.5251946701201725e-06, + "loss": 0.5362, + "step": 2024 + }, + { + "epoch": 2.2651006711409396, + "grad_norm": 0.5376578569412231, + "learning_rate": 3.5238336920329274e-06, + "loss": 0.5502, + "step": 2025 + }, + { + "epoch": 2.266219239373602, + "grad_norm": 0.5269151926040649, + "learning_rate": 3.522472349277906e-06, + "loss": 0.5495, + "step": 2026 + }, + { + "epoch": 2.267337807606264, + "grad_norm": 0.5249069333076477, + "learning_rate": 3.521110642339991e-06, + "loss": 0.5515, + "step": 2027 + }, + { + "epoch": 2.2684563758389262, + "grad_norm": 0.5272762775421143, + "learning_rate": 3.519748571704192e-06, + "loss": 0.5684, + "step": 2028 + }, + { + "epoch": 2.2695749440715884, + "grad_norm": 0.5348524451255798, + "learning_rate": 3.5183861378556495e-06, + "loss": 0.5712, + "step": 2029 + }, + { + "epoch": 2.2706935123042506, + "grad_norm": 0.5510343909263611, + "learning_rate": 3.5170233412796338e-06, + "loss": 0.5683, + "step": 2030 + }, + { + "epoch": 2.271812080536913, + "grad_norm": 0.5229319334030151, + "learning_rate": 3.5156601824615443e-06, + "loss": 0.5368, + "step": 2031 + }, + { + "epoch": 2.272930648769575, + "grad_norm": 0.5142685770988464, + "learning_rate": 3.5142966618869096e-06, + "loss": 0.5603, + "step": 2032 + }, + { + "epoch": 2.2740492170022373, + "grad_norm": 0.5549158453941345, + "learning_rate": 3.512932780041386e-06, + "loss": 0.561, + "step": 2033 + }, + { + "epoch": 2.2751677852348995, + "grad_norm": 0.5306481719017029, + "learning_rate": 3.511568537410759e-06, + "loss": 0.5555, + "step": 2034 + }, + { + "epoch": 2.2762863534675617, + "grad_norm": 0.5306711196899414, + "learning_rate": 3.5102039344809423e-06, + "loss": 0.5623, + "step": 2035 + }, + { + "epoch": 2.277404921700224, + "grad_norm": 0.5515317320823669, + "learning_rate": 3.508838971737981e-06, + "loss": 0.5641, + "step": 2036 + }, + { + "epoch": 2.278523489932886, + "grad_norm": 0.519772469997406, + "learning_rate": 3.5074736496680433e-06, + "loss": 0.5459, + "step": 2037 + }, + { + "epoch": 2.279642058165548, + "grad_norm": 0.5249688625335693, + "learning_rate": 3.5061079687574286e-06, + "loss": 0.5644, + "step": 2038 + }, + { + "epoch": 2.2807606263982105, + "grad_norm": 0.5376660227775574, + "learning_rate": 3.5047419294925632e-06, + "loss": 0.5526, + "step": 2039 + }, + { + "epoch": 2.2818791946308723, + "grad_norm": 0.537214457988739, + "learning_rate": 3.5033755323600017e-06, + "loss": 0.5609, + "step": 2040 + }, + { + "epoch": 2.2829977628635345, + "grad_norm": 0.5479531288146973, + "learning_rate": 3.5020087778464253e-06, + "loss": 0.5631, + "step": 2041 + }, + { + "epoch": 2.2841163310961967, + "grad_norm": 0.5343988537788391, + "learning_rate": 3.500641666438643e-06, + "loss": 0.5477, + "step": 2042 + }, + { + "epoch": 2.285234899328859, + "grad_norm": 0.5822009444236755, + "learning_rate": 3.49927419862359e-06, + "loss": 0.6041, + "step": 2043 + }, + { + "epoch": 2.286353467561521, + "grad_norm": 0.5267564654350281, + "learning_rate": 3.4979063748883314e-06, + "loss": 0.5381, + "step": 2044 + }, + { + "epoch": 2.2874720357941833, + "grad_norm": 0.5210385322570801, + "learning_rate": 3.496538195720055e-06, + "loss": 0.5266, + "step": 2045 + }, + { + "epoch": 2.2885906040268456, + "grad_norm": 0.5286872386932373, + "learning_rate": 3.495169661606077e-06, + "loss": 0.5438, + "step": 2046 + }, + { + "epoch": 2.2897091722595078, + "grad_norm": 0.5459385514259338, + "learning_rate": 3.493800773033841e-06, + "loss": 0.5429, + "step": 2047 + }, + { + "epoch": 2.29082774049217, + "grad_norm": 0.5343181490898132, + "learning_rate": 3.4924315304909175e-06, + "loss": 0.548, + "step": 2048 + }, + { + "epoch": 2.291946308724832, + "grad_norm": 0.5392283201217651, + "learning_rate": 3.4910619344649975e-06, + "loss": 0.5613, + "step": 2049 + }, + { + "epoch": 2.2930648769574944, + "grad_norm": 0.5353871583938599, + "learning_rate": 3.489691985443905e-06, + "loss": 0.5455, + "step": 2050 + }, + { + "epoch": 2.2941834451901566, + "grad_norm": 0.5304256677627563, + "learning_rate": 3.488321683915586e-06, + "loss": 0.5411, + "step": 2051 + }, + { + "epoch": 2.295302013422819, + "grad_norm": 0.5411984324455261, + "learning_rate": 3.486951030368113e-06, + "loss": 0.5442, + "step": 2052 + }, + { + "epoch": 2.296420581655481, + "grad_norm": 0.5262925028800964, + "learning_rate": 3.485580025289683e-06, + "loss": 0.5653, + "step": 2053 + }, + { + "epoch": 2.2975391498881432, + "grad_norm": 0.5307210683822632, + "learning_rate": 3.4842086691686193e-06, + "loss": 0.5493, + "step": 2054 + }, + { + "epoch": 2.2986577181208054, + "grad_norm": 0.5244054794311523, + "learning_rate": 3.4828369624933687e-06, + "loss": 0.524, + "step": 2055 + }, + { + "epoch": 2.2997762863534676, + "grad_norm": 0.5376085638999939, + "learning_rate": 3.4814649057525063e-06, + "loss": 0.576, + "step": 2056 + }, + { + "epoch": 2.30089485458613, + "grad_norm": 0.521187424659729, + "learning_rate": 3.4800924994347273e-06, + "loss": 0.5466, + "step": 2057 + }, + { + "epoch": 2.302013422818792, + "grad_norm": 0.5184813141822815, + "learning_rate": 3.478719744028855e-06, + "loss": 0.5469, + "step": 2058 + }, + { + "epoch": 2.3031319910514543, + "grad_norm": 0.5091737508773804, + "learning_rate": 3.477346640023835e-06, + "loss": 0.5443, + "step": 2059 + }, + { + "epoch": 2.3042505592841165, + "grad_norm": 0.5425874590873718, + "learning_rate": 3.4759731879087373e-06, + "loss": 0.5774, + "step": 2060 + }, + { + "epoch": 2.3053691275167787, + "grad_norm": 0.5415844917297363, + "learning_rate": 3.4745993881727575e-06, + "loss": 0.5527, + "step": 2061 + }, + { + "epoch": 2.306487695749441, + "grad_norm": 0.5313822627067566, + "learning_rate": 3.4732252413052127e-06, + "loss": 0.5515, + "step": 2062 + }, + { + "epoch": 2.3076062639821027, + "grad_norm": 0.5660613179206848, + "learning_rate": 3.4718507477955447e-06, + "loss": 0.5868, + "step": 2063 + }, + { + "epoch": 2.3087248322147653, + "grad_norm": 0.5313783884048462, + "learning_rate": 3.4704759081333208e-06, + "loss": 0.564, + "step": 2064 + }, + { + "epoch": 2.309843400447427, + "grad_norm": 0.5220955014228821, + "learning_rate": 3.469100722808227e-06, + "loss": 0.5554, + "step": 2065 + }, + { + "epoch": 2.3109619686800893, + "grad_norm": 0.5279406309127808, + "learning_rate": 3.4677251923100765e-06, + "loss": 0.5277, + "step": 2066 + }, + { + "epoch": 2.3120805369127515, + "grad_norm": 0.5456680059432983, + "learning_rate": 3.466349317128804e-06, + "loss": 0.5419, + "step": 2067 + }, + { + "epoch": 2.3131991051454137, + "grad_norm": 0.5503676533699036, + "learning_rate": 3.4649730977544666e-06, + "loss": 0.5841, + "step": 2068 + }, + { + "epoch": 2.314317673378076, + "grad_norm": 0.5170747637748718, + "learning_rate": 3.463596534677245e-06, + "loss": 0.524, + "step": 2069 + }, + { + "epoch": 2.315436241610738, + "grad_norm": 0.5500468015670776, + "learning_rate": 3.4622196283874406e-06, + "loss": 0.5864, + "step": 2070 + }, + { + "epoch": 2.3165548098434003, + "grad_norm": 0.5632989406585693, + "learning_rate": 3.460842379375478e-06, + "loss": 0.5662, + "step": 2071 + }, + { + "epoch": 2.3176733780760626, + "grad_norm": 0.5188435912132263, + "learning_rate": 3.4594647881319053e-06, + "loss": 0.5393, + "step": 2072 + }, + { + "epoch": 2.3187919463087248, + "grad_norm": 0.53911292552948, + "learning_rate": 3.458086855147391e-06, + "loss": 0.5544, + "step": 2073 + }, + { + "epoch": 2.319910514541387, + "grad_norm": 0.5565316081047058, + "learning_rate": 3.4567085809127247e-06, + "loss": 0.5363, + "step": 2074 + }, + { + "epoch": 2.321029082774049, + "grad_norm": 0.5365592241287231, + "learning_rate": 3.455329965918819e-06, + "loss": 0.5551, + "step": 2075 + }, + { + "epoch": 2.3221476510067114, + "grad_norm": 0.5494614839553833, + "learning_rate": 3.453951010656707e-06, + "loss": 0.5717, + "step": 2076 + }, + { + "epoch": 2.3232662192393736, + "grad_norm": 0.5462038516998291, + "learning_rate": 3.4525717156175427e-06, + "loss": 0.5775, + "step": 2077 + }, + { + "epoch": 2.324384787472036, + "grad_norm": 0.540730893611908, + "learning_rate": 3.451192081292602e-06, + "loss": 0.5583, + "step": 2078 + }, + { + "epoch": 2.325503355704698, + "grad_norm": 0.5468118190765381, + "learning_rate": 3.4498121081732816e-06, + "loss": 0.5674, + "step": 2079 + }, + { + "epoch": 2.3266219239373602, + "grad_norm": 0.5287937521934509, + "learning_rate": 3.448431796751099e-06, + "loss": 0.5533, + "step": 2080 + }, + { + "epoch": 2.3277404921700224, + "grad_norm": 0.5252012014389038, + "learning_rate": 3.4470511475176908e-06, + "loss": 0.5254, + "step": 2081 + }, + { + "epoch": 2.3288590604026846, + "grad_norm": 0.5183524489402771, + "learning_rate": 3.445670160964815e-06, + "loss": 0.5414, + "step": 2082 + }, + { + "epoch": 2.329977628635347, + "grad_norm": 0.5425177812576294, + "learning_rate": 3.4442888375843497e-06, + "loss": 0.5574, + "step": 2083 + }, + { + "epoch": 2.331096196868009, + "grad_norm": 0.5316156148910522, + "learning_rate": 3.4429071778682932e-06, + "loss": 0.5753, + "step": 2084 + }, + { + "epoch": 2.3322147651006713, + "grad_norm": 0.5115413069725037, + "learning_rate": 3.4415251823087635e-06, + "loss": 0.5378, + "step": 2085 + }, + { + "epoch": 2.3333333333333335, + "grad_norm": 0.5460888147354126, + "learning_rate": 3.4401428513979975e-06, + "loss": 0.5459, + "step": 2086 + }, + { + "epoch": 2.3344519015659957, + "grad_norm": 0.5215389132499695, + "learning_rate": 3.4387601856283514e-06, + "loss": 0.5363, + "step": 2087 + }, + { + "epoch": 2.335570469798658, + "grad_norm": 0.5442741513252258, + "learning_rate": 3.4373771854923032e-06, + "loss": 0.5586, + "step": 2088 + }, + { + "epoch": 2.33668903803132, + "grad_norm": 0.5592746734619141, + "learning_rate": 3.4359938514824463e-06, + "loss": 0.5559, + "step": 2089 + }, + { + "epoch": 2.337807606263982, + "grad_norm": 0.5376172661781311, + "learning_rate": 3.4346101840914963e-06, + "loss": 0.5732, + "step": 2090 + }, + { + "epoch": 2.3389261744966445, + "grad_norm": 0.5412551760673523, + "learning_rate": 3.433226183812285e-06, + "loss": 0.5561, + "step": 2091 + }, + { + "epoch": 2.3400447427293063, + "grad_norm": 0.5335832238197327, + "learning_rate": 3.4318418511377638e-06, + "loss": 0.5494, + "step": 2092 + }, + { + "epoch": 2.3411633109619685, + "grad_norm": 0.5402706265449524, + "learning_rate": 3.430457186561004e-06, + "loss": 0.5778, + "step": 2093 + }, + { + "epoch": 2.3422818791946307, + "grad_norm": 0.5321234464645386, + "learning_rate": 3.4290721905751913e-06, + "loss": 0.5606, + "step": 2094 + }, + { + "epoch": 2.343400447427293, + "grad_norm": 0.5206087231636047, + "learning_rate": 3.427686863673634e-06, + "loss": 0.5388, + "step": 2095 + }, + { + "epoch": 2.344519015659955, + "grad_norm": 0.5416733622550964, + "learning_rate": 3.4263012063497548e-06, + "loss": 0.5808, + "step": 2096 + }, + { + "epoch": 2.3456375838926173, + "grad_norm": 0.5340664386749268, + "learning_rate": 3.4249152190970957e-06, + "loss": 0.5649, + "step": 2097 + }, + { + "epoch": 2.3467561521252795, + "grad_norm": 0.5299456715583801, + "learning_rate": 3.423528902409316e-06, + "loss": 0.5542, + "step": 2098 + }, + { + "epoch": 2.3478747203579418, + "grad_norm": 0.5439330339431763, + "learning_rate": 3.422142256780192e-06, + "loss": 0.5598, + "step": 2099 + }, + { + "epoch": 2.348993288590604, + "grad_norm": 0.5650404691696167, + "learning_rate": 3.4207552827036177e-06, + "loss": 0.5651, + "step": 2100 + }, + { + "epoch": 2.350111856823266, + "grad_norm": 0.549656093120575, + "learning_rate": 3.4193679806736048e-06, + "loss": 0.5692, + "step": 2101 + }, + { + "epoch": 2.3512304250559284, + "grad_norm": 0.5383009314537048, + "learning_rate": 3.4179803511842772e-06, + "loss": 0.5739, + "step": 2102 + }, + { + "epoch": 2.3523489932885906, + "grad_norm": 0.5449522733688354, + "learning_rate": 3.4165923947298825e-06, + "loss": 0.5468, + "step": 2103 + }, + { + "epoch": 2.353467561521253, + "grad_norm": 0.528679609298706, + "learning_rate": 3.4152041118047797e-06, + "loss": 0.5403, + "step": 2104 + }, + { + "epoch": 2.354586129753915, + "grad_norm": 0.5643302202224731, + "learning_rate": 3.4138155029034466e-06, + "loss": 0.5815, + "step": 2105 + }, + { + "epoch": 2.3557046979865772, + "grad_norm": 0.5397313833236694, + "learning_rate": 3.412426568520475e-06, + "loss": 0.5571, + "step": 2106 + }, + { + "epoch": 2.3568232662192394, + "grad_norm": 0.5378561019897461, + "learning_rate": 3.4110373091505743e-06, + "loss": 0.5432, + "step": 2107 + }, + { + "epoch": 2.3579418344519016, + "grad_norm": 0.550048291683197, + "learning_rate": 3.409647725288569e-06, + "loss": 0.5419, + "step": 2108 + }, + { + "epoch": 2.359060402684564, + "grad_norm": 0.5251745581626892, + "learning_rate": 3.4082578174294e-06, + "loss": 0.529, + "step": 2109 + }, + { + "epoch": 2.360178970917226, + "grad_norm": 0.5445367097854614, + "learning_rate": 3.406867586068122e-06, + "loss": 0.5819, + "step": 2110 + }, + { + "epoch": 2.3612975391498883, + "grad_norm": 0.546407163143158, + "learning_rate": 3.405477031699906e-06, + "loss": 0.5627, + "step": 2111 + }, + { + "epoch": 2.3624161073825505, + "grad_norm": 0.5644004344940186, + "learning_rate": 3.404086154820039e-06, + "loss": 0.5544, + "step": 2112 + }, + { + "epoch": 2.3635346756152127, + "grad_norm": 0.5527307987213135, + "learning_rate": 3.40269495592392e-06, + "loss": 0.5547, + "step": 2113 + }, + { + "epoch": 2.364653243847875, + "grad_norm": 0.5552747249603271, + "learning_rate": 3.401303435507065e-06, + "loss": 0.5836, + "step": 2114 + }, + { + "epoch": 2.365771812080537, + "grad_norm": 0.5372108817100525, + "learning_rate": 3.3999115940651044e-06, + "loss": 0.5544, + "step": 2115 + }, + { + "epoch": 2.3668903803131993, + "grad_norm": 0.5257200598716736, + "learning_rate": 3.398519432093782e-06, + "loss": 0.5662, + "step": 2116 + }, + { + "epoch": 2.368008948545861, + "grad_norm": 0.5350222587585449, + "learning_rate": 3.3971269500889577e-06, + "loss": 0.528, + "step": 2117 + }, + { + "epoch": 2.3691275167785237, + "grad_norm": 0.551288902759552, + "learning_rate": 3.3957341485466024e-06, + "loss": 0.5586, + "step": 2118 + }, + { + "epoch": 2.3702460850111855, + "grad_norm": 0.5550601482391357, + "learning_rate": 3.394341027962802e-06, + "loss": 0.5838, + "step": 2119 + }, + { + "epoch": 2.3713646532438477, + "grad_norm": 0.527559220790863, + "learning_rate": 3.392947588833758e-06, + "loss": 0.5288, + "step": 2120 + }, + { + "epoch": 2.37248322147651, + "grad_norm": 0.552585780620575, + "learning_rate": 3.391553831655783e-06, + "loss": 0.5806, + "step": 2121 + }, + { + "epoch": 2.373601789709172, + "grad_norm": 0.5294848680496216, + "learning_rate": 3.3901597569253037e-06, + "loss": 0.5324, + "step": 2122 + }, + { + "epoch": 2.3747203579418343, + "grad_norm": 0.5516006946563721, + "learning_rate": 3.3887653651388597e-06, + "loss": 0.5732, + "step": 2123 + }, + { + "epoch": 2.3758389261744965, + "grad_norm": 0.5479446053504944, + "learning_rate": 3.387370656793103e-06, + "loss": 0.5291, + "step": 2124 + }, + { + "epoch": 2.3769574944071588, + "grad_norm": 0.5469229221343994, + "learning_rate": 3.3859756323848015e-06, + "loss": 0.553, + "step": 2125 + }, + { + "epoch": 2.378076062639821, + "grad_norm": 0.5352817177772522, + "learning_rate": 3.3845802924108303e-06, + "loss": 0.5467, + "step": 2126 + }, + { + "epoch": 2.379194630872483, + "grad_norm": 0.5568694472312927, + "learning_rate": 3.383184637368181e-06, + "loss": 0.5623, + "step": 2127 + }, + { + "epoch": 2.3803131991051454, + "grad_norm": 0.5393657088279724, + "learning_rate": 3.3817886677539563e-06, + "loss": 0.5587, + "step": 2128 + }, + { + "epoch": 2.3814317673378076, + "grad_norm": 0.5407692790031433, + "learning_rate": 3.3803923840653716e-06, + "loss": 0.5686, + "step": 2129 + }, + { + "epoch": 2.38255033557047, + "grad_norm": 0.532427191734314, + "learning_rate": 3.378995786799752e-06, + "loss": 0.5607, + "step": 2130 + }, + { + "epoch": 2.383668903803132, + "grad_norm": 0.5193777680397034, + "learning_rate": 3.3775988764545363e-06, + "loss": 0.5788, + "step": 2131 + }, + { + "epoch": 2.384787472035794, + "grad_norm": 0.546089768409729, + "learning_rate": 3.3762016535272745e-06, + "loss": 0.5633, + "step": 2132 + }, + { + "epoch": 2.3859060402684564, + "grad_norm": 0.5442779064178467, + "learning_rate": 3.374804118515628e-06, + "loss": 0.5692, + "step": 2133 + }, + { + "epoch": 2.3870246085011186, + "grad_norm": 0.5481501221656799, + "learning_rate": 3.3734062719173686e-06, + "loss": 0.5623, + "step": 2134 + }, + { + "epoch": 2.388143176733781, + "grad_norm": 0.5439371466636658, + "learning_rate": 3.3720081142303795e-06, + "loss": 0.5634, + "step": 2135 + }, + { + "epoch": 2.389261744966443, + "grad_norm": 0.5466442704200745, + "learning_rate": 3.370609645952654e-06, + "loss": 0.5327, + "step": 2136 + }, + { + "epoch": 2.3903803131991053, + "grad_norm": 0.5351375937461853, + "learning_rate": 3.3692108675822975e-06, + "loss": 0.5536, + "step": 2137 + }, + { + "epoch": 2.3914988814317675, + "grad_norm": 0.5427590012550354, + "learning_rate": 3.367811779617526e-06, + "loss": 0.561, + "step": 2138 + }, + { + "epoch": 2.3926174496644297, + "grad_norm": 0.5482352375984192, + "learning_rate": 3.366412382556663e-06, + "loss": 0.5506, + "step": 2139 + }, + { + "epoch": 2.393736017897092, + "grad_norm": 0.552746057510376, + "learning_rate": 3.3650126768981452e-06, + "loss": 0.5463, + "step": 2140 + }, + { + "epoch": 2.394854586129754, + "grad_norm": 0.527290940284729, + "learning_rate": 3.363612663140518e-06, + "loss": 0.536, + "step": 2141 + }, + { + "epoch": 2.395973154362416, + "grad_norm": 0.5244056582450867, + "learning_rate": 3.362212341782435e-06, + "loss": 0.5464, + "step": 2142 + }, + { + "epoch": 2.3970917225950785, + "grad_norm": 0.5322519540786743, + "learning_rate": 3.3608117133226638e-06, + "loss": 0.5425, + "step": 2143 + }, + { + "epoch": 2.3982102908277403, + "grad_norm": 0.5500986576080322, + "learning_rate": 3.3594107782600754e-06, + "loss": 0.5512, + "step": 2144 + }, + { + "epoch": 2.3993288590604025, + "grad_norm": 0.5288020968437195, + "learning_rate": 3.3580095370936544e-06, + "loss": 0.5348, + "step": 2145 + }, + { + "epoch": 2.4004474272930647, + "grad_norm": 0.5364000797271729, + "learning_rate": 3.3566079903224936e-06, + "loss": 0.5415, + "step": 2146 + }, + { + "epoch": 2.401565995525727, + "grad_norm": 0.5493627190589905, + "learning_rate": 3.3552061384457927e-06, + "loss": 0.5628, + "step": 2147 + }, + { + "epoch": 2.402684563758389, + "grad_norm": 0.556971549987793, + "learning_rate": 3.3538039819628625e-06, + "loss": 0.5881, + "step": 2148 + }, + { + "epoch": 2.4038031319910513, + "grad_norm": 0.5485417246818542, + "learning_rate": 3.3524015213731216e-06, + "loss": 0.5954, + "step": 2149 + }, + { + "epoch": 2.4049217002237135, + "grad_norm": 0.5298888683319092, + "learning_rate": 3.350998757176096e-06, + "loss": 0.5521, + "step": 2150 + }, + { + "epoch": 2.4060402684563758, + "grad_norm": 0.5447022914886475, + "learning_rate": 3.349595689871421e-06, + "loss": 0.5481, + "step": 2151 + }, + { + "epoch": 2.407158836689038, + "grad_norm": 0.5325900912284851, + "learning_rate": 3.3481923199588385e-06, + "loss": 0.5631, + "step": 2152 + }, + { + "epoch": 2.4082774049217, + "grad_norm": 0.546597421169281, + "learning_rate": 3.3467886479381994e-06, + "loss": 0.5688, + "step": 2153 + }, + { + "epoch": 2.4093959731543624, + "grad_norm": 0.5312037467956543, + "learning_rate": 3.345384674309463e-06, + "loss": 0.5426, + "step": 2154 + }, + { + "epoch": 2.4105145413870246, + "grad_norm": 0.5359411835670471, + "learning_rate": 3.3439803995726928e-06, + "loss": 0.5565, + "step": 2155 + }, + { + "epoch": 2.411633109619687, + "grad_norm": 0.525406002998352, + "learning_rate": 3.342575824228062e-06, + "loss": 0.5433, + "step": 2156 + }, + { + "epoch": 2.412751677852349, + "grad_norm": 0.5494353175163269, + "learning_rate": 3.3411709487758516e-06, + "loss": 0.5557, + "step": 2157 + }, + { + "epoch": 2.413870246085011, + "grad_norm": 0.5250975489616394, + "learning_rate": 3.339765773716448e-06, + "loss": 0.5621, + "step": 2158 + }, + { + "epoch": 2.4149888143176734, + "grad_norm": 0.5497077703475952, + "learning_rate": 3.3383602995503433e-06, + "loss": 0.5433, + "step": 2159 + }, + { + "epoch": 2.4161073825503356, + "grad_norm": 0.5389668941497803, + "learning_rate": 3.336954526778139e-06, + "loss": 0.5752, + "step": 2160 + }, + { + "epoch": 2.417225950782998, + "grad_norm": 0.5405676960945129, + "learning_rate": 3.3355484559005415e-06, + "loss": 0.556, + "step": 2161 + }, + { + "epoch": 2.41834451901566, + "grad_norm": 0.5262788534164429, + "learning_rate": 3.334142087418362e-06, + "loss": 0.5439, + "step": 2162 + }, + { + "epoch": 2.4194630872483223, + "grad_norm": 0.5337355732917786, + "learning_rate": 3.3327354218325202e-06, + "loss": 0.5713, + "step": 2163 + }, + { + "epoch": 2.4205816554809845, + "grad_norm": 0.5495365858078003, + "learning_rate": 3.33132845964404e-06, + "loss": 0.5578, + "step": 2164 + }, + { + "epoch": 2.4217002237136467, + "grad_norm": 0.5273889303207397, + "learning_rate": 3.3299212013540524e-06, + "loss": 0.5351, + "step": 2165 + }, + { + "epoch": 2.422818791946309, + "grad_norm": 0.5490871667861938, + "learning_rate": 3.3285136474637912e-06, + "loss": 0.5483, + "step": 2166 + }, + { + "epoch": 2.423937360178971, + "grad_norm": 0.5483438372612, + "learning_rate": 3.327105798474598e-06, + "loss": 0.5665, + "step": 2167 + }, + { + "epoch": 2.4250559284116333, + "grad_norm": 0.5356787443161011, + "learning_rate": 3.3256976548879183e-06, + "loss": 0.5754, + "step": 2168 + }, + { + "epoch": 2.426174496644295, + "grad_norm": 0.5396536588668823, + "learning_rate": 3.3242892172053033e-06, + "loss": 0.5415, + "step": 2169 + }, + { + "epoch": 2.4272930648769577, + "grad_norm": 0.5232876539230347, + "learning_rate": 3.322880485928408e-06, + "loss": 0.5562, + "step": 2170 + }, + { + "epoch": 2.4284116331096195, + "grad_norm": 0.5494614243507385, + "learning_rate": 3.321471461558993e-06, + "loss": 0.542, + "step": 2171 + }, + { + "epoch": 2.4295302013422817, + "grad_norm": 0.551078736782074, + "learning_rate": 3.3200621445989227e-06, + "loss": 0.5597, + "step": 2172 + }, + { + "epoch": 2.430648769574944, + "grad_norm": 0.5445325374603271, + "learning_rate": 3.318652535550166e-06, + "loss": 0.5832, + "step": 2173 + }, + { + "epoch": 2.431767337807606, + "grad_norm": 0.5362185835838318, + "learning_rate": 3.3172426349147956e-06, + "loss": 0.5247, + "step": 2174 + }, + { + "epoch": 2.4328859060402683, + "grad_norm": 0.5543274879455566, + "learning_rate": 3.3158324431949884e-06, + "loss": 0.5637, + "step": 2175 + }, + { + "epoch": 2.4340044742729305, + "grad_norm": 0.5588807463645935, + "learning_rate": 3.3144219608930226e-06, + "loss": 0.5607, + "step": 2176 + }, + { + "epoch": 2.4351230425055927, + "grad_norm": 0.5348725318908691, + "learning_rate": 3.313011188511285e-06, + "loss": 0.5502, + "step": 2177 + }, + { + "epoch": 2.436241610738255, + "grad_norm": 0.5367739796638489, + "learning_rate": 3.3116001265522613e-06, + "loss": 0.5324, + "step": 2178 + }, + { + "epoch": 2.437360178970917, + "grad_norm": 0.5421575903892517, + "learning_rate": 3.310188775518541e-06, + "loss": 0.5286, + "step": 2179 + }, + { + "epoch": 2.4384787472035794, + "grad_norm": 0.540189802646637, + "learning_rate": 3.308777135912818e-06, + "loss": 0.5408, + "step": 2180 + }, + { + "epoch": 2.4395973154362416, + "grad_norm": 0.5315021872520447, + "learning_rate": 3.3073652082378887e-06, + "loss": 0.5536, + "step": 2181 + }, + { + "epoch": 2.440715883668904, + "grad_norm": 0.531907856464386, + "learning_rate": 3.3059529929966515e-06, + "loss": 0.5364, + "step": 2182 + }, + { + "epoch": 2.441834451901566, + "grad_norm": 0.5365740060806274, + "learning_rate": 3.304540490692107e-06, + "loss": 0.533, + "step": 2183 + }, + { + "epoch": 2.442953020134228, + "grad_norm": 0.5299287438392639, + "learning_rate": 3.3031277018273587e-06, + "loss": 0.5486, + "step": 2184 + }, + { + "epoch": 2.4440715883668904, + "grad_norm": 0.5473013520240784, + "learning_rate": 3.301714626905612e-06, + "loss": 0.5426, + "step": 2185 + }, + { + "epoch": 2.4451901565995526, + "grad_norm": 0.5398786664009094, + "learning_rate": 3.3003012664301747e-06, + "loss": 0.5509, + "step": 2186 + }, + { + "epoch": 2.446308724832215, + "grad_norm": 0.526070237159729, + "learning_rate": 3.2988876209044544e-06, + "loss": 0.5354, + "step": 2187 + }, + { + "epoch": 2.447427293064877, + "grad_norm": 0.5647167563438416, + "learning_rate": 3.297473690831963e-06, + "loss": 0.5304, + "step": 2188 + }, + { + "epoch": 2.4485458612975393, + "grad_norm": 0.5475811958312988, + "learning_rate": 3.296059476716311e-06, + "loss": 0.5725, + "step": 2189 + }, + { + "epoch": 2.4496644295302015, + "grad_norm": 0.5441768765449524, + "learning_rate": 3.2946449790612117e-06, + "loss": 0.5541, + "step": 2190 + }, + { + "epoch": 2.4507829977628637, + "grad_norm": 0.5527596473693848, + "learning_rate": 3.2932301983704797e-06, + "loss": 0.5697, + "step": 2191 + }, + { + "epoch": 2.451901565995526, + "grad_norm": 0.5591815114021301, + "learning_rate": 3.2918151351480287e-06, + "loss": 0.5803, + "step": 2192 + }, + { + "epoch": 2.453020134228188, + "grad_norm": 0.5628243088722229, + "learning_rate": 3.2903997898978755e-06, + "loss": 0.5587, + "step": 2193 + }, + { + "epoch": 2.45413870246085, + "grad_norm": 0.5457016229629517, + "learning_rate": 3.288984163124136e-06, + "loss": 0.5315, + "step": 2194 + }, + { + "epoch": 2.4552572706935125, + "grad_norm": 0.5472298860549927, + "learning_rate": 3.287568255331024e-06, + "loss": 0.553, + "step": 2195 + }, + { + "epoch": 2.4563758389261743, + "grad_norm": 0.5432956218719482, + "learning_rate": 3.2861520670228587e-06, + "loss": 0.5612, + "step": 2196 + }, + { + "epoch": 2.457494407158837, + "grad_norm": 0.5295101404190063, + "learning_rate": 3.284735598704054e-06, + "loss": 0.5323, + "step": 2197 + }, + { + "epoch": 2.4586129753914987, + "grad_norm": 0.5284572243690491, + "learning_rate": 3.283318850879127e-06, + "loss": 0.5529, + "step": 2198 + }, + { + "epoch": 2.459731543624161, + "grad_norm": 0.564844012260437, + "learning_rate": 3.2819018240526936e-06, + "loss": 0.5746, + "step": 2199 + }, + { + "epoch": 2.460850111856823, + "grad_norm": 0.5408284068107605, + "learning_rate": 3.2804845187294666e-06, + "loss": 0.578, + "step": 2200 + }, + { + "epoch": 2.4619686800894853, + "grad_norm": 0.5329684615135193, + "learning_rate": 3.279066935414261e-06, + "loss": 0.5618, + "step": 2201 + }, + { + "epoch": 2.4630872483221475, + "grad_norm": 0.5477270483970642, + "learning_rate": 3.277649074611991e-06, + "loss": 0.57, + "step": 2202 + }, + { + "epoch": 2.4642058165548097, + "grad_norm": 0.5396099090576172, + "learning_rate": 3.276230936827667e-06, + "loss": 0.5623, + "step": 2203 + }, + { + "epoch": 2.465324384787472, + "grad_norm": 0.5381316542625427, + "learning_rate": 3.2748125225663998e-06, + "loss": 0.5473, + "step": 2204 + }, + { + "epoch": 2.466442953020134, + "grad_norm": 0.5450034141540527, + "learning_rate": 3.273393832333398e-06, + "loss": 0.5671, + "step": 2205 + }, + { + "epoch": 2.4675615212527964, + "grad_norm": 0.5568398833274841, + "learning_rate": 3.27197486663397e-06, + "loss": 0.5754, + "step": 2206 + }, + { + "epoch": 2.4686800894854586, + "grad_norm": 0.5468841195106506, + "learning_rate": 3.2705556259735216e-06, + "loss": 0.5622, + "step": 2207 + }, + { + "epoch": 2.469798657718121, + "grad_norm": 0.5455905795097351, + "learning_rate": 3.2691361108575536e-06, + "loss": 0.5383, + "step": 2208 + }, + { + "epoch": 2.470917225950783, + "grad_norm": 0.5542500615119934, + "learning_rate": 3.267716321791669e-06, + "loss": 0.5585, + "step": 2209 + }, + { + "epoch": 2.472035794183445, + "grad_norm": 0.5463401675224304, + "learning_rate": 3.266296259281566e-06, + "loss": 0.534, + "step": 2210 + }, + { + "epoch": 2.4731543624161074, + "grad_norm": 0.559845507144928, + "learning_rate": 3.264875923833041e-06, + "loss": 0.56, + "step": 2211 + }, + { + "epoch": 2.4742729306487696, + "grad_norm": 0.5235906839370728, + "learning_rate": 3.2634553159519867e-06, + "loss": 0.5341, + "step": 2212 + }, + { + "epoch": 2.475391498881432, + "grad_norm": 0.5544738173484802, + "learning_rate": 3.2620344361443935e-06, + "loss": 0.5536, + "step": 2213 + }, + { + "epoch": 2.476510067114094, + "grad_norm": 0.5318319201469421, + "learning_rate": 3.2606132849163495e-06, + "loss": 0.5366, + "step": 2214 + }, + { + "epoch": 2.4776286353467563, + "grad_norm": 0.5351746082305908, + "learning_rate": 3.259191862774037e-06, + "loss": 0.5495, + "step": 2215 + }, + { + "epoch": 2.4787472035794185, + "grad_norm": 0.5455824732780457, + "learning_rate": 3.2577701702237374e-06, + "loss": 0.593, + "step": 2216 + }, + { + "epoch": 2.4798657718120807, + "grad_norm": 0.5470051765441895, + "learning_rate": 3.2563482077718267e-06, + "loss": 0.5519, + "step": 2217 + }, + { + "epoch": 2.480984340044743, + "grad_norm": 0.5384635925292969, + "learning_rate": 3.2549259759247786e-06, + "loss": 0.5496, + "step": 2218 + }, + { + "epoch": 2.482102908277405, + "grad_norm": 0.5520777702331543, + "learning_rate": 3.2535034751891614e-06, + "loss": 0.5537, + "step": 2219 + }, + { + "epoch": 2.4832214765100673, + "grad_norm": 0.5559943914413452, + "learning_rate": 3.2520807060716385e-06, + "loss": 0.5591, + "step": 2220 + }, + { + "epoch": 2.484340044742729, + "grad_norm": 0.5731797814369202, + "learning_rate": 3.250657669078971e-06, + "loss": 0.5595, + "step": 2221 + }, + { + "epoch": 2.4854586129753917, + "grad_norm": 0.5644947290420532, + "learning_rate": 3.249234364718014e-06, + "loss": 0.56, + "step": 2222 + }, + { + "epoch": 2.4865771812080535, + "grad_norm": 0.5595505237579346, + "learning_rate": 3.2478107934957194e-06, + "loss": 0.5576, + "step": 2223 + }, + { + "epoch": 2.4876957494407157, + "grad_norm": 0.5390176177024841, + "learning_rate": 3.2463869559191313e-06, + "loss": 0.5585, + "step": 2224 + }, + { + "epoch": 2.488814317673378, + "grad_norm": 0.550504744052887, + "learning_rate": 3.2449628524953912e-06, + "loss": 0.5891, + "step": 2225 + }, + { + "epoch": 2.48993288590604, + "grad_norm": 0.5437968969345093, + "learning_rate": 3.2435384837317347e-06, + "loss": 0.5636, + "step": 2226 + }, + { + "epoch": 2.4910514541387023, + "grad_norm": 0.5517329573631287, + "learning_rate": 3.2421138501354917e-06, + "loss": 0.5543, + "step": 2227 + }, + { + "epoch": 2.4921700223713645, + "grad_norm": 0.5422149896621704, + "learning_rate": 3.2406889522140854e-06, + "loss": 0.5639, + "step": 2228 + }, + { + "epoch": 2.4932885906040267, + "grad_norm": 0.5669721364974976, + "learning_rate": 3.2392637904750345e-06, + "loss": 0.5395, + "step": 2229 + }, + { + "epoch": 2.494407158836689, + "grad_norm": 0.5430675745010376, + "learning_rate": 3.237838365425952e-06, + "loss": 0.5743, + "step": 2230 + }, + { + "epoch": 2.495525727069351, + "grad_norm": 0.5475093722343445, + "learning_rate": 3.2364126775745435e-06, + "loss": 0.5667, + "step": 2231 + }, + { + "epoch": 2.4966442953020134, + "grad_norm": 0.5344734787940979, + "learning_rate": 3.2349867274286086e-06, + "loss": 0.5703, + "step": 2232 + }, + { + "epoch": 2.4977628635346756, + "grad_norm": 0.547351598739624, + "learning_rate": 3.2335605154960397e-06, + "loss": 0.5574, + "step": 2233 + }, + { + "epoch": 2.498881431767338, + "grad_norm": 0.5306218266487122, + "learning_rate": 3.2321340422848245e-06, + "loss": 0.5475, + "step": 2234 + }, + { + "epoch": 2.5, + "grad_norm": 0.543672502040863, + "learning_rate": 3.2307073083030414e-06, + "loss": 0.5562, + "step": 2235 + }, + { + "epoch": 2.501118568232662, + "grad_norm": 0.5413288474082947, + "learning_rate": 3.2292803140588634e-06, + "loss": 0.539, + "step": 2236 + }, + { + "epoch": 2.5022371364653244, + "grad_norm": 0.5387410521507263, + "learning_rate": 3.2278530600605553e-06, + "loss": 0.5399, + "step": 2237 + }, + { + "epoch": 2.5033557046979866, + "grad_norm": 0.5486075282096863, + "learning_rate": 3.226425546816474e-06, + "loss": 0.5592, + "step": 2238 + }, + { + "epoch": 2.504474272930649, + "grad_norm": 0.5441234707832336, + "learning_rate": 3.2249977748350714e-06, + "loss": 0.5724, + "step": 2239 + }, + { + "epoch": 2.505592841163311, + "grad_norm": 0.5390037298202515, + "learning_rate": 3.2235697446248857e-06, + "loss": 0.5496, + "step": 2240 + }, + { + "epoch": 2.5067114093959733, + "grad_norm": 0.5360434055328369, + "learning_rate": 3.222141456694554e-06, + "loss": 0.5435, + "step": 2241 + }, + { + "epoch": 2.5078299776286355, + "grad_norm": 0.5397891998291016, + "learning_rate": 3.2207129115528015e-06, + "loss": 0.5665, + "step": 2242 + }, + { + "epoch": 2.5089485458612977, + "grad_norm": 0.5250577926635742, + "learning_rate": 3.219284109708445e-06, + "loss": 0.5685, + "step": 2243 + }, + { + "epoch": 2.51006711409396, + "grad_norm": 0.5334742069244385, + "learning_rate": 3.2178550516703934e-06, + "loss": 0.5545, + "step": 2244 + }, + { + "epoch": 2.511185682326622, + "grad_norm": 0.5596793293952942, + "learning_rate": 3.2164257379476472e-06, + "loss": 0.5489, + "step": 2245 + }, + { + "epoch": 2.512304250559284, + "grad_norm": 0.5434040427207947, + "learning_rate": 3.2149961690492966e-06, + "loss": 0.5578, + "step": 2246 + }, + { + "epoch": 2.5134228187919465, + "grad_norm": 0.5347144603729248, + "learning_rate": 3.213566345484525e-06, + "loss": 0.5612, + "step": 2247 + }, + { + "epoch": 2.5145413870246083, + "grad_norm": 0.5433787107467651, + "learning_rate": 3.2121362677626037e-06, + "loss": 0.5798, + "step": 2248 + }, + { + "epoch": 2.515659955257271, + "grad_norm": 0.5426890254020691, + "learning_rate": 3.210705936392897e-06, + "loss": 0.5464, + "step": 2249 + }, + { + "epoch": 2.5167785234899327, + "grad_norm": 0.5374168753623962, + "learning_rate": 3.2092753518848584e-06, + "loss": 0.5576, + "step": 2250 + }, + { + "epoch": 2.5178970917225953, + "grad_norm": 0.5380726456642151, + "learning_rate": 3.207844514748032e-06, + "loss": 0.5857, + "step": 2251 + }, + { + "epoch": 2.519015659955257, + "grad_norm": 0.5299219489097595, + "learning_rate": 3.2064134254920497e-06, + "loss": 0.5527, + "step": 2252 + }, + { + "epoch": 2.5201342281879193, + "grad_norm": 0.5601388216018677, + "learning_rate": 3.204982084626637e-06, + "loss": 0.599, + "step": 2253 + }, + { + "epoch": 2.5212527964205815, + "grad_norm": 0.5510305166244507, + "learning_rate": 3.203550492661606e-06, + "loss": 0.5832, + "step": 2254 + }, + { + "epoch": 2.5223713646532437, + "grad_norm": 0.5395695567131042, + "learning_rate": 3.2021186501068603e-06, + "loss": 0.5701, + "step": 2255 + }, + { + "epoch": 2.523489932885906, + "grad_norm": 0.5172858834266663, + "learning_rate": 3.2006865574723907e-06, + "loss": 0.5543, + "step": 2256 + }, + { + "epoch": 2.524608501118568, + "grad_norm": 0.5478967428207397, + "learning_rate": 3.1992542152682783e-06, + "loss": 0.585, + "step": 2257 + }, + { + "epoch": 2.5257270693512304, + "grad_norm": 0.5554324388504028, + "learning_rate": 3.197821624004693e-06, + "loss": 0.5749, + "step": 2258 + }, + { + "epoch": 2.5268456375838926, + "grad_norm": 0.5577069520950317, + "learning_rate": 3.1963887841918944e-06, + "loss": 0.5601, + "step": 2259 + }, + { + "epoch": 2.527964205816555, + "grad_norm": 0.5465332269668579, + "learning_rate": 3.1949556963402283e-06, + "loss": 0.5795, + "step": 2260 + }, + { + "epoch": 2.529082774049217, + "grad_norm": 0.5358637571334839, + "learning_rate": 3.1935223609601303e-06, + "loss": 0.5465, + "step": 2261 + }, + { + "epoch": 2.530201342281879, + "grad_norm": 0.5596712827682495, + "learning_rate": 3.1920887785621233e-06, + "loss": 0.5773, + "step": 2262 + }, + { + "epoch": 2.5313199105145414, + "grad_norm": 0.5511174201965332, + "learning_rate": 3.190654949656821e-06, + "loss": 0.5492, + "step": 2263 + }, + { + "epoch": 2.5324384787472036, + "grad_norm": 0.5387730598449707, + "learning_rate": 3.1892208747549207e-06, + "loss": 0.5421, + "step": 2264 + }, + { + "epoch": 2.533557046979866, + "grad_norm": 0.5272213220596313, + "learning_rate": 3.1877865543672103e-06, + "loss": 0.5495, + "step": 2265 + }, + { + "epoch": 2.534675615212528, + "grad_norm": 0.5567978620529175, + "learning_rate": 3.1863519890045637e-06, + "loss": 0.5614, + "step": 2266 + }, + { + "epoch": 2.5357941834451903, + "grad_norm": 0.5389889478683472, + "learning_rate": 3.1849171791779434e-06, + "loss": 0.5436, + "step": 2267 + }, + { + "epoch": 2.5369127516778525, + "grad_norm": 0.542368471622467, + "learning_rate": 3.183482125398397e-06, + "loss": 0.5406, + "step": 2268 + }, + { + "epoch": 2.5380313199105147, + "grad_norm": 0.5602286458015442, + "learning_rate": 3.18204682817706e-06, + "loss": 0.5657, + "step": 2269 + }, + { + "epoch": 2.539149888143177, + "grad_norm": 0.542411208152771, + "learning_rate": 3.180611288025156e-06, + "loss": 0.5901, + "step": 2270 + }, + { + "epoch": 2.540268456375839, + "grad_norm": 0.5458567142486572, + "learning_rate": 3.179175505453994e-06, + "loss": 0.5482, + "step": 2271 + }, + { + "epoch": 2.5413870246085013, + "grad_norm": 0.536913275718689, + "learning_rate": 3.1777394809749677e-06, + "loss": 0.5537, + "step": 2272 + }, + { + "epoch": 2.542505592841163, + "grad_norm": 0.5402929782867432, + "learning_rate": 3.1763032150995597e-06, + "loss": 0.5677, + "step": 2273 + }, + { + "epoch": 2.5436241610738257, + "grad_norm": 0.5390001535415649, + "learning_rate": 3.174866708339336e-06, + "loss": 0.572, + "step": 2274 + }, + { + "epoch": 2.5447427293064875, + "grad_norm": 0.5416916608810425, + "learning_rate": 3.173429961205951e-06, + "loss": 0.5579, + "step": 2275 + }, + { + "epoch": 2.54586129753915, + "grad_norm": 0.5629043579101562, + "learning_rate": 3.1719929742111437e-06, + "loss": 0.5516, + "step": 2276 + }, + { + "epoch": 2.546979865771812, + "grad_norm": 0.5454716086387634, + "learning_rate": 3.170555747866737e-06, + "loss": 0.5886, + "step": 2277 + }, + { + "epoch": 2.548098434004474, + "grad_norm": 0.522717297077179, + "learning_rate": 3.169118282684641e-06, + "loss": 0.5353, + "step": 2278 + }, + { + "epoch": 2.5492170022371363, + "grad_norm": 0.5467501282691956, + "learning_rate": 3.1676805791768505e-06, + "loss": 0.5351, + "step": 2279 + }, + { + "epoch": 2.5503355704697985, + "grad_norm": 0.556818962097168, + "learning_rate": 3.1662426378554447e-06, + "loss": 0.5598, + "step": 2280 + }, + { + "epoch": 2.5514541387024607, + "grad_norm": 0.5304411053657532, + "learning_rate": 3.1648044592325876e-06, + "loss": 0.5497, + "step": 2281 + }, + { + "epoch": 2.552572706935123, + "grad_norm": 0.5290155410766602, + "learning_rate": 3.163366043820527e-06, + "loss": 0.5405, + "step": 2282 + }, + { + "epoch": 2.553691275167785, + "grad_norm": 0.545217752456665, + "learning_rate": 3.1619273921315976e-06, + "loss": 0.5627, + "step": 2283 + }, + { + "epoch": 2.5548098434004474, + "grad_norm": 0.5452941060066223, + "learning_rate": 3.1604885046782158e-06, + "loss": 0.5593, + "step": 2284 + }, + { + "epoch": 2.5559284116331096, + "grad_norm": 0.5433606505393982, + "learning_rate": 3.1590493819728818e-06, + "loss": 0.5701, + "step": 2285 + }, + { + "epoch": 2.557046979865772, + "grad_norm": 0.54730623960495, + "learning_rate": 3.1576100245281814e-06, + "loss": 0.5672, + "step": 2286 + }, + { + "epoch": 2.558165548098434, + "grad_norm": 0.5558343529701233, + "learning_rate": 3.1561704328567826e-06, + "loss": 0.5755, + "step": 2287 + }, + { + "epoch": 2.559284116331096, + "grad_norm": 0.5393132567405701, + "learning_rate": 3.1547306074714384e-06, + "loss": 0.5619, + "step": 2288 + }, + { + "epoch": 2.5604026845637584, + "grad_norm": 0.5464217662811279, + "learning_rate": 3.1532905488849825e-06, + "loss": 0.5749, + "step": 2289 + }, + { + "epoch": 2.5615212527964206, + "grad_norm": 0.5325460433959961, + "learning_rate": 3.151850257610334e-06, + "loss": 0.5543, + "step": 2290 + }, + { + "epoch": 2.562639821029083, + "grad_norm": 0.5297083258628845, + "learning_rate": 3.150409734160494e-06, + "loss": 0.5554, + "step": 2291 + }, + { + "epoch": 2.563758389261745, + "grad_norm": 0.5426949262619019, + "learning_rate": 3.148968979048547e-06, + "loss": 0.5423, + "step": 2292 + }, + { + "epoch": 2.5648769574944073, + "grad_norm": 0.5461225509643555, + "learning_rate": 3.1475279927876566e-06, + "loss": 0.5579, + "step": 2293 + }, + { + "epoch": 2.5659955257270695, + "grad_norm": 0.55253005027771, + "learning_rate": 3.146086775891074e-06, + "loss": 0.5584, + "step": 2294 + }, + { + "epoch": 2.5671140939597317, + "grad_norm": 0.5452852249145508, + "learning_rate": 3.144645328872129e-06, + "loss": 0.564, + "step": 2295 + }, + { + "epoch": 2.568232662192394, + "grad_norm": 0.5574961304664612, + "learning_rate": 3.1432036522442355e-06, + "loss": 0.5579, + "step": 2296 + }, + { + "epoch": 2.569351230425056, + "grad_norm": 0.5384403467178345, + "learning_rate": 3.141761746520886e-06, + "loss": 0.5572, + "step": 2297 + }, + { + "epoch": 2.570469798657718, + "grad_norm": 0.5540268421173096, + "learning_rate": 3.1403196122156586e-06, + "loss": 0.5563, + "step": 2298 + }, + { + "epoch": 2.5715883668903805, + "grad_norm": 0.550927460193634, + "learning_rate": 3.138877249842209e-06, + "loss": 0.5746, + "step": 2299 + }, + { + "epoch": 2.5727069351230423, + "grad_norm": 0.5478324294090271, + "learning_rate": 3.137434659914279e-06, + "loss": 0.5494, + "step": 2300 + }, + { + "epoch": 2.573825503355705, + "grad_norm": 0.5424575209617615, + "learning_rate": 3.1359918429456847e-06, + "loss": 0.5516, + "step": 2301 + }, + { + "epoch": 2.5749440715883667, + "grad_norm": 0.5427699089050293, + "learning_rate": 3.134548799450329e-06, + "loss": 0.5359, + "step": 2302 + }, + { + "epoch": 2.5760626398210293, + "grad_norm": 0.5260825157165527, + "learning_rate": 3.1331055299421943e-06, + "loss": 0.5079, + "step": 2303 + }, + { + "epoch": 2.577181208053691, + "grad_norm": 0.5435081124305725, + "learning_rate": 3.13166203493534e-06, + "loss": 0.578, + "step": 2304 + }, + { + "epoch": 2.5782997762863533, + "grad_norm": 0.5507857799530029, + "learning_rate": 3.1302183149439092e-06, + "loss": 0.5658, + "step": 2305 + }, + { + "epoch": 2.5794183445190155, + "grad_norm": 0.5269414782524109, + "learning_rate": 3.128774370482124e-06, + "loss": 0.558, + "step": 2306 + }, + { + "epoch": 2.5805369127516777, + "grad_norm": 0.5590683221817017, + "learning_rate": 3.1273302020642874e-06, + "loss": 0.5616, + "step": 2307 + }, + { + "epoch": 2.58165548098434, + "grad_norm": 0.5686662793159485, + "learning_rate": 3.1258858102047816e-06, + "loss": 0.5816, + "step": 2308 + }, + { + "epoch": 2.582774049217002, + "grad_norm": 0.5508021116256714, + "learning_rate": 3.1244411954180677e-06, + "loss": 0.5677, + "step": 2309 + }, + { + "epoch": 2.5838926174496644, + "grad_norm": 0.5285972952842712, + "learning_rate": 3.1229963582186862e-06, + "loss": 0.5481, + "step": 2310 + }, + { + "epoch": 2.5850111856823266, + "grad_norm": 0.5422084331512451, + "learning_rate": 3.1215512991212582e-06, + "loss": 0.5598, + "step": 2311 + }, + { + "epoch": 2.586129753914989, + "grad_norm": 0.5737012624740601, + "learning_rate": 3.1201060186404836e-06, + "loss": 0.5643, + "step": 2312 + }, + { + "epoch": 2.587248322147651, + "grad_norm": 0.5503541827201843, + "learning_rate": 3.1186605172911398e-06, + "loss": 0.5774, + "step": 2313 + }, + { + "epoch": 2.588366890380313, + "grad_norm": 0.5465447902679443, + "learning_rate": 3.1172147955880827e-06, + "loss": 0.5722, + "step": 2314 + }, + { + "epoch": 2.5894854586129754, + "grad_norm": 0.5510462522506714, + "learning_rate": 3.115768854046249e-06, + "loss": 0.5771, + "step": 2315 + }, + { + "epoch": 2.5906040268456376, + "grad_norm": 0.5356128811836243, + "learning_rate": 3.114322693180652e-06, + "loss": 0.5558, + "step": 2316 + }, + { + "epoch": 2.5917225950783, + "grad_norm": 0.560545802116394, + "learning_rate": 3.112876313506383e-06, + "loss": 0.559, + "step": 2317 + }, + { + "epoch": 2.592841163310962, + "grad_norm": 0.5442909598350525, + "learning_rate": 3.111429715538611e-06, + "loss": 0.5624, + "step": 2318 + }, + { + "epoch": 2.5939597315436242, + "grad_norm": 0.5462409257888794, + "learning_rate": 3.109982899792586e-06, + "loss": 0.5598, + "step": 2319 + }, + { + "epoch": 2.5950782997762865, + "grad_norm": 0.5364828109741211, + "learning_rate": 3.1085358667836307e-06, + "loss": 0.5689, + "step": 2320 + }, + { + "epoch": 2.5961968680089487, + "grad_norm": 0.5461208820343018, + "learning_rate": 3.1070886170271474e-06, + "loss": 0.5562, + "step": 2321 + }, + { + "epoch": 2.597315436241611, + "grad_norm": 0.5747359395027161, + "learning_rate": 3.105641151038617e-06, + "loss": 0.57, + "step": 2322 + }, + { + "epoch": 2.598434004474273, + "grad_norm": 0.5458117127418518, + "learning_rate": 3.1041934693335952e-06, + "loss": 0.555, + "step": 2323 + }, + { + "epoch": 2.5995525727069353, + "grad_norm": 0.5567789077758789, + "learning_rate": 3.102745572427716e-06, + "loss": 0.5172, + "step": 2324 + }, + { + "epoch": 2.600671140939597, + "grad_norm": 0.5759881138801575, + "learning_rate": 3.101297460836689e-06, + "loss": 0.5634, + "step": 2325 + }, + { + "epoch": 2.6017897091722597, + "grad_norm": 0.5526554584503174, + "learning_rate": 3.099849135076301e-06, + "loss": 0.5375, + "step": 2326 + }, + { + "epoch": 2.6029082774049215, + "grad_norm": 0.5458359122276306, + "learning_rate": 3.098400595662414e-06, + "loss": 0.5342, + "step": 2327 + }, + { + "epoch": 2.604026845637584, + "grad_norm": 0.5594093799591064, + "learning_rate": 3.0969518431109686e-06, + "loss": 0.5655, + "step": 2328 + }, + { + "epoch": 2.605145413870246, + "grad_norm": 0.5587557554244995, + "learning_rate": 3.095502877937978e-06, + "loss": 0.5626, + "step": 2329 + }, + { + "epoch": 2.6062639821029085, + "grad_norm": 0.5606288313865662, + "learning_rate": 3.094053700659533e-06, + "loss": 0.5824, + "step": 2330 + }, + { + "epoch": 2.6073825503355703, + "grad_norm": 0.5440497994422913, + "learning_rate": 3.0926043117918004e-06, + "loss": 0.5403, + "step": 2331 + }, + { + "epoch": 2.6085011185682325, + "grad_norm": 0.5627294182777405, + "learning_rate": 3.091154711851022e-06, + "loss": 0.5999, + "step": 2332 + }, + { + "epoch": 2.6096196868008947, + "grad_norm": 0.5549243688583374, + "learning_rate": 3.089704901353513e-06, + "loss": 0.5542, + "step": 2333 + }, + { + "epoch": 2.610738255033557, + "grad_norm": 0.5360384583473206, + "learning_rate": 3.0882548808156664e-06, + "loss": 0.5455, + "step": 2334 + }, + { + "epoch": 2.611856823266219, + "grad_norm": 0.532645046710968, + "learning_rate": 3.0868046507539482e-06, + "loss": 0.5279, + "step": 2335 + }, + { + "epoch": 2.6129753914988814, + "grad_norm": 0.5489477515220642, + "learning_rate": 3.0853542116848987e-06, + "loss": 0.5854, + "step": 2336 + }, + { + "epoch": 2.6140939597315436, + "grad_norm": 0.5463985800743103, + "learning_rate": 3.083903564125135e-06, + "loss": 0.535, + "step": 2337 + }, + { + "epoch": 2.615212527964206, + "grad_norm": 0.5312986969947815, + "learning_rate": 3.082452708591346e-06, + "loss": 0.5692, + "step": 2338 + }, + { + "epoch": 2.616331096196868, + "grad_norm": 0.5492674112319946, + "learning_rate": 3.081001645600295e-06, + "loss": 0.5461, + "step": 2339 + }, + { + "epoch": 2.61744966442953, + "grad_norm": 0.5591171383857727, + "learning_rate": 3.0795503756688212e-06, + "loss": 0.5659, + "step": 2340 + }, + { + "epoch": 2.6185682326621924, + "grad_norm": 0.5744614005088806, + "learning_rate": 3.078098899313835e-06, + "loss": 0.555, + "step": 2341 + }, + { + "epoch": 2.6196868008948546, + "grad_norm": 0.5478807687759399, + "learning_rate": 3.0766472170523208e-06, + "loss": 0.5831, + "step": 2342 + }, + { + "epoch": 2.620805369127517, + "grad_norm": 0.5491352677345276, + "learning_rate": 3.0751953294013387e-06, + "loss": 0.5539, + "step": 2343 + }, + { + "epoch": 2.621923937360179, + "grad_norm": 0.5593333840370178, + "learning_rate": 3.0737432368780196e-06, + "loss": 0.5644, + "step": 2344 + }, + { + "epoch": 2.6230425055928412, + "grad_norm": 0.5525262355804443, + "learning_rate": 3.072290939999567e-06, + "loss": 0.5366, + "step": 2345 + }, + { + "epoch": 2.6241610738255035, + "grad_norm": 0.5336546897888184, + "learning_rate": 3.070838439283259e-06, + "loss": 0.54, + "step": 2346 + }, + { + "epoch": 2.6252796420581657, + "grad_norm": 0.5656988620758057, + "learning_rate": 3.0693857352464445e-06, + "loss": 0.5818, + "step": 2347 + }, + { + "epoch": 2.626398210290828, + "grad_norm": 0.5220616459846497, + "learning_rate": 3.0679328284065474e-06, + "loss": 0.517, + "step": 2348 + }, + { + "epoch": 2.62751677852349, + "grad_norm": 0.554401695728302, + "learning_rate": 3.066479719281061e-06, + "loss": 0.5652, + "step": 2349 + }, + { + "epoch": 2.6286353467561523, + "grad_norm": 0.5401812791824341, + "learning_rate": 3.065026408387552e-06, + "loss": 0.5443, + "step": 2350 + }, + { + "epoch": 2.6297539149888145, + "grad_norm": 0.5526830554008484, + "learning_rate": 3.063572896243659e-06, + "loss": 0.5467, + "step": 2351 + }, + { + "epoch": 2.6308724832214763, + "grad_norm": 0.5644559860229492, + "learning_rate": 3.0621191833670923e-06, + "loss": 0.5764, + "step": 2352 + }, + { + "epoch": 2.631991051454139, + "grad_norm": 0.5451629757881165, + "learning_rate": 3.0606652702756336e-06, + "loss": 0.5558, + "step": 2353 + }, + { + "epoch": 2.6331096196868007, + "grad_norm": 0.5634286999702454, + "learning_rate": 3.0592111574871346e-06, + "loss": 0.5893, + "step": 2354 + }, + { + "epoch": 2.6342281879194633, + "grad_norm": 0.5433185696601868, + "learning_rate": 3.0577568455195207e-06, + "loss": 0.5464, + "step": 2355 + }, + { + "epoch": 2.635346756152125, + "grad_norm": 0.5487169027328491, + "learning_rate": 3.056302334890786e-06, + "loss": 0.5672, + "step": 2356 + }, + { + "epoch": 2.6364653243847873, + "grad_norm": 0.5432003140449524, + "learning_rate": 3.0548476261189973e-06, + "loss": 0.5456, + "step": 2357 + }, + { + "epoch": 2.6375838926174495, + "grad_norm": 0.5585076212882996, + "learning_rate": 3.05339271972229e-06, + "loss": 0.5405, + "step": 2358 + }, + { + "epoch": 2.6387024608501117, + "grad_norm": 0.5451131463050842, + "learning_rate": 3.051937616218871e-06, + "loss": 0.554, + "step": 2359 + }, + { + "epoch": 2.639821029082774, + "grad_norm": 0.549545407295227, + "learning_rate": 3.0504823161270165e-06, + "loss": 0.5492, + "step": 2360 + }, + { + "epoch": 2.640939597315436, + "grad_norm": 0.5548585653305054, + "learning_rate": 3.0490268199650754e-06, + "loss": 0.5787, + "step": 2361 + }, + { + "epoch": 2.6420581655480984, + "grad_norm": 0.5595759749412537, + "learning_rate": 3.0475711282514626e-06, + "loss": 0.5754, + "step": 2362 + }, + { + "epoch": 2.6431767337807606, + "grad_norm": 0.5477004647254944, + "learning_rate": 3.046115241504665e-06, + "loss": 0.542, + "step": 2363 + }, + { + "epoch": 2.6442953020134228, + "grad_norm": 0.5440866351127625, + "learning_rate": 3.0446591602432386e-06, + "loss": 0.5566, + "step": 2364 + }, + { + "epoch": 2.645413870246085, + "grad_norm": 0.5314123630523682, + "learning_rate": 3.0432028849858085e-06, + "loss": 0.5621, + "step": 2365 + }, + { + "epoch": 2.646532438478747, + "grad_norm": 0.5632365942001343, + "learning_rate": 3.0417464162510697e-06, + "loss": 0.5724, + "step": 2366 + }, + { + "epoch": 2.6476510067114094, + "grad_norm": 0.5360177159309387, + "learning_rate": 3.040289754557784e-06, + "loss": 0.536, + "step": 2367 + }, + { + "epoch": 2.6487695749440716, + "grad_norm": 0.5535303950309753, + "learning_rate": 3.038832900424784e-06, + "loss": 0.5692, + "step": 2368 + }, + { + "epoch": 2.649888143176734, + "grad_norm": 0.5483287572860718, + "learning_rate": 3.0373758543709714e-06, + "loss": 0.5241, + "step": 2369 + }, + { + "epoch": 2.651006711409396, + "grad_norm": 0.5349264144897461, + "learning_rate": 3.035918616915312e-06, + "loss": 0.5581, + "step": 2370 + }, + { + "epoch": 2.6521252796420582, + "grad_norm": 0.53511643409729, + "learning_rate": 3.0344611885768453e-06, + "loss": 0.5494, + "step": 2371 + }, + { + "epoch": 2.6532438478747205, + "grad_norm": 0.6043458580970764, + "learning_rate": 3.0330035698746756e-06, + "loss": 0.556, + "step": 2372 + }, + { + "epoch": 2.6543624161073827, + "grad_norm": 0.5656911730766296, + "learning_rate": 3.031545761327976e-06, + "loss": 0.5654, + "step": 2373 + }, + { + "epoch": 2.655480984340045, + "grad_norm": 0.5389603972434998, + "learning_rate": 3.0300877634559856e-06, + "loss": 0.5523, + "step": 2374 + }, + { + "epoch": 2.656599552572707, + "grad_norm": 0.5444291830062866, + "learning_rate": 3.028629576778013e-06, + "loss": 0.541, + "step": 2375 + }, + { + "epoch": 2.6577181208053693, + "grad_norm": 0.5597144961357117, + "learning_rate": 3.0271712018134337e-06, + "loss": 0.547, + "step": 2376 + }, + { + "epoch": 2.658836689038031, + "grad_norm": 0.535246729850769, + "learning_rate": 3.0257126390816903e-06, + "loss": 0.5567, + "step": 2377 + }, + { + "epoch": 2.6599552572706937, + "grad_norm": 0.5444280505180359, + "learning_rate": 3.0242538891022906e-06, + "loss": 0.5637, + "step": 2378 + }, + { + "epoch": 2.6610738255033555, + "grad_norm": 0.5488506555557251, + "learning_rate": 3.0227949523948097e-06, + "loss": 0.5501, + "step": 2379 + }, + { + "epoch": 2.662192393736018, + "grad_norm": 0.5609076023101807, + "learning_rate": 3.0213358294788913e-06, + "loss": 0.5637, + "step": 2380 + }, + { + "epoch": 2.66331096196868, + "grad_norm": 0.5687225461006165, + "learning_rate": 3.0198765208742435e-06, + "loss": 0.5575, + "step": 2381 + }, + { + "epoch": 2.6644295302013425, + "grad_norm": 0.5333688855171204, + "learning_rate": 3.0184170271006413e-06, + "loss": 0.538, + "step": 2382 + }, + { + "epoch": 2.6655480984340043, + "grad_norm": 0.5355653762817383, + "learning_rate": 3.016957348677924e-06, + "loss": 0.5212, + "step": 2383 + }, + { + "epoch": 2.6666666666666665, + "grad_norm": 0.5570645332336426, + "learning_rate": 3.0154974861259993e-06, + "loss": 0.5676, + "step": 2384 + }, + { + "epoch": 2.6677852348993287, + "grad_norm": 0.5462499856948853, + "learning_rate": 3.0140374399648394e-06, + "loss": 0.5606, + "step": 2385 + }, + { + "epoch": 2.668903803131991, + "grad_norm": 0.5412359833717346, + "learning_rate": 3.0125772107144802e-06, + "loss": 0.5125, + "step": 2386 + }, + { + "epoch": 2.670022371364653, + "grad_norm": 0.5487285256385803, + "learning_rate": 3.011116798895026e-06, + "loss": 0.5504, + "step": 2387 + }, + { + "epoch": 2.6711409395973154, + "grad_norm": 0.5610405206680298, + "learning_rate": 3.009656205026643e-06, + "loss": 0.5472, + "step": 2388 + }, + { + "epoch": 2.6722595078299776, + "grad_norm": 0.5420982241630554, + "learning_rate": 3.0081954296295645e-06, + "loss": 0.5403, + "step": 2389 + }, + { + "epoch": 2.6733780760626398, + "grad_norm": 0.5456695556640625, + "learning_rate": 3.0067344732240876e-06, + "loss": 0.5443, + "step": 2390 + }, + { + "epoch": 2.674496644295302, + "grad_norm": 0.5425966382026672, + "learning_rate": 3.005273336330573e-06, + "loss": 0.5558, + "step": 2391 + }, + { + "epoch": 2.675615212527964, + "grad_norm": 0.5529112219810486, + "learning_rate": 3.0038120194694477e-06, + "loss": 0.5713, + "step": 2392 + }, + { + "epoch": 2.6767337807606264, + "grad_norm": 0.5473445653915405, + "learning_rate": 3.002350523161201e-06, + "loss": 0.5622, + "step": 2393 + }, + { + "epoch": 2.6778523489932886, + "grad_norm": 0.5647788643836975, + "learning_rate": 3.0008888479263866e-06, + "loss": 0.5802, + "step": 2394 + }, + { + "epoch": 2.678970917225951, + "grad_norm": 0.5544139742851257, + "learning_rate": 2.999426994285623e-06, + "loss": 0.5626, + "step": 2395 + }, + { + "epoch": 2.680089485458613, + "grad_norm": 0.5572628974914551, + "learning_rate": 2.9979649627595904e-06, + "loss": 0.5744, + "step": 2396 + }, + { + "epoch": 2.6812080536912752, + "grad_norm": 0.5385755896568298, + "learning_rate": 2.9965027538690344e-06, + "loss": 0.5494, + "step": 2397 + }, + { + "epoch": 2.6823266219239374, + "grad_norm": 0.5390859246253967, + "learning_rate": 2.9950403681347617e-06, + "loss": 0.5614, + "step": 2398 + }, + { + "epoch": 2.6834451901565997, + "grad_norm": 0.5573400855064392, + "learning_rate": 2.9935778060776435e-06, + "loss": 0.5572, + "step": 2399 + }, + { + "epoch": 2.684563758389262, + "grad_norm": 0.5425155162811279, + "learning_rate": 2.992115068218613e-06, + "loss": 0.5629, + "step": 2400 + }, + { + "epoch": 2.685682326621924, + "grad_norm": 0.5649998188018799, + "learning_rate": 2.990652155078666e-06, + "loss": 0.5432, + "step": 2401 + }, + { + "epoch": 2.6868008948545863, + "grad_norm": 0.5427099466323853, + "learning_rate": 2.989189067178862e-06, + "loss": 0.5433, + "step": 2402 + }, + { + "epoch": 2.6879194630872485, + "grad_norm": 0.5437093377113342, + "learning_rate": 2.9877258050403214e-06, + "loss": 0.5323, + "step": 2403 + }, + { + "epoch": 2.6890380313199103, + "grad_norm": 0.5371407866477966, + "learning_rate": 2.986262369184226e-06, + "loss": 0.5416, + "step": 2404 + }, + { + "epoch": 2.690156599552573, + "grad_norm": 0.5429154634475708, + "learning_rate": 2.9847987601318224e-06, + "loss": 0.5329, + "step": 2405 + }, + { + "epoch": 2.6912751677852347, + "grad_norm": 0.5526633262634277, + "learning_rate": 2.9833349784044154e-06, + "loss": 0.5573, + "step": 2406 + }, + { + "epoch": 2.6923937360178973, + "grad_norm": 0.5496582388877869, + "learning_rate": 2.9818710245233733e-06, + "loss": 0.5516, + "step": 2407 + }, + { + "epoch": 2.693512304250559, + "grad_norm": 0.5592177510261536, + "learning_rate": 2.980406899010126e-06, + "loss": 0.5606, + "step": 2408 + }, + { + "epoch": 2.6946308724832218, + "grad_norm": 0.5417295694351196, + "learning_rate": 2.9789426023861633e-06, + "loss": 0.5395, + "step": 2409 + }, + { + "epoch": 2.6957494407158835, + "grad_norm": 0.5291399359703064, + "learning_rate": 2.977478135173037e-06, + "loss": 0.5312, + "step": 2410 + }, + { + "epoch": 2.6968680089485457, + "grad_norm": 0.5451543927192688, + "learning_rate": 2.9760134978923582e-06, + "loss": 0.5631, + "step": 2411 + }, + { + "epoch": 2.697986577181208, + "grad_norm": 0.5505276322364807, + "learning_rate": 2.9745486910657994e-06, + "loss": 0.5663, + "step": 2412 + }, + { + "epoch": 2.69910514541387, + "grad_norm": 0.5310605764389038, + "learning_rate": 2.9730837152150954e-06, + "loss": 0.5515, + "step": 2413 + }, + { + "epoch": 2.7002237136465324, + "grad_norm": 0.5517241358757019, + "learning_rate": 2.971618570862038e-06, + "loss": 0.5751, + "step": 2414 + }, + { + "epoch": 2.7013422818791946, + "grad_norm": 0.5553033947944641, + "learning_rate": 2.9701532585284804e-06, + "loss": 0.5917, + "step": 2415 + }, + { + "epoch": 2.7024608501118568, + "grad_norm": 0.5519202947616577, + "learning_rate": 2.9686877787363367e-06, + "loss": 0.5438, + "step": 2416 + }, + { + "epoch": 2.703579418344519, + "grad_norm": 0.575251042842865, + "learning_rate": 2.9672221320075784e-06, + "loss": 0.5639, + "step": 2417 + }, + { + "epoch": 2.704697986577181, + "grad_norm": 0.5733804702758789, + "learning_rate": 2.9657563188642385e-06, + "loss": 0.5656, + "step": 2418 + }, + { + "epoch": 2.7058165548098434, + "grad_norm": 0.5562912225723267, + "learning_rate": 2.9642903398284083e-06, + "loss": 0.5641, + "step": 2419 + }, + { + "epoch": 2.7069351230425056, + "grad_norm": 0.5579444766044617, + "learning_rate": 2.962824195422238e-06, + "loss": 0.5426, + "step": 2420 + }, + { + "epoch": 2.708053691275168, + "grad_norm": 0.5527287721633911, + "learning_rate": 2.961357886167937e-06, + "loss": 0.5609, + "step": 2421 + }, + { + "epoch": 2.70917225950783, + "grad_norm": 0.5619087815284729, + "learning_rate": 2.9598914125877744e-06, + "loss": 0.5459, + "step": 2422 + }, + { + "epoch": 2.7102908277404922, + "grad_norm": 0.536557674407959, + "learning_rate": 2.9584247752040758e-06, + "loss": 0.5464, + "step": 2423 + }, + { + "epoch": 2.7114093959731544, + "grad_norm": 0.5522740483283997, + "learning_rate": 2.9569579745392263e-06, + "loss": 0.573, + "step": 2424 + }, + { + "epoch": 2.7125279642058167, + "grad_norm": 0.5497687458992004, + "learning_rate": 2.95549101111567e-06, + "loss": 0.5515, + "step": 2425 + }, + { + "epoch": 2.713646532438479, + "grad_norm": 0.5363222360610962, + "learning_rate": 2.954023885455907e-06, + "loss": 0.5568, + "step": 2426 + }, + { + "epoch": 2.714765100671141, + "grad_norm": 0.5515537858009338, + "learning_rate": 2.952556598082497e-06, + "loss": 0.5989, + "step": 2427 + }, + { + "epoch": 2.7158836689038033, + "grad_norm": 0.5664210319519043, + "learning_rate": 2.951089149518056e-06, + "loss": 0.5556, + "step": 2428 + }, + { + "epoch": 2.717002237136465, + "grad_norm": 0.557420015335083, + "learning_rate": 2.9496215402852586e-06, + "loss": 0.5576, + "step": 2429 + }, + { + "epoch": 2.7181208053691277, + "grad_norm": 0.5330470204353333, + "learning_rate": 2.9481537709068362e-06, + "loss": 0.5222, + "step": 2430 + }, + { + "epoch": 2.7192393736017895, + "grad_norm": 0.5420283675193787, + "learning_rate": 2.9466858419055755e-06, + "loss": 0.5609, + "step": 2431 + }, + { + "epoch": 2.720357941834452, + "grad_norm": 0.5385798215866089, + "learning_rate": 2.9452177538043227e-06, + "loss": 0.5411, + "step": 2432 + }, + { + "epoch": 2.721476510067114, + "grad_norm": 0.5468939542770386, + "learning_rate": 2.943749507125979e-06, + "loss": 0.5641, + "step": 2433 + }, + { + "epoch": 2.7225950782997765, + "grad_norm": 0.5648471117019653, + "learning_rate": 2.942281102393504e-06, + "loss": 0.5266, + "step": 2434 + }, + { + "epoch": 2.7237136465324383, + "grad_norm": 0.5338712930679321, + "learning_rate": 2.94081254012991e-06, + "loss": 0.5708, + "step": 2435 + }, + { + "epoch": 2.7248322147651005, + "grad_norm": 0.5404524803161621, + "learning_rate": 2.9393438208582694e-06, + "loss": 0.5622, + "step": 2436 + }, + { + "epoch": 2.7259507829977627, + "grad_norm": 0.5760878324508667, + "learning_rate": 2.9378749451017074e-06, + "loss": 0.5668, + "step": 2437 + }, + { + "epoch": 2.727069351230425, + "grad_norm": 0.5336945056915283, + "learning_rate": 2.9364059133834083e-06, + "loss": 0.5289, + "step": 2438 + }, + { + "epoch": 2.728187919463087, + "grad_norm": 0.5537322759628296, + "learning_rate": 2.9349367262266077e-06, + "loss": 0.5491, + "step": 2439 + }, + { + "epoch": 2.7293064876957494, + "grad_norm": 0.5614214539527893, + "learning_rate": 2.9334673841546003e-06, + "loss": 0.5575, + "step": 2440 + }, + { + "epoch": 2.7304250559284116, + "grad_norm": 0.5491678714752197, + "learning_rate": 2.931997887690734e-06, + "loss": 0.5534, + "step": 2441 + }, + { + "epoch": 2.7315436241610738, + "grad_norm": 0.5477733612060547, + "learning_rate": 2.9305282373584127e-06, + "loss": 0.5508, + "step": 2442 + }, + { + "epoch": 2.732662192393736, + "grad_norm": 0.549692690372467, + "learning_rate": 2.9290584336810928e-06, + "loss": 0.5859, + "step": 2443 + }, + { + "epoch": 2.733780760626398, + "grad_norm": 0.5312687158584595, + "learning_rate": 2.9275884771822893e-06, + "loss": 0.5319, + "step": 2444 + }, + { + "epoch": 2.7348993288590604, + "grad_norm": 0.5523775219917297, + "learning_rate": 2.9261183683855677e-06, + "loss": 0.5392, + "step": 2445 + }, + { + "epoch": 2.7360178970917226, + "grad_norm": 0.567766010761261, + "learning_rate": 2.924648107814551e-06, + "loss": 0.5875, + "step": 2446 + }, + { + "epoch": 2.737136465324385, + "grad_norm": 0.5400151014328003, + "learning_rate": 2.9231776959929135e-06, + "loss": 0.5382, + "step": 2447 + }, + { + "epoch": 2.738255033557047, + "grad_norm": 0.5191541910171509, + "learning_rate": 2.9217071334443847e-06, + "loss": 0.5066, + "step": 2448 + }, + { + "epoch": 2.7393736017897092, + "grad_norm": 0.5546445846557617, + "learning_rate": 2.9202364206927484e-06, + "loss": 0.5388, + "step": 2449 + }, + { + "epoch": 2.7404921700223714, + "grad_norm": 0.5399242639541626, + "learning_rate": 2.9187655582618413e-06, + "loss": 0.5527, + "step": 2450 + }, + { + "epoch": 2.7416107382550337, + "grad_norm": 0.5366097092628479, + "learning_rate": 2.917294546675553e-06, + "loss": 0.5435, + "step": 2451 + }, + { + "epoch": 2.742729306487696, + "grad_norm": 0.5393624901771545, + "learning_rate": 2.9158233864578256e-06, + "loss": 0.5533, + "step": 2452 + }, + { + "epoch": 2.743847874720358, + "grad_norm": 0.5356276631355286, + "learning_rate": 2.9143520781326564e-06, + "loss": 0.5573, + "step": 2453 + }, + { + "epoch": 2.7449664429530203, + "grad_norm": 0.5415606498718262, + "learning_rate": 2.912880622224093e-06, + "loss": 0.528, + "step": 2454 + }, + { + "epoch": 2.7460850111856825, + "grad_norm": 0.5313133597373962, + "learning_rate": 2.911409019256237e-06, + "loss": 0.5427, + "step": 2455 + }, + { + "epoch": 2.7472035794183443, + "grad_norm": 0.5546746850013733, + "learning_rate": 2.909937269753243e-06, + "loss": 0.5809, + "step": 2456 + }, + { + "epoch": 2.748322147651007, + "grad_norm": 0.5541934370994568, + "learning_rate": 2.9084653742393158e-06, + "loss": 0.5762, + "step": 2457 + }, + { + "epoch": 2.7494407158836687, + "grad_norm": 0.5450883507728577, + "learning_rate": 2.906993333238714e-06, + "loss": 0.5231, + "step": 2458 + }, + { + "epoch": 2.7505592841163313, + "grad_norm": 0.5383808016777039, + "learning_rate": 2.9055211472757466e-06, + "loss": 0.5609, + "step": 2459 + }, + { + "epoch": 2.751677852348993, + "grad_norm": 0.5436487197875977, + "learning_rate": 2.9040488168747755e-06, + "loss": 0.5545, + "step": 2460 + }, + { + "epoch": 2.7527964205816557, + "grad_norm": 0.5587117671966553, + "learning_rate": 2.9025763425602127e-06, + "loss": 0.5589, + "step": 2461 + }, + { + "epoch": 2.7539149888143175, + "grad_norm": 0.558638870716095, + "learning_rate": 2.901103724856523e-06, + "loss": 0.5748, + "step": 2462 + }, + { + "epoch": 2.7550335570469797, + "grad_norm": 0.5497444868087769, + "learning_rate": 2.899630964288222e-06, + "loss": 0.5665, + "step": 2463 + }, + { + "epoch": 2.756152125279642, + "grad_norm": 0.5379343032836914, + "learning_rate": 2.898158061379874e-06, + "loss": 0.5463, + "step": 2464 + }, + { + "epoch": 2.757270693512304, + "grad_norm": 0.5531080961227417, + "learning_rate": 2.896685016656096e-06, + "loss": 0.5673, + "step": 2465 + }, + { + "epoch": 2.7583892617449663, + "grad_norm": 0.5656123161315918, + "learning_rate": 2.895211830641556e-06, + "loss": 0.5417, + "step": 2466 + }, + { + "epoch": 2.7595078299776286, + "grad_norm": 0.5386385321617126, + "learning_rate": 2.893738503860972e-06, + "loss": 0.5508, + "step": 2467 + }, + { + "epoch": 2.7606263982102908, + "grad_norm": 0.5320050120353699, + "learning_rate": 2.8922650368391103e-06, + "loss": 0.5371, + "step": 2468 + }, + { + "epoch": 2.761744966442953, + "grad_norm": 0.5440961122512817, + "learning_rate": 2.8907914301007893e-06, + "loss": 0.5543, + "step": 2469 + }, + { + "epoch": 2.762863534675615, + "grad_norm": 0.5435571074485779, + "learning_rate": 2.8893176841708764e-06, + "loss": 0.5483, + "step": 2470 + }, + { + "epoch": 2.7639821029082774, + "grad_norm": 0.5409116744995117, + "learning_rate": 2.887843799574288e-06, + "loss": 0.5721, + "step": 2471 + }, + { + "epoch": 2.7651006711409396, + "grad_norm": 0.5535840392112732, + "learning_rate": 2.886369776835991e-06, + "loss": 0.5884, + "step": 2472 + }, + { + "epoch": 2.766219239373602, + "grad_norm": 0.5394747853279114, + "learning_rate": 2.8848956164810007e-06, + "loss": 0.536, + "step": 2473 + }, + { + "epoch": 2.767337807606264, + "grad_norm": 0.5277183055877686, + "learning_rate": 2.883421319034381e-06, + "loss": 0.5351, + "step": 2474 + }, + { + "epoch": 2.7684563758389262, + "grad_norm": 0.5350673198699951, + "learning_rate": 2.881946885021246e-06, + "loss": 0.5059, + "step": 2475 + }, + { + "epoch": 2.7695749440715884, + "grad_norm": 0.5454860329627991, + "learning_rate": 2.880472314966758e-06, + "loss": 0.5526, + "step": 2476 + }, + { + "epoch": 2.7706935123042506, + "grad_norm": 0.5411888957023621, + "learning_rate": 2.878997609396127e-06, + "loss": 0.5468, + "step": 2477 + }, + { + "epoch": 2.771812080536913, + "grad_norm": 0.5470668077468872, + "learning_rate": 2.877522768834611e-06, + "loss": 0.5556, + "step": 2478 + }, + { + "epoch": 2.772930648769575, + "grad_norm": 0.5573850274085999, + "learning_rate": 2.8760477938075194e-06, + "loss": 0.5565, + "step": 2479 + }, + { + "epoch": 2.7740492170022373, + "grad_norm": 0.5545740127563477, + "learning_rate": 2.8745726848402037e-06, + "loss": 0.5446, + "step": 2480 + }, + { + "epoch": 2.7751677852348995, + "grad_norm": 0.5616520643234253, + "learning_rate": 2.873097442458069e-06, + "loss": 0.569, + "step": 2481 + }, + { + "epoch": 2.7762863534675617, + "grad_norm": 0.5486459136009216, + "learning_rate": 2.8716220671865635e-06, + "loss": 0.5499, + "step": 2482 + }, + { + "epoch": 2.7774049217002235, + "grad_norm": 0.5648736953735352, + "learning_rate": 2.870146559551185e-06, + "loss": 0.5705, + "step": 2483 + }, + { + "epoch": 2.778523489932886, + "grad_norm": 0.5584491491317749, + "learning_rate": 2.8686709200774782e-06, + "loss": 0.5624, + "step": 2484 + }, + { + "epoch": 2.779642058165548, + "grad_norm": 0.5620800256729126, + "learning_rate": 2.8671951492910337e-06, + "loss": 0.5592, + "step": 2485 + }, + { + "epoch": 2.7807606263982105, + "grad_norm": 0.5411422848701477, + "learning_rate": 2.8657192477174907e-06, + "loss": 0.5595, + "step": 2486 + }, + { + "epoch": 2.7818791946308723, + "grad_norm": 0.540215015411377, + "learning_rate": 2.8642432158825336e-06, + "loss": 0.5726, + "step": 2487 + }, + { + "epoch": 2.782997762863535, + "grad_norm": 0.548093855381012, + "learning_rate": 2.8627670543118924e-06, + "loss": 0.57, + "step": 2488 + }, + { + "epoch": 2.7841163310961967, + "grad_norm": 0.5470214486122131, + "learning_rate": 2.8612907635313464e-06, + "loss": 0.541, + "step": 2489 + }, + { + "epoch": 2.785234899328859, + "grad_norm": 0.5352866053581238, + "learning_rate": 2.859814344066717e-06, + "loss": 0.5392, + "step": 2490 + }, + { + "epoch": 2.786353467561521, + "grad_norm": 0.5467997193336487, + "learning_rate": 2.858337796443876e-06, + "loss": 0.549, + "step": 2491 + }, + { + "epoch": 2.7874720357941833, + "grad_norm": 0.5317350625991821, + "learning_rate": 2.8568611211887355e-06, + "loss": 0.5367, + "step": 2492 + }, + { + "epoch": 2.7885906040268456, + "grad_norm": 0.5586799383163452, + "learning_rate": 2.8553843188272567e-06, + "loss": 0.5702, + "step": 2493 + }, + { + "epoch": 2.7897091722595078, + "grad_norm": 0.5454741716384888, + "learning_rate": 2.853907389885446e-06, + "loss": 0.5338, + "step": 2494 + }, + { + "epoch": 2.79082774049217, + "grad_norm": 0.5470857620239258, + "learning_rate": 2.8524303348893534e-06, + "loss": 0.5624, + "step": 2495 + }, + { + "epoch": 2.791946308724832, + "grad_norm": 0.5443030595779419, + "learning_rate": 2.850953154365074e-06, + "loss": 0.5547, + "step": 2496 + }, + { + "epoch": 2.7930648769574944, + "grad_norm": 0.5613713264465332, + "learning_rate": 2.849475848838749e-06, + "loss": 0.5769, + "step": 2497 + }, + { + "epoch": 2.7941834451901566, + "grad_norm": 0.5577437281608582, + "learning_rate": 2.847998418836563e-06, + "loss": 0.554, + "step": 2498 + }, + { + "epoch": 2.795302013422819, + "grad_norm": 0.547347366809845, + "learning_rate": 2.8465208648847446e-06, + "loss": 0.5581, + "step": 2499 + }, + { + "epoch": 2.796420581655481, + "grad_norm": 0.568202555179596, + "learning_rate": 2.8450431875095673e-06, + "loss": 0.5551, + "step": 2500 + }, + { + "epoch": 2.7975391498881432, + "grad_norm": 0.5745724439620972, + "learning_rate": 2.8435653872373485e-06, + "loss": 0.5687, + "step": 2501 + }, + { + "epoch": 2.7986577181208054, + "grad_norm": 0.5461832880973816, + "learning_rate": 2.842087464594449e-06, + "loss": 0.5609, + "step": 2502 + }, + { + "epoch": 2.7997762863534676, + "grad_norm": 0.5509635210037231, + "learning_rate": 2.8406094201072744e-06, + "loss": 0.5721, + "step": 2503 + }, + { + "epoch": 2.80089485458613, + "grad_norm": 0.5565740466117859, + "learning_rate": 2.8391312543022715e-06, + "loss": 0.5456, + "step": 2504 + }, + { + "epoch": 2.802013422818792, + "grad_norm": 0.5471952557563782, + "learning_rate": 2.8376529677059318e-06, + "loss": 0.5496, + "step": 2505 + }, + { + "epoch": 2.8031319910514543, + "grad_norm": 0.5586651563644409, + "learning_rate": 2.836174560844789e-06, + "loss": 0.5636, + "step": 2506 + }, + { + "epoch": 2.8042505592841165, + "grad_norm": 0.5542725920677185, + "learning_rate": 2.834696034245422e-06, + "loss": 0.5407, + "step": 2507 + }, + { + "epoch": 2.8053691275167782, + "grad_norm": 0.5610932111740112, + "learning_rate": 2.8332173884344477e-06, + "loss": 0.5668, + "step": 2508 + }, + { + "epoch": 2.806487695749441, + "grad_norm": 0.5772159099578857, + "learning_rate": 2.8317386239385302e-06, + "loss": 0.5731, + "step": 2509 + }, + { + "epoch": 2.8076062639821027, + "grad_norm": 0.5300142168998718, + "learning_rate": 2.830259741284374e-06, + "loss": 0.5445, + "step": 2510 + }, + { + "epoch": 2.8087248322147653, + "grad_norm": 0.5480130910873413, + "learning_rate": 2.8287807409987255e-06, + "loss": 0.5538, + "step": 2511 + }, + { + "epoch": 2.809843400447427, + "grad_norm": 0.5556392669677734, + "learning_rate": 2.827301623608372e-06, + "loss": 0.5844, + "step": 2512 + }, + { + "epoch": 2.8109619686800897, + "grad_norm": 0.5518432855606079, + "learning_rate": 2.825822389640145e-06, + "loss": 0.5149, + "step": 2513 + }, + { + "epoch": 2.8120805369127515, + "grad_norm": 0.5658623576164246, + "learning_rate": 2.8243430396209156e-06, + "loss": 0.5686, + "step": 2514 + }, + { + "epoch": 2.8131991051454137, + "grad_norm": 0.5638464689254761, + "learning_rate": 2.8228635740775974e-06, + "loss": 0.5658, + "step": 2515 + }, + { + "epoch": 2.814317673378076, + "grad_norm": 0.5382245182991028, + "learning_rate": 2.821383993537144e-06, + "loss": 0.5447, + "step": 2516 + }, + { + "epoch": 2.815436241610738, + "grad_norm": 0.5304442048072815, + "learning_rate": 2.8199042985265496e-06, + "loss": 0.5498, + "step": 2517 + }, + { + "epoch": 2.8165548098434003, + "grad_norm": 0.5279328227043152, + "learning_rate": 2.818424489572851e-06, + "loss": 0.5279, + "step": 2518 + }, + { + "epoch": 2.8176733780760626, + "grad_norm": 0.5524958372116089, + "learning_rate": 2.8169445672031258e-06, + "loss": 0.5404, + "step": 2519 + }, + { + "epoch": 2.8187919463087248, + "grad_norm": 0.538012683391571, + "learning_rate": 2.815464531944489e-06, + "loss": 0.5333, + "step": 2520 + }, + { + "epoch": 2.819910514541387, + "grad_norm": 0.557972252368927, + "learning_rate": 2.8139843843240983e-06, + "loss": 0.5709, + "step": 2521 + }, + { + "epoch": 2.821029082774049, + "grad_norm": 0.5606746077537537, + "learning_rate": 2.8125041248691506e-06, + "loss": 0.5709, + "step": 2522 + }, + { + "epoch": 2.8221476510067114, + "grad_norm": 0.5624932050704956, + "learning_rate": 2.8110237541068835e-06, + "loss": 0.5678, + "step": 2523 + }, + { + "epoch": 2.8232662192393736, + "grad_norm": 0.5436639189720154, + "learning_rate": 2.809543272564573e-06, + "loss": 0.5327, + "step": 2524 + }, + { + "epoch": 2.824384787472036, + "grad_norm": 0.5370485186576843, + "learning_rate": 2.8080626807695354e-06, + "loss": 0.5356, + "step": 2525 + }, + { + "epoch": 2.825503355704698, + "grad_norm": 0.5638765692710876, + "learning_rate": 2.8065819792491263e-06, + "loss": 0.5873, + "step": 2526 + }, + { + "epoch": 2.8266219239373602, + "grad_norm": 0.5592923760414124, + "learning_rate": 2.805101168530739e-06, + "loss": 0.542, + "step": 2527 + }, + { + "epoch": 2.8277404921700224, + "grad_norm": 0.5437577366828918, + "learning_rate": 2.803620249141808e-06, + "loss": 0.5637, + "step": 2528 + }, + { + "epoch": 2.8288590604026846, + "grad_norm": 0.5569809079170227, + "learning_rate": 2.802139221609804e-06, + "loss": 0.5367, + "step": 2529 + }, + { + "epoch": 2.829977628635347, + "grad_norm": 0.5529781579971313, + "learning_rate": 2.8006580864622385e-06, + "loss": 0.5595, + "step": 2530 + }, + { + "epoch": 2.831096196868009, + "grad_norm": 0.5500108003616333, + "learning_rate": 2.79917684422666e-06, + "loss": 0.5459, + "step": 2531 + }, + { + "epoch": 2.8322147651006713, + "grad_norm": 0.5583110451698303, + "learning_rate": 2.7976954954306555e-06, + "loss": 0.5433, + "step": 2532 + }, + { + "epoch": 2.8333333333333335, + "grad_norm": 0.5412235856056213, + "learning_rate": 2.7962140406018496e-06, + "loss": 0.5543, + "step": 2533 + }, + { + "epoch": 2.8344519015659957, + "grad_norm": 0.5551288723945618, + "learning_rate": 2.7947324802679053e-06, + "loss": 0.5445, + "step": 2534 + }, + { + "epoch": 2.8355704697986575, + "grad_norm": 0.5394756197929382, + "learning_rate": 2.7932508149565235e-06, + "loss": 0.5299, + "step": 2535 + }, + { + "epoch": 2.83668903803132, + "grad_norm": 0.552911102771759, + "learning_rate": 2.791769045195441e-06, + "loss": 0.5792, + "step": 2536 + }, + { + "epoch": 2.837807606263982, + "grad_norm": 0.5488527417182922, + "learning_rate": 2.7902871715124324e-06, + "loss": 0.5714, + "step": 2537 + }, + { + "epoch": 2.8389261744966445, + "grad_norm": 0.542826771736145, + "learning_rate": 2.78880519443531e-06, + "loss": 0.528, + "step": 2538 + }, + { + "epoch": 2.8400447427293063, + "grad_norm": 0.5490564107894897, + "learning_rate": 2.7873231144919226e-06, + "loss": 0.5413, + "step": 2539 + }, + { + "epoch": 2.841163310961969, + "grad_norm": 0.5559353232383728, + "learning_rate": 2.7858409322101564e-06, + "loss": 0.5898, + "step": 2540 + }, + { + "epoch": 2.8422818791946307, + "grad_norm": 0.5642809271812439, + "learning_rate": 2.7843586481179314e-06, + "loss": 0.5643, + "step": 2541 + }, + { + "epoch": 2.843400447427293, + "grad_norm": 0.5449807047843933, + "learning_rate": 2.7828762627432063e-06, + "loss": 0.5384, + "step": 2542 + }, + { + "epoch": 2.844519015659955, + "grad_norm": 0.5522620677947998, + "learning_rate": 2.7813937766139765e-06, + "loss": 0.5516, + "step": 2543 + }, + { + "epoch": 2.8456375838926173, + "grad_norm": 0.5229454636573792, + "learning_rate": 2.7799111902582697e-06, + "loss": 0.5311, + "step": 2544 + }, + { + "epoch": 2.8467561521252795, + "grad_norm": 0.5598391890525818, + "learning_rate": 2.7784285042041536e-06, + "loss": 0.5594, + "step": 2545 + }, + { + "epoch": 2.8478747203579418, + "grad_norm": 0.5504311919212341, + "learning_rate": 2.7769457189797284e-06, + "loss": 0.5565, + "step": 2546 + }, + { + "epoch": 2.848993288590604, + "grad_norm": 0.5354952812194824, + "learning_rate": 2.7754628351131317e-06, + "loss": 0.5522, + "step": 2547 + }, + { + "epoch": 2.850111856823266, + "grad_norm": 0.5451246500015259, + "learning_rate": 2.773979853132534e-06, + "loss": 0.5545, + "step": 2548 + }, + { + "epoch": 2.8512304250559284, + "grad_norm": 0.5536582469940186, + "learning_rate": 2.772496773566142e-06, + "loss": 0.5513, + "step": 2549 + }, + { + "epoch": 2.8523489932885906, + "grad_norm": 0.5547369718551636, + "learning_rate": 2.7710135969421975e-06, + "loss": 0.5645, + "step": 2550 + }, + { + "epoch": 2.853467561521253, + "grad_norm": 0.538840651512146, + "learning_rate": 2.7695303237889764e-06, + "loss": 0.5377, + "step": 2551 + }, + { + "epoch": 2.854586129753915, + "grad_norm": 0.5319745540618896, + "learning_rate": 2.76804695463479e-06, + "loss": 0.5439, + "step": 2552 + }, + { + "epoch": 2.8557046979865772, + "grad_norm": 0.5550813674926758, + "learning_rate": 2.7665634900079806e-06, + "loss": 0.5344, + "step": 2553 + }, + { + "epoch": 2.8568232662192394, + "grad_norm": 0.5305697917938232, + "learning_rate": 2.7650799304369287e-06, + "loss": 0.5335, + "step": 2554 + }, + { + "epoch": 2.8579418344519016, + "grad_norm": 0.5529859066009521, + "learning_rate": 2.7635962764500453e-06, + "loss": 0.5592, + "step": 2555 + }, + { + "epoch": 2.859060402684564, + "grad_norm": 0.5587506294250488, + "learning_rate": 2.7621125285757778e-06, + "loss": 0.5633, + "step": 2556 + }, + { + "epoch": 2.860178970917226, + "grad_norm": 0.5648806095123291, + "learning_rate": 2.7606286873426048e-06, + "loss": 0.5352, + "step": 2557 + }, + { + "epoch": 2.8612975391498883, + "grad_norm": 0.5574096441268921, + "learning_rate": 2.759144753279039e-06, + "loss": 0.5544, + "step": 2558 + }, + { + "epoch": 2.8624161073825505, + "grad_norm": 0.564165472984314, + "learning_rate": 2.7576607269136257e-06, + "loss": 0.5887, + "step": 2559 + }, + { + "epoch": 2.8635346756152127, + "grad_norm": 0.5468332171440125, + "learning_rate": 2.7561766087749443e-06, + "loss": 0.5333, + "step": 2560 + }, + { + "epoch": 2.864653243847875, + "grad_norm": 0.5513910055160522, + "learning_rate": 2.754692399391605e-06, + "loss": 0.5519, + "step": 2561 + }, + { + "epoch": 2.8657718120805367, + "grad_norm": 0.5543265342712402, + "learning_rate": 2.7532080992922535e-06, + "loss": 0.5532, + "step": 2562 + }, + { + "epoch": 2.8668903803131993, + "grad_norm": 0.5612424612045288, + "learning_rate": 2.7517237090055644e-06, + "loss": 0.557, + "step": 2563 + }, + { + "epoch": 2.868008948545861, + "grad_norm": 0.537941575050354, + "learning_rate": 2.7502392290602463e-06, + "loss": 0.521, + "step": 2564 + }, + { + "epoch": 2.8691275167785237, + "grad_norm": 0.5556497573852539, + "learning_rate": 2.748754659985039e-06, + "loss": 0.5549, + "step": 2565 + }, + { + "epoch": 2.8702460850111855, + "grad_norm": 0.5541491508483887, + "learning_rate": 2.7472700023087153e-06, + "loss": 0.543, + "step": 2566 + }, + { + "epoch": 2.8713646532438477, + "grad_norm": 0.5541443228721619, + "learning_rate": 2.745785256560078e-06, + "loss": 0.5637, + "step": 2567 + }, + { + "epoch": 2.87248322147651, + "grad_norm": 0.5583652257919312, + "learning_rate": 2.744300423267963e-06, + "loss": 0.5699, + "step": 2568 + }, + { + "epoch": 2.873601789709172, + "grad_norm": 0.5382434725761414, + "learning_rate": 2.7428155029612345e-06, + "loss": 0.5431, + "step": 2569 + }, + { + "epoch": 2.8747203579418343, + "grad_norm": 0.5416514873504639, + "learning_rate": 2.7413304961687904e-06, + "loss": 0.5399, + "step": 2570 + }, + { + "epoch": 2.8758389261744965, + "grad_norm": 0.5517233610153198, + "learning_rate": 2.7398454034195588e-06, + "loss": 0.5485, + "step": 2571 + }, + { + "epoch": 2.8769574944071588, + "grad_norm": 0.5566003322601318, + "learning_rate": 2.7383602252424983e-06, + "loss": 0.537, + "step": 2572 + }, + { + "epoch": 2.878076062639821, + "grad_norm": 0.5358081459999084, + "learning_rate": 2.736874962166597e-06, + "loss": 0.5561, + "step": 2573 + }, + { + "epoch": 2.879194630872483, + "grad_norm": 0.551876425743103, + "learning_rate": 2.735389614720874e-06, + "loss": 0.5747, + "step": 2574 + }, + { + "epoch": 2.8803131991051454, + "grad_norm": 0.5637210011482239, + "learning_rate": 2.7339041834343795e-06, + "loss": 0.56, + "step": 2575 + }, + { + "epoch": 2.8814317673378076, + "grad_norm": 0.5494970679283142, + "learning_rate": 2.732418668836192e-06, + "loss": 0.5727, + "step": 2576 + }, + { + "epoch": 2.88255033557047, + "grad_norm": 0.5421887636184692, + "learning_rate": 2.7309330714554193e-06, + "loss": 0.5499, + "step": 2577 + }, + { + "epoch": 2.883668903803132, + "grad_norm": 0.5459975004196167, + "learning_rate": 2.729447391821201e-06, + "loss": 0.5588, + "step": 2578 + }, + { + "epoch": 2.884787472035794, + "grad_norm": 0.5452088713645935, + "learning_rate": 2.7279616304627044e-06, + "loss": 0.5453, + "step": 2579 + }, + { + "epoch": 2.8859060402684564, + "grad_norm": 0.5529616475105286, + "learning_rate": 2.726475787909125e-06, + "loss": 0.5528, + "step": 2580 + }, + { + "epoch": 2.8870246085011186, + "grad_norm": 0.5486506223678589, + "learning_rate": 2.7249898646896888e-06, + "loss": 0.5536, + "step": 2581 + }, + { + "epoch": 2.888143176733781, + "grad_norm": 0.560581386089325, + "learning_rate": 2.7235038613336503e-06, + "loss": 0.5556, + "step": 2582 + }, + { + "epoch": 2.889261744966443, + "grad_norm": 0.5591466426849365, + "learning_rate": 2.7220177783702913e-06, + "loss": 0.5528, + "step": 2583 + }, + { + "epoch": 2.8903803131991053, + "grad_norm": 0.5352728366851807, + "learning_rate": 2.720531616328925e-06, + "loss": 0.5602, + "step": 2584 + }, + { + "epoch": 2.8914988814317675, + "grad_norm": 0.5462122559547424, + "learning_rate": 2.7190453757388885e-06, + "loss": 0.5512, + "step": 2585 + }, + { + "epoch": 2.8926174496644297, + "grad_norm": 0.5758877396583557, + "learning_rate": 2.71755905712955e-06, + "loss": 0.5787, + "step": 2586 + }, + { + "epoch": 2.8937360178970915, + "grad_norm": 0.5675324201583862, + "learning_rate": 2.716072661030305e-06, + "loss": 0.5781, + "step": 2587 + }, + { + "epoch": 2.894854586129754, + "grad_norm": 0.5495246052742004, + "learning_rate": 2.714586187970576e-06, + "loss": 0.5608, + "step": 2588 + }, + { + "epoch": 2.895973154362416, + "grad_norm": 0.531521201133728, + "learning_rate": 2.713099638479813e-06, + "loss": 0.5444, + "step": 2589 + }, + { + "epoch": 2.8970917225950785, + "grad_norm": 0.5473596453666687, + "learning_rate": 2.7116130130874922e-06, + "loss": 0.5398, + "step": 2590 + }, + { + "epoch": 2.8982102908277403, + "grad_norm": 0.5658391714096069, + "learning_rate": 2.710126312323119e-06, + "loss": 0.5776, + "step": 2591 + }, + { + "epoch": 2.899328859060403, + "grad_norm": 0.5498483180999756, + "learning_rate": 2.708639536716225e-06, + "loss": 0.5609, + "step": 2592 + }, + { + "epoch": 2.9004474272930647, + "grad_norm": 0.5776868462562561, + "learning_rate": 2.707152686796367e-06, + "loss": 0.5571, + "step": 2593 + }, + { + "epoch": 2.901565995525727, + "grad_norm": 0.5405291318893433, + "learning_rate": 2.7056657630931298e-06, + "loss": 0.5472, + "step": 2594 + }, + { + "epoch": 2.902684563758389, + "grad_norm": 0.5518377423286438, + "learning_rate": 2.7041787661361243e-06, + "loss": 0.5348, + "step": 2595 + }, + { + "epoch": 2.9038031319910513, + "grad_norm": 0.5597490072250366, + "learning_rate": 2.7026916964549866e-06, + "loss": 0.5579, + "step": 2596 + }, + { + "epoch": 2.9049217002237135, + "grad_norm": 0.5632794499397278, + "learning_rate": 2.7012045545793793e-06, + "loss": 0.5881, + "step": 2597 + }, + { + "epoch": 2.9060402684563758, + "grad_norm": 0.5456726551055908, + "learning_rate": 2.699717341038991e-06, + "loss": 0.5566, + "step": 2598 + }, + { + "epoch": 2.907158836689038, + "grad_norm": 0.5435263514518738, + "learning_rate": 2.698230056363535e-06, + "loss": 0.5506, + "step": 2599 + }, + { + "epoch": 2.9082774049217, + "grad_norm": 0.5349182486534119, + "learning_rate": 2.696742701082752e-06, + "loss": 0.5537, + "step": 2600 + }, + { + "epoch": 2.9093959731543624, + "grad_norm": 0.5587396025657654, + "learning_rate": 2.6952552757264044e-06, + "loss": 0.5859, + "step": 2601 + }, + { + "epoch": 2.9105145413870246, + "grad_norm": 0.5457752346992493, + "learning_rate": 2.6937677808242824e-06, + "loss": 0.5436, + "step": 2602 + }, + { + "epoch": 2.911633109619687, + "grad_norm": 0.5724230408668518, + "learning_rate": 2.6922802169061997e-06, + "loss": 0.5911, + "step": 2603 + }, + { + "epoch": 2.912751677852349, + "grad_norm": 0.5488688349723816, + "learning_rate": 2.6907925845019944e-06, + "loss": 0.5629, + "step": 2604 + }, + { + "epoch": 2.913870246085011, + "grad_norm": 0.543271005153656, + "learning_rate": 2.6893048841415313e-06, + "loss": 0.5554, + "step": 2605 + }, + { + "epoch": 2.9149888143176734, + "grad_norm": 0.5463168621063232, + "learning_rate": 2.687817116354696e-06, + "loss": 0.5724, + "step": 2606 + }, + { + "epoch": 2.9161073825503356, + "grad_norm": 0.5488230586051941, + "learning_rate": 2.6863292816714005e-06, + "loss": 0.5425, + "step": 2607 + }, + { + "epoch": 2.917225950782998, + "grad_norm": 0.5395553112030029, + "learning_rate": 2.6848413806215793e-06, + "loss": 0.5764, + "step": 2608 + }, + { + "epoch": 2.91834451901566, + "grad_norm": 0.5511579513549805, + "learning_rate": 2.683353413735192e-06, + "loss": 0.5561, + "step": 2609 + }, + { + "epoch": 2.9194630872483223, + "grad_norm": 0.54108065366745, + "learning_rate": 2.6818653815422195e-06, + "loss": 0.5349, + "step": 2610 + }, + { + "epoch": 2.9205816554809845, + "grad_norm": 0.5389105081558228, + "learning_rate": 2.680377284572668e-06, + "loss": 0.5338, + "step": 2611 + }, + { + "epoch": 2.9217002237136467, + "grad_norm": 0.5608965754508972, + "learning_rate": 2.6788891233565656e-06, + "loss": 0.5656, + "step": 2612 + }, + { + "epoch": 2.922818791946309, + "grad_norm": 0.5487237572669983, + "learning_rate": 2.677400898423964e-06, + "loss": 0.5512, + "step": 2613 + }, + { + "epoch": 2.9239373601789707, + "grad_norm": 0.5492651462554932, + "learning_rate": 2.6759126103049372e-06, + "loss": 0.5183, + "step": 2614 + }, + { + "epoch": 2.9250559284116333, + "grad_norm": 0.569842517375946, + "learning_rate": 2.6744242595295817e-06, + "loss": 0.5702, + "step": 2615 + }, + { + "epoch": 2.926174496644295, + "grad_norm": 0.5518653392791748, + "learning_rate": 2.6729358466280157e-06, + "loss": 0.5426, + "step": 2616 + }, + { + "epoch": 2.9272930648769577, + "grad_norm": 0.5671966075897217, + "learning_rate": 2.671447372130382e-06, + "loss": 0.5429, + "step": 2617 + }, + { + "epoch": 2.9284116331096195, + "grad_norm": 0.5829213261604309, + "learning_rate": 2.6699588365668417e-06, + "loss": 0.587, + "step": 2618 + }, + { + "epoch": 2.929530201342282, + "grad_norm": 0.5526999235153198, + "learning_rate": 2.66847024046758e-06, + "loss": 0.533, + "step": 2619 + }, + { + "epoch": 2.930648769574944, + "grad_norm": 0.551668643951416, + "learning_rate": 2.6669815843628043e-06, + "loss": 0.5584, + "step": 2620 + }, + { + "epoch": 2.931767337807606, + "grad_norm": 0.5449694991111755, + "learning_rate": 2.6654928687827407e-06, + "loss": 0.5725, + "step": 2621 + }, + { + "epoch": 2.9328859060402683, + "grad_norm": 0.5545471906661987, + "learning_rate": 2.664004094257639e-06, + "loss": 0.5527, + "step": 2622 + }, + { + "epoch": 2.9340044742729305, + "grad_norm": 0.5464156270027161, + "learning_rate": 2.662515261317768e-06, + "loss": 0.5383, + "step": 2623 + }, + { + "epoch": 2.9351230425055927, + "grad_norm": 0.5514599084854126, + "learning_rate": 2.6610263704934185e-06, + "loss": 0.5362, + "step": 2624 + }, + { + "epoch": 2.936241610738255, + "grad_norm": 0.5682641863822937, + "learning_rate": 2.6595374223149033e-06, + "loss": 0.5618, + "step": 2625 + }, + { + "epoch": 2.937360178970917, + "grad_norm": 0.5577966570854187, + "learning_rate": 2.658048417312552e-06, + "loss": 0.5456, + "step": 2626 + }, + { + "epoch": 2.9384787472035794, + "grad_norm": 0.557861328125, + "learning_rate": 2.656559356016718e-06, + "loss": 0.5655, + "step": 2627 + }, + { + "epoch": 2.9395973154362416, + "grad_norm": 0.5510232448577881, + "learning_rate": 2.655070238957772e-06, + "loss": 0.571, + "step": 2628 + }, + { + "epoch": 2.940715883668904, + "grad_norm": 0.5610975027084351, + "learning_rate": 2.653581066666107e-06, + "loss": 0.5537, + "step": 2629 + }, + { + "epoch": 2.941834451901566, + "grad_norm": 0.5405017137527466, + "learning_rate": 2.652091839672134e-06, + "loss": 0.5721, + "step": 2630 + }, + { + "epoch": 2.942953020134228, + "grad_norm": 0.5660196542739868, + "learning_rate": 2.6506025585062837e-06, + "loss": 0.5455, + "step": 2631 + }, + { + "epoch": 2.9440715883668904, + "grad_norm": 0.5373615622520447, + "learning_rate": 2.6491132236990067e-06, + "loss": 0.563, + "step": 2632 + }, + { + "epoch": 2.9451901565995526, + "grad_norm": 0.530661404132843, + "learning_rate": 2.6476238357807726e-06, + "loss": 0.5758, + "step": 2633 + }, + { + "epoch": 2.946308724832215, + "grad_norm": 0.5533543229103088, + "learning_rate": 2.646134395282069e-06, + "loss": 0.5816, + "step": 2634 + }, + { + "epoch": 2.947427293064877, + "grad_norm": 0.5634375810623169, + "learning_rate": 2.644644902733403e-06, + "loss": 0.5539, + "step": 2635 + }, + { + "epoch": 2.9485458612975393, + "grad_norm": 0.539598286151886, + "learning_rate": 2.6431553586653003e-06, + "loss": 0.5361, + "step": 2636 + }, + { + "epoch": 2.9496644295302015, + "grad_norm": 0.5568811297416687, + "learning_rate": 2.641665763608306e-06, + "loss": 0.5716, + "step": 2637 + }, + { + "epoch": 2.9507829977628637, + "grad_norm": 0.5459252595901489, + "learning_rate": 2.6401761180929798e-06, + "loss": 0.5486, + "step": 2638 + }, + { + "epoch": 2.951901565995526, + "grad_norm": 0.5622868537902832, + "learning_rate": 2.6386864226499033e-06, + "loss": 0.5569, + "step": 2639 + }, + { + "epoch": 2.953020134228188, + "grad_norm": 0.5561519265174866, + "learning_rate": 2.637196677809674e-06, + "loss": 0.5792, + "step": 2640 + }, + { + "epoch": 2.95413870246085, + "grad_norm": 0.5592246055603027, + "learning_rate": 2.635706884102908e-06, + "loss": 0.5386, + "step": 2641 + }, + { + "epoch": 2.9552572706935125, + "grad_norm": 0.5718494653701782, + "learning_rate": 2.6342170420602375e-06, + "loss": 0.5803, + "step": 2642 + }, + { + "epoch": 2.9563758389261743, + "grad_norm": 0.5384505391120911, + "learning_rate": 2.6327271522123114e-06, + "loss": 0.5521, + "step": 2643 + }, + { + "epoch": 2.957494407158837, + "grad_norm": 0.5544297099113464, + "learning_rate": 2.6312372150897985e-06, + "loss": 0.5515, + "step": 2644 + }, + { + "epoch": 2.9586129753914987, + "grad_norm": 0.5528863072395325, + "learning_rate": 2.6297472312233824e-06, + "loss": 0.5426, + "step": 2645 + }, + { + "epoch": 2.959731543624161, + "grad_norm": 0.5625507235527039, + "learning_rate": 2.628257201143762e-06, + "loss": 0.5543, + "step": 2646 + }, + { + "epoch": 2.960850111856823, + "grad_norm": 0.53592848777771, + "learning_rate": 2.6267671253816566e-06, + "loss": 0.5246, + "step": 2647 + }, + { + "epoch": 2.9619686800894853, + "grad_norm": 0.546793520450592, + "learning_rate": 2.625277004467798e-06, + "loss": 0.5297, + "step": 2648 + }, + { + "epoch": 2.9630872483221475, + "grad_norm": 0.5562222599983215, + "learning_rate": 2.6237868389329374e-06, + "loss": 0.54, + "step": 2649 + }, + { + "epoch": 2.9642058165548097, + "grad_norm": 0.5625802874565125, + "learning_rate": 2.6222966293078374e-06, + "loss": 0.5543, + "step": 2650 + }, + { + "epoch": 2.965324384787472, + "grad_norm": 0.5509119629859924, + "learning_rate": 2.6208063761232804e-06, + "loss": 0.5934, + "step": 2651 + }, + { + "epoch": 2.966442953020134, + "grad_norm": 0.5902206301689148, + "learning_rate": 2.6193160799100633e-06, + "loss": 0.5806, + "step": 2652 + }, + { + "epoch": 2.9675615212527964, + "grad_norm": 0.559611976146698, + "learning_rate": 2.617825741198997e-06, + "loss": 0.5643, + "step": 2653 + }, + { + "epoch": 2.9686800894854586, + "grad_norm": 0.5830098390579224, + "learning_rate": 2.616335360520909e-06, + "loss": 0.5307, + "step": 2654 + }, + { + "epoch": 2.969798657718121, + "grad_norm": 0.5459845662117004, + "learning_rate": 2.6148449384066403e-06, + "loss": 0.5436, + "step": 2655 + }, + { + "epoch": 2.970917225950783, + "grad_norm": 0.5625338554382324, + "learning_rate": 2.6133544753870487e-06, + "loss": 0.5497, + "step": 2656 + }, + { + "epoch": 2.972035794183445, + "grad_norm": 0.5530948042869568, + "learning_rate": 2.6118639719930057e-06, + "loss": 0.5711, + "step": 2657 + }, + { + "epoch": 2.9731543624161074, + "grad_norm": 0.5448921918869019, + "learning_rate": 2.610373428755395e-06, + "loss": 0.5406, + "step": 2658 + }, + { + "epoch": 2.9742729306487696, + "grad_norm": 0.5586265921592712, + "learning_rate": 2.6088828462051174e-06, + "loss": 0.5419, + "step": 2659 + }, + { + "epoch": 2.975391498881432, + "grad_norm": 0.5703328251838684, + "learning_rate": 2.6073922248730872e-06, + "loss": 0.5448, + "step": 2660 + }, + { + "epoch": 2.976510067114094, + "grad_norm": 0.5623937249183655, + "learning_rate": 2.605901565290232e-06, + "loss": 0.5531, + "step": 2661 + }, + { + "epoch": 2.9776286353467563, + "grad_norm": 0.5540557503700256, + "learning_rate": 2.604410867987492e-06, + "loss": 0.5645, + "step": 2662 + }, + { + "epoch": 2.9787472035794185, + "grad_norm": 0.550144374370575, + "learning_rate": 2.602920133495822e-06, + "loss": 0.5392, + "step": 2663 + }, + { + "epoch": 2.9798657718120807, + "grad_norm": 0.5573859214782715, + "learning_rate": 2.601429362346192e-06, + "loss": 0.5736, + "step": 2664 + }, + { + "epoch": 2.980984340044743, + "grad_norm": 0.5610393285751343, + "learning_rate": 2.59993855506958e-06, + "loss": 0.5406, + "step": 2665 + }, + { + "epoch": 2.9821029082774047, + "grad_norm": 0.5900465250015259, + "learning_rate": 2.598447712196982e-06, + "loss": 0.588, + "step": 2666 + }, + { + "epoch": 2.9832214765100673, + "grad_norm": 0.5596694350242615, + "learning_rate": 2.596956834259403e-06, + "loss": 0.5363, + "step": 2667 + }, + { + "epoch": 2.984340044742729, + "grad_norm": 0.5899379849433899, + "learning_rate": 2.5954659217878625e-06, + "loss": 0.5757, + "step": 2668 + }, + { + "epoch": 2.9854586129753917, + "grad_norm": 0.5596752762794495, + "learning_rate": 2.593974975313393e-06, + "loss": 0.5709, + "step": 2669 + }, + { + "epoch": 2.9865771812080535, + "grad_norm": 0.5592782497406006, + "learning_rate": 2.592483995367036e-06, + "loss": 0.5512, + "step": 2670 + }, + { + "epoch": 2.987695749440716, + "grad_norm": 0.5398252606391907, + "learning_rate": 2.590992982479848e-06, + "loss": 0.5378, + "step": 2671 + }, + { + "epoch": 2.988814317673378, + "grad_norm": 0.5614280700683594, + "learning_rate": 2.5895019371828957e-06, + "loss": 0.5633, + "step": 2672 + }, + { + "epoch": 2.98993288590604, + "grad_norm": 0.5591630935668945, + "learning_rate": 2.5880108600072583e-06, + "loss": 0.5457, + "step": 2673 + }, + { + "epoch": 2.9910514541387023, + "grad_norm": 0.5614907145500183, + "learning_rate": 2.586519751484026e-06, + "loss": 0.5737, + "step": 2674 + }, + { + "epoch": 2.9921700223713645, + "grad_norm": 0.5646727681159973, + "learning_rate": 2.5850286121442974e-06, + "loss": 0.5651, + "step": 2675 + }, + { + "epoch": 2.9932885906040267, + "grad_norm": 0.5485831499099731, + "learning_rate": 2.5835374425191867e-06, + "loss": 0.5408, + "step": 2676 + }, + { + "epoch": 2.994407158836689, + "grad_norm": 0.5491880178451538, + "learning_rate": 2.582046243139817e-06, + "loss": 0.5428, + "step": 2677 + }, + { + "epoch": 2.995525727069351, + "grad_norm": 0.5322139263153076, + "learning_rate": 2.5805550145373217e-06, + "loss": 0.5377, + "step": 2678 + }, + { + "epoch": 2.9966442953020134, + "grad_norm": 0.5418295860290527, + "learning_rate": 2.5790637572428432e-06, + "loss": 0.5475, + "step": 2679 + }, + { + "epoch": 2.9977628635346756, + "grad_norm": 0.566874086856842, + "learning_rate": 2.5775724717875372e-06, + "loss": 0.5761, + "step": 2680 + }, + { + "epoch": 2.998881431767338, + "grad_norm": 0.5525014996528625, + "learning_rate": 2.576081158702567e-06, + "loss": 0.5545, + "step": 2681 + }, + { + "epoch": 3.0, + "grad_norm": 0.5818064212799072, + "learning_rate": 2.5745898185191075e-06, + "loss": 0.5229, + "step": 2682 + }, + { + "epoch": 3.001118568232662, + "grad_norm": 0.5252112150192261, + "learning_rate": 2.5730984517683414e-06, + "loss": 0.5138, + "step": 2683 + }, + { + "epoch": 3.0022371364653244, + "grad_norm": 0.5460484623908997, + "learning_rate": 2.5716070589814622e-06, + "loss": 0.542, + "step": 2684 + }, + { + "epoch": 3.0033557046979866, + "grad_norm": 0.5523890852928162, + "learning_rate": 2.5701156406896726e-06, + "loss": 0.5609, + "step": 2685 + }, + { + "epoch": 3.004474272930649, + "grad_norm": 0.5453260540962219, + "learning_rate": 2.568624197424184e-06, + "loss": 0.5331, + "step": 2686 + }, + { + "epoch": 3.005592841163311, + "grad_norm": 0.5523380041122437, + "learning_rate": 2.567132729716216e-06, + "loss": 0.533, + "step": 2687 + }, + { + "epoch": 3.0067114093959733, + "grad_norm": 0.5618038177490234, + "learning_rate": 2.5656412380969975e-06, + "loss": 0.5239, + "step": 2688 + }, + { + "epoch": 3.0078299776286355, + "grad_norm": 0.5418974161148071, + "learning_rate": 2.5641497230977673e-06, + "loss": 0.5243, + "step": 2689 + }, + { + "epoch": 3.0089485458612977, + "grad_norm": 0.5615602135658264, + "learning_rate": 2.5626581852497713e-06, + "loss": 0.495, + "step": 2690 + }, + { + "epoch": 3.01006711409396, + "grad_norm": 0.5758011341094971, + "learning_rate": 2.5611666250842617e-06, + "loss": 0.5775, + "step": 2691 + }, + { + "epoch": 3.011185682326622, + "grad_norm": 0.5791272521018982, + "learning_rate": 2.5596750431325022e-06, + "loss": 0.5382, + "step": 2692 + }, + { + "epoch": 3.0123042505592843, + "grad_norm": 0.5698820352554321, + "learning_rate": 2.558183439925762e-06, + "loss": 0.5407, + "step": 2693 + }, + { + "epoch": 3.0134228187919465, + "grad_norm": 0.5475990772247314, + "learning_rate": 2.5566918159953193e-06, + "loss": 0.5178, + "step": 2694 + }, + { + "epoch": 3.0145413870246087, + "grad_norm": 0.555365800857544, + "learning_rate": 2.555200171872458e-06, + "loss": 0.5384, + "step": 2695 + }, + { + "epoch": 3.0156599552572705, + "grad_norm": 0.5880224108695984, + "learning_rate": 2.5537085080884694e-06, + "loss": 0.5501, + "step": 2696 + }, + { + "epoch": 3.0167785234899327, + "grad_norm": 0.599018931388855, + "learning_rate": 2.5522168251746534e-06, + "loss": 0.556, + "step": 2697 + }, + { + "epoch": 3.017897091722595, + "grad_norm": 0.5536356568336487, + "learning_rate": 2.5507251236623147e-06, + "loss": 0.5237, + "step": 2698 + }, + { + "epoch": 3.019015659955257, + "grad_norm": 0.5767291188240051, + "learning_rate": 2.549233404082767e-06, + "loss": 0.5588, + "step": 2699 + }, + { + "epoch": 3.0201342281879193, + "grad_norm": 0.549694836139679, + "learning_rate": 2.5477416669673276e-06, + "loss": 0.5218, + "step": 2700 + }, + { + "epoch": 3.0212527964205815, + "grad_norm": 0.5600660443305969, + "learning_rate": 2.5462499128473227e-06, + "loss": 0.5308, + "step": 2701 + }, + { + "epoch": 3.0223713646532437, + "grad_norm": 0.5611363649368286, + "learning_rate": 2.5447581422540834e-06, + "loss": 0.5447, + "step": 2702 + }, + { + "epoch": 3.023489932885906, + "grad_norm": 0.5761244893074036, + "learning_rate": 2.5432663557189458e-06, + "loss": 0.5463, + "step": 2703 + }, + { + "epoch": 3.024608501118568, + "grad_norm": 0.5451679825782776, + "learning_rate": 2.5417745537732524e-06, + "loss": 0.502, + "step": 2704 + }, + { + "epoch": 3.0257270693512304, + "grad_norm": 0.5496947765350342, + "learning_rate": 2.540282736948352e-06, + "loss": 0.5142, + "step": 2705 + }, + { + "epoch": 3.0268456375838926, + "grad_norm": 0.5574445724487305, + "learning_rate": 2.5387909057755994e-06, + "loss": 0.5396, + "step": 2706 + }, + { + "epoch": 3.027964205816555, + "grad_norm": 0.5375776290893555, + "learning_rate": 2.53729906078635e-06, + "loss": 0.5568, + "step": 2707 + }, + { + "epoch": 3.029082774049217, + "grad_norm": 0.558097779750824, + "learning_rate": 2.535807202511969e-06, + "loss": 0.5392, + "step": 2708 + }, + { + "epoch": 3.030201342281879, + "grad_norm": 0.5726040005683899, + "learning_rate": 2.5343153314838242e-06, + "loss": 0.5525, + "step": 2709 + }, + { + "epoch": 3.0313199105145414, + "grad_norm": 0.5495469570159912, + "learning_rate": 2.532823448233289e-06, + "loss": 0.5307, + "step": 2710 + }, + { + "epoch": 3.0324384787472036, + "grad_norm": 0.5759256482124329, + "learning_rate": 2.53133155329174e-06, + "loss": 0.5377, + "step": 2711 + }, + { + "epoch": 3.033557046979866, + "grad_norm": 0.5546212196350098, + "learning_rate": 2.5298396471905577e-06, + "loss": 0.536, + "step": 2712 + }, + { + "epoch": 3.034675615212528, + "grad_norm": 0.5617177486419678, + "learning_rate": 2.5283477304611283e-06, + "loss": 0.5252, + "step": 2713 + }, + { + "epoch": 3.0357941834451903, + "grad_norm": 0.5784680843353271, + "learning_rate": 2.5268558036348412e-06, + "loss": 0.5414, + "step": 2714 + }, + { + "epoch": 3.0369127516778525, + "grad_norm": 0.5552501678466797, + "learning_rate": 2.5253638672430873e-06, + "loss": 0.523, + "step": 2715 + }, + { + "epoch": 3.0380313199105147, + "grad_norm": 0.5609502792358398, + "learning_rate": 2.5238719218172638e-06, + "loss": 0.5378, + "step": 2716 + }, + { + "epoch": 3.039149888143177, + "grad_norm": 0.5623546242713928, + "learning_rate": 2.52237996788877e-06, + "loss": 0.5341, + "step": 2717 + }, + { + "epoch": 3.040268456375839, + "grad_norm": 0.5461615324020386, + "learning_rate": 2.520888005989008e-06, + "loss": 0.5167, + "step": 2718 + }, + { + "epoch": 3.0413870246085013, + "grad_norm": 0.5605127811431885, + "learning_rate": 2.5193960366493825e-06, + "loss": 0.517, + "step": 2719 + }, + { + "epoch": 3.0425055928411635, + "grad_norm": 0.5874475836753845, + "learning_rate": 2.517904060401301e-06, + "loss": 0.551, + "step": 2720 + }, + { + "epoch": 3.0436241610738257, + "grad_norm": 0.5500011444091797, + "learning_rate": 2.5164120777761747e-06, + "loss": 0.5508, + "step": 2721 + }, + { + "epoch": 3.0447427293064875, + "grad_norm": 0.5621449947357178, + "learning_rate": 2.5149200893054153e-06, + "loss": 0.5392, + "step": 2722 + }, + { + "epoch": 3.0458612975391497, + "grad_norm": 0.5683169960975647, + "learning_rate": 2.5134280955204377e-06, + "loss": 0.5454, + "step": 2723 + }, + { + "epoch": 3.046979865771812, + "grad_norm": 0.5664412975311279, + "learning_rate": 2.511936096952658e-06, + "loss": 0.5338, + "step": 2724 + }, + { + "epoch": 3.048098434004474, + "grad_norm": 0.5398328304290771, + "learning_rate": 2.5104440941334947e-06, + "loss": 0.5139, + "step": 2725 + }, + { + "epoch": 3.0492170022371363, + "grad_norm": 0.5616039633750916, + "learning_rate": 2.5089520875943672e-06, + "loss": 0.5144, + "step": 2726 + }, + { + "epoch": 3.0503355704697985, + "grad_norm": 0.567246675491333, + "learning_rate": 2.507460077866697e-06, + "loss": 0.5436, + "step": 2727 + }, + { + "epoch": 3.0514541387024607, + "grad_norm": 0.5571737289428711, + "learning_rate": 2.5059680654819053e-06, + "loss": 0.5305, + "step": 2728 + }, + { + "epoch": 3.052572706935123, + "grad_norm": 0.5480892658233643, + "learning_rate": 2.5044760509714156e-06, + "loss": 0.5024, + "step": 2729 + }, + { + "epoch": 3.053691275167785, + "grad_norm": 0.5547569394111633, + "learning_rate": 2.502984034866652e-06, + "loss": 0.5311, + "step": 2730 + }, + { + "epoch": 3.0548098434004474, + "grad_norm": 0.5570432543754578, + "learning_rate": 2.5014920176990388e-06, + "loss": 0.5459, + "step": 2731 + }, + { + "epoch": 3.0559284116331096, + "grad_norm": 0.5663108229637146, + "learning_rate": 2.5e-06, + "loss": 0.5138, + "step": 2732 + }, + { + "epoch": 3.057046979865772, + "grad_norm": 0.5601660013198853, + "learning_rate": 2.4985079823009625e-06, + "loss": 0.5072, + "step": 2733 + }, + { + "epoch": 3.058165548098434, + "grad_norm": 0.5544062852859497, + "learning_rate": 2.497015965133349e-06, + "loss": 0.5374, + "step": 2734 + }, + { + "epoch": 3.059284116331096, + "grad_norm": 0.5805025696754456, + "learning_rate": 2.4955239490285857e-06, + "loss": 0.5152, + "step": 2735 + }, + { + "epoch": 3.0604026845637584, + "grad_norm": 0.580656886100769, + "learning_rate": 2.4940319345180955e-06, + "loss": 0.56, + "step": 2736 + }, + { + "epoch": 3.0615212527964206, + "grad_norm": 0.5633717775344849, + "learning_rate": 2.492539922133304e-06, + "loss": 0.5338, + "step": 2737 + }, + { + "epoch": 3.062639821029083, + "grad_norm": 0.5653939843177795, + "learning_rate": 2.491047912405633e-06, + "loss": 0.5236, + "step": 2738 + }, + { + "epoch": 3.063758389261745, + "grad_norm": 0.5767054557800293, + "learning_rate": 2.4895559058665065e-06, + "loss": 0.5508, + "step": 2739 + }, + { + "epoch": 3.0648769574944073, + "grad_norm": 0.5750643014907837, + "learning_rate": 2.4880639030473423e-06, + "loss": 0.5394, + "step": 2740 + }, + { + "epoch": 3.0659955257270695, + "grad_norm": 0.5739333629608154, + "learning_rate": 2.486571904479563e-06, + "loss": 0.5501, + "step": 2741 + }, + { + "epoch": 3.0671140939597317, + "grad_norm": 0.5624204277992249, + "learning_rate": 2.485079910694585e-06, + "loss": 0.5051, + "step": 2742 + }, + { + "epoch": 3.068232662192394, + "grad_norm": 0.5817630887031555, + "learning_rate": 2.483587922223826e-06, + "loss": 0.5614, + "step": 2743 + }, + { + "epoch": 3.069351230425056, + "grad_norm": 0.5646341443061829, + "learning_rate": 2.4820959395987e-06, + "loss": 0.525, + "step": 2744 + }, + { + "epoch": 3.0704697986577183, + "grad_norm": 0.5615726113319397, + "learning_rate": 2.4806039633506184e-06, + "loss": 0.5239, + "step": 2745 + }, + { + "epoch": 3.0715883668903805, + "grad_norm": 0.5744274258613586, + "learning_rate": 2.479111994010993e-06, + "loss": 0.5514, + "step": 2746 + }, + { + "epoch": 3.0727069351230427, + "grad_norm": 0.559465765953064, + "learning_rate": 2.4776200321112307e-06, + "loss": 0.4935, + "step": 2747 + }, + { + "epoch": 3.073825503355705, + "grad_norm": 0.5706081390380859, + "learning_rate": 2.476128078182737e-06, + "loss": 0.5202, + "step": 2748 + }, + { + "epoch": 3.0749440715883667, + "grad_norm": 0.58463054895401, + "learning_rate": 2.474636132756913e-06, + "loss": 0.5514, + "step": 2749 + }, + { + "epoch": 3.076062639821029, + "grad_norm": 0.5574572682380676, + "learning_rate": 2.4731441963651596e-06, + "loss": 0.5606, + "step": 2750 + }, + { + "epoch": 3.077181208053691, + "grad_norm": 0.5571308135986328, + "learning_rate": 2.471652269538872e-06, + "loss": 0.5022, + "step": 2751 + }, + { + "epoch": 3.0782997762863533, + "grad_norm": 0.5649480223655701, + "learning_rate": 2.470160352809443e-06, + "loss": 0.5593, + "step": 2752 + }, + { + "epoch": 3.0794183445190155, + "grad_norm": 0.5802405476570129, + "learning_rate": 2.4686684467082604e-06, + "loss": 0.5598, + "step": 2753 + }, + { + "epoch": 3.0805369127516777, + "grad_norm": 0.5596590638160706, + "learning_rate": 2.4671765517667114e-06, + "loss": 0.5391, + "step": 2754 + }, + { + "epoch": 3.08165548098434, + "grad_norm": 0.5846322178840637, + "learning_rate": 2.4656846685161766e-06, + "loss": 0.5521, + "step": 2755 + }, + { + "epoch": 3.082774049217002, + "grad_norm": 0.5704472064971924, + "learning_rate": 2.4641927974880317e-06, + "loss": 0.5652, + "step": 2756 + }, + { + "epoch": 3.0838926174496644, + "grad_norm": 0.5620293021202087, + "learning_rate": 2.462700939213651e-06, + "loss": 0.519, + "step": 2757 + }, + { + "epoch": 3.0850111856823266, + "grad_norm": 0.5806359648704529, + "learning_rate": 2.4612090942244015e-06, + "loss": 0.5728, + "step": 2758 + }, + { + "epoch": 3.086129753914989, + "grad_norm": 0.5596415400505066, + "learning_rate": 2.4597172630516487e-06, + "loss": 0.5349, + "step": 2759 + }, + { + "epoch": 3.087248322147651, + "grad_norm": 0.5782260894775391, + "learning_rate": 2.4582254462267476e-06, + "loss": 0.5079, + "step": 2760 + }, + { + "epoch": 3.088366890380313, + "grad_norm": 0.5587027668952942, + "learning_rate": 2.456733644281055e-06, + "loss": 0.5411, + "step": 2761 + }, + { + "epoch": 3.0894854586129754, + "grad_norm": 0.5730692744255066, + "learning_rate": 2.455241857745917e-06, + "loss": 0.5215, + "step": 2762 + }, + { + "epoch": 3.0906040268456376, + "grad_norm": 0.5701208710670471, + "learning_rate": 2.453750087152678e-06, + "loss": 0.5044, + "step": 2763 + }, + { + "epoch": 3.0917225950783, + "grad_norm": 0.5539160370826721, + "learning_rate": 2.4522583330326736e-06, + "loss": 0.541, + "step": 2764 + }, + { + "epoch": 3.092841163310962, + "grad_norm": 0.5716198682785034, + "learning_rate": 2.4507665959172337e-06, + "loss": 0.5263, + "step": 2765 + }, + { + "epoch": 3.0939597315436242, + "grad_norm": 0.5674124956130981, + "learning_rate": 2.449274876337686e-06, + "loss": 0.5302, + "step": 2766 + }, + { + "epoch": 3.0950782997762865, + "grad_norm": 0.582098662853241, + "learning_rate": 2.447783174825348e-06, + "loss": 0.537, + "step": 2767 + }, + { + "epoch": 3.0961968680089487, + "grad_norm": 0.5744993090629578, + "learning_rate": 2.446291491911532e-06, + "loss": 0.5409, + "step": 2768 + }, + { + "epoch": 3.097315436241611, + "grad_norm": 0.5894611477851868, + "learning_rate": 2.444799828127543e-06, + "loss": 0.5431, + "step": 2769 + }, + { + "epoch": 3.098434004474273, + "grad_norm": 0.5880317687988281, + "learning_rate": 2.443308184004681e-06, + "loss": 0.5188, + "step": 2770 + }, + { + "epoch": 3.0995525727069353, + "grad_norm": 0.5492719411849976, + "learning_rate": 2.441816560074238e-06, + "loss": 0.5154, + "step": 2771 + }, + { + "epoch": 3.1006711409395975, + "grad_norm": 0.5766882300376892, + "learning_rate": 2.440324956867498e-06, + "loss": 0.5555, + "step": 2772 + }, + { + "epoch": 3.1017897091722597, + "grad_norm": 0.5702850222587585, + "learning_rate": 2.4388333749157383e-06, + "loss": 0.5443, + "step": 2773 + }, + { + "epoch": 3.1029082774049215, + "grad_norm": 0.5869524478912354, + "learning_rate": 2.43734181475023e-06, + "loss": 0.5419, + "step": 2774 + }, + { + "epoch": 3.1040268456375837, + "grad_norm": 0.5696037411689758, + "learning_rate": 2.4358502769022335e-06, + "loss": 0.5493, + "step": 2775 + }, + { + "epoch": 3.105145413870246, + "grad_norm": 0.5568296909332275, + "learning_rate": 2.4343587619030033e-06, + "loss": 0.534, + "step": 2776 + }, + { + "epoch": 3.106263982102908, + "grad_norm": 0.5688674449920654, + "learning_rate": 2.4328672702837854e-06, + "loss": 0.5583, + "step": 2777 + }, + { + "epoch": 3.1073825503355703, + "grad_norm": 0.5805652737617493, + "learning_rate": 2.4313758025758166e-06, + "loss": 0.5495, + "step": 2778 + }, + { + "epoch": 3.1085011185682325, + "grad_norm": 0.5927707552909851, + "learning_rate": 2.429884359310328e-06, + "loss": 0.5297, + "step": 2779 + }, + { + "epoch": 3.1096196868008947, + "grad_norm": 0.5667550563812256, + "learning_rate": 2.428392941018538e-06, + "loss": 0.5479, + "step": 2780 + }, + { + "epoch": 3.110738255033557, + "grad_norm": 0.5672300457954407, + "learning_rate": 2.4269015482316594e-06, + "loss": 0.5363, + "step": 2781 + }, + { + "epoch": 3.111856823266219, + "grad_norm": 0.5730635523796082, + "learning_rate": 2.4254101814808925e-06, + "loss": 0.5306, + "step": 2782 + }, + { + "epoch": 3.1129753914988814, + "grad_norm": 0.587618350982666, + "learning_rate": 2.4239188412974338e-06, + "loss": 0.5534, + "step": 2783 + }, + { + "epoch": 3.1140939597315436, + "grad_norm": 0.5512918829917908, + "learning_rate": 2.422427528212463e-06, + "loss": 0.5244, + "step": 2784 + }, + { + "epoch": 3.115212527964206, + "grad_norm": 0.5900892615318298, + "learning_rate": 2.4209362427571576e-06, + "loss": 0.5444, + "step": 2785 + }, + { + "epoch": 3.116331096196868, + "grad_norm": 0.5613123774528503, + "learning_rate": 2.41944498546268e-06, + "loss": 0.5174, + "step": 2786 + }, + { + "epoch": 3.11744966442953, + "grad_norm": 0.5768151879310608, + "learning_rate": 2.417953756860184e-06, + "loss": 0.5676, + "step": 2787 + }, + { + "epoch": 3.1185682326621924, + "grad_norm": 0.5624108910560608, + "learning_rate": 2.4164625574808145e-06, + "loss": 0.5029, + "step": 2788 + }, + { + "epoch": 3.1196868008948546, + "grad_norm": 0.5923402905464172, + "learning_rate": 2.4149713878557034e-06, + "loss": 0.5313, + "step": 2789 + }, + { + "epoch": 3.120805369127517, + "grad_norm": 0.5879419445991516, + "learning_rate": 2.413480248515976e-06, + "loss": 0.564, + "step": 2790 + }, + { + "epoch": 3.121923937360179, + "grad_norm": 0.5634798407554626, + "learning_rate": 2.411989139992742e-06, + "loss": 0.5333, + "step": 2791 + }, + { + "epoch": 3.1230425055928412, + "grad_norm": 0.5679693818092346, + "learning_rate": 2.4104980628171047e-06, + "loss": 0.5376, + "step": 2792 + }, + { + "epoch": 3.1241610738255035, + "grad_norm": 0.5618317723274231, + "learning_rate": 2.409007017520152e-06, + "loss": 0.5225, + "step": 2793 + }, + { + "epoch": 3.1252796420581657, + "grad_norm": 0.5773863792419434, + "learning_rate": 2.4075160046329647e-06, + "loss": 0.5373, + "step": 2794 + }, + { + "epoch": 3.126398210290828, + "grad_norm": 0.5908944606781006, + "learning_rate": 2.4060250246866075e-06, + "loss": 0.5428, + "step": 2795 + }, + { + "epoch": 3.12751677852349, + "grad_norm": 0.5735543966293335, + "learning_rate": 2.404534078212138e-06, + "loss": 0.5437, + "step": 2796 + }, + { + "epoch": 3.1286353467561523, + "grad_norm": 0.5470800995826721, + "learning_rate": 2.4030431657405985e-06, + "loss": 0.5335, + "step": 2797 + }, + { + "epoch": 3.1297539149888145, + "grad_norm": 0.5766976475715637, + "learning_rate": 2.401552287803019e-06, + "loss": 0.5349, + "step": 2798 + }, + { + "epoch": 3.1308724832214767, + "grad_norm": 0.5561223030090332, + "learning_rate": 2.400061444930421e-06, + "loss": 0.554, + "step": 2799 + }, + { + "epoch": 3.131991051454139, + "grad_norm": 0.5804010033607483, + "learning_rate": 2.398570637653809e-06, + "loss": 0.555, + "step": 2800 + }, + { + "epoch": 3.1331096196868007, + "grad_norm": 0.5694336295127869, + "learning_rate": 2.3970798665041785e-06, + "loss": 0.5493, + "step": 2801 + }, + { + "epoch": 3.134228187919463, + "grad_norm": 0.5735706686973572, + "learning_rate": 2.3955891320125085e-06, + "loss": 0.5363, + "step": 2802 + }, + { + "epoch": 3.135346756152125, + "grad_norm": 0.572468101978302, + "learning_rate": 2.394098434709769e-06, + "loss": 0.5543, + "step": 2803 + }, + { + "epoch": 3.1364653243847873, + "grad_norm": 0.5897607803344727, + "learning_rate": 2.392607775126913e-06, + "loss": 0.5604, + "step": 2804 + }, + { + "epoch": 3.1375838926174495, + "grad_norm": 0.5683246850967407, + "learning_rate": 2.3911171537948834e-06, + "loss": 0.5491, + "step": 2805 + }, + { + "epoch": 3.1387024608501117, + "grad_norm": 0.5807965397834778, + "learning_rate": 2.3896265712446052e-06, + "loss": 0.5612, + "step": 2806 + }, + { + "epoch": 3.139821029082774, + "grad_norm": 0.5941694974899292, + "learning_rate": 2.3881360280069955e-06, + "loss": 0.5472, + "step": 2807 + }, + { + "epoch": 3.140939597315436, + "grad_norm": 0.5739585161209106, + "learning_rate": 2.3866455246129525e-06, + "loss": 0.5433, + "step": 2808 + }, + { + "epoch": 3.1420581655480984, + "grad_norm": 0.572344183921814, + "learning_rate": 2.38515506159336e-06, + "loss": 0.5311, + "step": 2809 + }, + { + "epoch": 3.1431767337807606, + "grad_norm": 0.5814085602760315, + "learning_rate": 2.3836646394790923e-06, + "loss": 0.5271, + "step": 2810 + }, + { + "epoch": 3.1442953020134228, + "grad_norm": 0.5764447450637817, + "learning_rate": 2.3821742588010037e-06, + "loss": 0.5643, + "step": 2811 + }, + { + "epoch": 3.145413870246085, + "grad_norm": 0.5797690153121948, + "learning_rate": 2.380683920089938e-06, + "loss": 0.5545, + "step": 2812 + }, + { + "epoch": 3.146532438478747, + "grad_norm": 0.5764464735984802, + "learning_rate": 2.3791936238767196e-06, + "loss": 0.5413, + "step": 2813 + }, + { + "epoch": 3.1476510067114094, + "grad_norm": 0.578752875328064, + "learning_rate": 2.377703370692163e-06, + "loss": 0.5506, + "step": 2814 + }, + { + "epoch": 3.1487695749440716, + "grad_norm": 0.5465778708457947, + "learning_rate": 2.3762131610670634e-06, + "loss": 0.5236, + "step": 2815 + }, + { + "epoch": 3.149888143176734, + "grad_norm": 0.56758052110672, + "learning_rate": 2.3747229955322022e-06, + "loss": 0.5451, + "step": 2816 + }, + { + "epoch": 3.151006711409396, + "grad_norm": 0.5690299272537231, + "learning_rate": 2.3732328746183442e-06, + "loss": 0.5406, + "step": 2817 + }, + { + "epoch": 3.1521252796420582, + "grad_norm": 0.5651488900184631, + "learning_rate": 2.371742798856238e-06, + "loss": 0.533, + "step": 2818 + }, + { + "epoch": 3.1532438478747205, + "grad_norm": 0.5636695623397827, + "learning_rate": 2.370252768776619e-06, + "loss": 0.5316, + "step": 2819 + }, + { + "epoch": 3.1543624161073827, + "grad_norm": 0.5816517472267151, + "learning_rate": 2.3687627849102023e-06, + "loss": 0.5273, + "step": 2820 + }, + { + "epoch": 3.155480984340045, + "grad_norm": 0.5666426420211792, + "learning_rate": 2.36727284778769e-06, + "loss": 0.5177, + "step": 2821 + }, + { + "epoch": 3.156599552572707, + "grad_norm": 0.5623897910118103, + "learning_rate": 2.3657829579397633e-06, + "loss": 0.4996, + "step": 2822 + }, + { + "epoch": 3.1577181208053693, + "grad_norm": 0.5772681832313538, + "learning_rate": 2.3642931158970927e-06, + "loss": 0.5438, + "step": 2823 + }, + { + "epoch": 3.1588366890380315, + "grad_norm": 0.6019139885902405, + "learning_rate": 2.3628033221903262e-06, + "loss": 0.5523, + "step": 2824 + }, + { + "epoch": 3.1599552572706937, + "grad_norm": 0.5670963525772095, + "learning_rate": 2.361313577350097e-06, + "loss": 0.5314, + "step": 2825 + }, + { + "epoch": 3.1610738255033555, + "grad_norm": 0.5672596096992493, + "learning_rate": 2.3598238819070206e-06, + "loss": 0.5244, + "step": 2826 + }, + { + "epoch": 3.162192393736018, + "grad_norm": 0.552393913269043, + "learning_rate": 2.358334236391695e-06, + "loss": 0.5248, + "step": 2827 + }, + { + "epoch": 3.16331096196868, + "grad_norm": 0.5768967866897583, + "learning_rate": 2.3568446413347e-06, + "loss": 0.5339, + "step": 2828 + }, + { + "epoch": 3.164429530201342, + "grad_norm": 0.5951476693153381, + "learning_rate": 2.3553550972665977e-06, + "loss": 0.5465, + "step": 2829 + }, + { + "epoch": 3.1655480984340043, + "grad_norm": 0.5732336640357971, + "learning_rate": 2.353865604717932e-06, + "loss": 0.5267, + "step": 2830 + }, + { + "epoch": 3.1666666666666665, + "grad_norm": 0.5979350805282593, + "learning_rate": 2.352376164219228e-06, + "loss": 0.5375, + "step": 2831 + }, + { + "epoch": 3.1677852348993287, + "grad_norm": 0.5624974966049194, + "learning_rate": 2.3508867763009937e-06, + "loss": 0.5218, + "step": 2832 + }, + { + "epoch": 3.168903803131991, + "grad_norm": 0.5693974494934082, + "learning_rate": 2.3493974414937167e-06, + "loss": 0.5336, + "step": 2833 + }, + { + "epoch": 3.170022371364653, + "grad_norm": 0.5990610718727112, + "learning_rate": 2.3479081603278665e-06, + "loss": 0.5408, + "step": 2834 + }, + { + "epoch": 3.1711409395973154, + "grad_norm": 0.5916415452957153, + "learning_rate": 2.3464189333338933e-06, + "loss": 0.5502, + "step": 2835 + }, + { + "epoch": 3.1722595078299776, + "grad_norm": 0.5629110932350159, + "learning_rate": 2.3449297610422285e-06, + "loss": 0.5238, + "step": 2836 + }, + { + "epoch": 3.1733780760626398, + "grad_norm": 0.5962100028991699, + "learning_rate": 2.3434406439832825e-06, + "loss": 0.5773, + "step": 2837 + }, + { + "epoch": 3.174496644295302, + "grad_norm": 0.5593759417533875, + "learning_rate": 2.3419515826874482e-06, + "loss": 0.5349, + "step": 2838 + }, + { + "epoch": 3.175615212527964, + "grad_norm": 0.5972168445587158, + "learning_rate": 2.340462577685098e-06, + "loss": 0.5552, + "step": 2839 + }, + { + "epoch": 3.1767337807606264, + "grad_norm": 0.5750653147697449, + "learning_rate": 2.338973629506582e-06, + "loss": 0.5316, + "step": 2840 + }, + { + "epoch": 3.1778523489932886, + "grad_norm": 0.5720837712287903, + "learning_rate": 2.3374847386822338e-06, + "loss": 0.5419, + "step": 2841 + }, + { + "epoch": 3.178970917225951, + "grad_norm": 0.5903941988945007, + "learning_rate": 2.335995905742362e-06, + "loss": 0.5503, + "step": 2842 + }, + { + "epoch": 3.180089485458613, + "grad_norm": 0.5774462223052979, + "learning_rate": 2.33450713121726e-06, + "loss": 0.5273, + "step": 2843 + }, + { + "epoch": 3.1812080536912752, + "grad_norm": 0.5863364338874817, + "learning_rate": 2.333018415637196e-06, + "loss": 0.5644, + "step": 2844 + }, + { + "epoch": 3.1823266219239374, + "grad_norm": 0.5794649124145508, + "learning_rate": 2.3315297595324204e-06, + "loss": 0.5221, + "step": 2845 + }, + { + "epoch": 3.1834451901565997, + "grad_norm": 0.5991812348365784, + "learning_rate": 2.3300411634331583e-06, + "loss": 0.5618, + "step": 2846 + }, + { + "epoch": 3.184563758389262, + "grad_norm": 0.5651556253433228, + "learning_rate": 2.3285526278696185e-06, + "loss": 0.5407, + "step": 2847 + }, + { + "epoch": 3.185682326621924, + "grad_norm": 0.5711520314216614, + "learning_rate": 2.3270641533719843e-06, + "loss": 0.5457, + "step": 2848 + }, + { + "epoch": 3.1868008948545863, + "grad_norm": 0.5785650014877319, + "learning_rate": 2.325575740470419e-06, + "loss": 0.5324, + "step": 2849 + }, + { + "epoch": 3.1879194630872485, + "grad_norm": 0.5938114523887634, + "learning_rate": 2.324087389695064e-06, + "loss": 0.557, + "step": 2850 + }, + { + "epoch": 3.1890380313199107, + "grad_norm": 0.5903030037879944, + "learning_rate": 2.3225991015760362e-06, + "loss": 0.5628, + "step": 2851 + }, + { + "epoch": 3.190156599552573, + "grad_norm": 0.5637936592102051, + "learning_rate": 2.321110876643435e-06, + "loss": 0.5279, + "step": 2852 + }, + { + "epoch": 3.1912751677852347, + "grad_norm": 0.583587646484375, + "learning_rate": 2.3196227154273325e-06, + "loss": 0.544, + "step": 2853 + }, + { + "epoch": 3.192393736017897, + "grad_norm": 0.5671531558036804, + "learning_rate": 2.318134618457782e-06, + "loss": 0.5263, + "step": 2854 + }, + { + "epoch": 3.193512304250559, + "grad_norm": 0.593549370765686, + "learning_rate": 2.3166465862648085e-06, + "loss": 0.5402, + "step": 2855 + }, + { + "epoch": 3.1946308724832213, + "grad_norm": 0.5791521668434143, + "learning_rate": 2.315158619378421e-06, + "loss": 0.5113, + "step": 2856 + }, + { + "epoch": 3.1957494407158835, + "grad_norm": 0.5875315070152283, + "learning_rate": 2.3136707183286e-06, + "loss": 0.5513, + "step": 2857 + }, + { + "epoch": 3.1968680089485457, + "grad_norm": 0.5931426286697388, + "learning_rate": 2.3121828836453043e-06, + "loss": 0.5292, + "step": 2858 + }, + { + "epoch": 3.197986577181208, + "grad_norm": 0.5670058131217957, + "learning_rate": 2.3106951158584687e-06, + "loss": 0.516, + "step": 2859 + }, + { + "epoch": 3.19910514541387, + "grad_norm": 0.5911577343940735, + "learning_rate": 2.309207415498006e-06, + "loss": 0.5465, + "step": 2860 + }, + { + "epoch": 3.2002237136465324, + "grad_norm": 0.5895060300827026, + "learning_rate": 2.307719783093802e-06, + "loss": 0.5631, + "step": 2861 + }, + { + "epoch": 3.2013422818791946, + "grad_norm": 0.5665014982223511, + "learning_rate": 2.3062322191757184e-06, + "loss": 0.5365, + "step": 2862 + }, + { + "epoch": 3.2024608501118568, + "grad_norm": 0.5766090750694275, + "learning_rate": 2.304744724273597e-06, + "loss": 0.534, + "step": 2863 + }, + { + "epoch": 3.203579418344519, + "grad_norm": 0.5952649116516113, + "learning_rate": 2.303257298917249e-06, + "loss": 0.5412, + "step": 2864 + }, + { + "epoch": 3.204697986577181, + "grad_norm": 0.575677216053009, + "learning_rate": 2.3017699436364657e-06, + "loss": 0.5344, + "step": 2865 + }, + { + "epoch": 3.2058165548098434, + "grad_norm": 0.5784211158752441, + "learning_rate": 2.3002826589610095e-06, + "loss": 0.5582, + "step": 2866 + }, + { + "epoch": 3.2069351230425056, + "grad_norm": 0.5567923784255981, + "learning_rate": 2.2987954454206215e-06, + "loss": 0.5178, + "step": 2867 + }, + { + "epoch": 3.208053691275168, + "grad_norm": 0.6026732325553894, + "learning_rate": 2.297308303545014e-06, + "loss": 0.5585, + "step": 2868 + }, + { + "epoch": 3.20917225950783, + "grad_norm": 0.5894998908042908, + "learning_rate": 2.295821233863877e-06, + "loss": 0.5463, + "step": 2869 + }, + { + "epoch": 3.2102908277404922, + "grad_norm": 0.5688816905021667, + "learning_rate": 2.2943342369068715e-06, + "loss": 0.5425, + "step": 2870 + }, + { + "epoch": 3.2114093959731544, + "grad_norm": 0.5922964811325073, + "learning_rate": 2.2928473132036334e-06, + "loss": 0.576, + "step": 2871 + }, + { + "epoch": 3.2125279642058167, + "grad_norm": 0.5725808143615723, + "learning_rate": 2.291360463283776e-06, + "loss": 0.524, + "step": 2872 + }, + { + "epoch": 3.213646532438479, + "grad_norm": 0.5995866060256958, + "learning_rate": 2.2898736876768816e-06, + "loss": 0.5313, + "step": 2873 + }, + { + "epoch": 3.214765100671141, + "grad_norm": 0.5817602276802063, + "learning_rate": 2.288386986912509e-06, + "loss": 0.5718, + "step": 2874 + }, + { + "epoch": 3.2158836689038033, + "grad_norm": 0.5726406574249268, + "learning_rate": 2.286900361520188e-06, + "loss": 0.549, + "step": 2875 + }, + { + "epoch": 3.2170022371364655, + "grad_norm": 0.5653071403503418, + "learning_rate": 2.285413812029425e-06, + "loss": 0.5431, + "step": 2876 + }, + { + "epoch": 3.2181208053691277, + "grad_norm": 0.5832003951072693, + "learning_rate": 2.283927338969695e-06, + "loss": 0.5525, + "step": 2877 + }, + { + "epoch": 3.21923937360179, + "grad_norm": 0.5818856358528137, + "learning_rate": 2.2824409428704505e-06, + "loss": 0.5357, + "step": 2878 + }, + { + "epoch": 3.220357941834452, + "grad_norm": 0.565303385257721, + "learning_rate": 2.2809546242611115e-06, + "loss": 0.5097, + "step": 2879 + }, + { + "epoch": 3.221476510067114, + "grad_norm": 0.5662059187889099, + "learning_rate": 2.2794683836710755e-06, + "loss": 0.527, + "step": 2880 + }, + { + "epoch": 3.222595078299776, + "grad_norm": 0.549142599105835, + "learning_rate": 2.277982221629709e-06, + "loss": 0.5365, + "step": 2881 + }, + { + "epoch": 3.2237136465324383, + "grad_norm": 0.5961729288101196, + "learning_rate": 2.276496138666351e-06, + "loss": 0.54, + "step": 2882 + }, + { + "epoch": 3.2248322147651005, + "grad_norm": 0.5753577947616577, + "learning_rate": 2.2750101353103125e-06, + "loss": 0.5496, + "step": 2883 + }, + { + "epoch": 3.2259507829977627, + "grad_norm": 0.5859640836715698, + "learning_rate": 2.2735242120908757e-06, + "loss": 0.544, + "step": 2884 + }, + { + "epoch": 3.227069351230425, + "grad_norm": 0.5634665489196777, + "learning_rate": 2.2720383695372973e-06, + "loss": 0.5202, + "step": 2885 + }, + { + "epoch": 3.228187919463087, + "grad_norm": 0.5784491896629333, + "learning_rate": 2.2705526081787993e-06, + "loss": 0.535, + "step": 2886 + }, + { + "epoch": 3.2293064876957494, + "grad_norm": 0.5842520594596863, + "learning_rate": 2.269066928544581e-06, + "loss": 0.5473, + "step": 2887 + }, + { + "epoch": 3.2304250559284116, + "grad_norm": 0.5698022842407227, + "learning_rate": 2.267581331163809e-06, + "loss": 0.542, + "step": 2888 + }, + { + "epoch": 3.2315436241610738, + "grad_norm": 0.5946921110153198, + "learning_rate": 2.2660958165656214e-06, + "loss": 0.5527, + "step": 2889 + }, + { + "epoch": 3.232662192393736, + "grad_norm": 0.6098816990852356, + "learning_rate": 2.264610385279126e-06, + "loss": 0.5481, + "step": 2890 + }, + { + "epoch": 3.233780760626398, + "grad_norm": 0.5838576555252075, + "learning_rate": 2.263125037833404e-06, + "loss": 0.5216, + "step": 2891 + }, + { + "epoch": 3.2348993288590604, + "grad_norm": 0.5654276013374329, + "learning_rate": 2.261639774757503e-06, + "loss": 0.5409, + "step": 2892 + }, + { + "epoch": 3.2360178970917226, + "grad_norm": 0.5757707357406616, + "learning_rate": 2.260154596580442e-06, + "loss": 0.5304, + "step": 2893 + }, + { + "epoch": 3.237136465324385, + "grad_norm": 0.5709238052368164, + "learning_rate": 2.2586695038312105e-06, + "loss": 0.5224, + "step": 2894 + }, + { + "epoch": 3.238255033557047, + "grad_norm": 0.5524784922599792, + "learning_rate": 2.2571844970387663e-06, + "loss": 0.48, + "step": 2895 + }, + { + "epoch": 3.2393736017897092, + "grad_norm": 0.5874065160751343, + "learning_rate": 2.2556995767320385e-06, + "loss": 0.5252, + "step": 2896 + }, + { + "epoch": 3.2404921700223714, + "grad_norm": 0.5680007338523865, + "learning_rate": 2.2542147434399224e-06, + "loss": 0.5284, + "step": 2897 + }, + { + "epoch": 3.2416107382550337, + "grad_norm": 0.5712788701057434, + "learning_rate": 2.252729997691285e-06, + "loss": 0.5441, + "step": 2898 + }, + { + "epoch": 3.242729306487696, + "grad_norm": 0.5960977077484131, + "learning_rate": 2.251245340014961e-06, + "loss": 0.5439, + "step": 2899 + }, + { + "epoch": 3.243847874720358, + "grad_norm": 0.5556855201721191, + "learning_rate": 2.249760770939754e-06, + "loss": 0.5198, + "step": 2900 + }, + { + "epoch": 3.2449664429530203, + "grad_norm": 0.5673580169677734, + "learning_rate": 2.248276290994436e-06, + "loss": 0.5353, + "step": 2901 + }, + { + "epoch": 3.2460850111856825, + "grad_norm": 0.5855379104614258, + "learning_rate": 2.2467919007077474e-06, + "loss": 0.5432, + "step": 2902 + }, + { + "epoch": 3.2472035794183447, + "grad_norm": 0.6051118969917297, + "learning_rate": 2.2453076006083958e-06, + "loss": 0.5539, + "step": 2903 + }, + { + "epoch": 3.248322147651007, + "grad_norm": 0.5714535117149353, + "learning_rate": 2.2438233912250565e-06, + "loss": 0.5128, + "step": 2904 + }, + { + "epoch": 3.2494407158836687, + "grad_norm": 0.5715987086296082, + "learning_rate": 2.242339273086375e-06, + "loss": 0.5059, + "step": 2905 + }, + { + "epoch": 3.2505592841163313, + "grad_norm": 0.5945152044296265, + "learning_rate": 2.240855246720962e-06, + "loss": 0.5347, + "step": 2906 + }, + { + "epoch": 3.251677852348993, + "grad_norm": 0.5830382108688354, + "learning_rate": 2.2393713126573965e-06, + "loss": 0.5423, + "step": 2907 + }, + { + "epoch": 3.2527964205816553, + "grad_norm": 0.5926584005355835, + "learning_rate": 2.2378874714242222e-06, + "loss": 0.5621, + "step": 2908 + }, + { + "epoch": 3.2539149888143175, + "grad_norm": 0.5766351222991943, + "learning_rate": 2.236403723549955e-06, + "loss": 0.535, + "step": 2909 + }, + { + "epoch": 3.2550335570469797, + "grad_norm": 0.568735659122467, + "learning_rate": 2.2349200695630718e-06, + "loss": 0.5176, + "step": 2910 + }, + { + "epoch": 3.256152125279642, + "grad_norm": 0.5824649930000305, + "learning_rate": 2.23343650999202e-06, + "loss": 0.5497, + "step": 2911 + }, + { + "epoch": 3.257270693512304, + "grad_norm": 0.5679856538772583, + "learning_rate": 2.2319530453652108e-06, + "loss": 0.4881, + "step": 2912 + }, + { + "epoch": 3.2583892617449663, + "grad_norm": 0.5747111439704895, + "learning_rate": 2.230469676211024e-06, + "loss": 0.5263, + "step": 2913 + }, + { + "epoch": 3.2595078299776286, + "grad_norm": 0.5793637633323669, + "learning_rate": 2.2289864030578033e-06, + "loss": 0.4959, + "step": 2914 + }, + { + "epoch": 3.2606263982102908, + "grad_norm": 0.5759130716323853, + "learning_rate": 2.2275032264338587e-06, + "loss": 0.5368, + "step": 2915 + }, + { + "epoch": 3.261744966442953, + "grad_norm": 0.5845560431480408, + "learning_rate": 2.226020146867467e-06, + "loss": 0.5546, + "step": 2916 + }, + { + "epoch": 3.262863534675615, + "grad_norm": 0.5886693596839905, + "learning_rate": 2.224537164886869e-06, + "loss": 0.5554, + "step": 2917 + }, + { + "epoch": 3.2639821029082774, + "grad_norm": 0.5884382724761963, + "learning_rate": 2.223054281020272e-06, + "loss": 0.5368, + "step": 2918 + }, + { + "epoch": 3.2651006711409396, + "grad_norm": 0.5753760933876038, + "learning_rate": 2.2215714957958464e-06, + "loss": 0.5681, + "step": 2919 + }, + { + "epoch": 3.266219239373602, + "grad_norm": 0.5743902921676636, + "learning_rate": 2.2200888097417308e-06, + "loss": 0.5527, + "step": 2920 + }, + { + "epoch": 3.267337807606264, + "grad_norm": 0.5883992910385132, + "learning_rate": 2.2186062233860243e-06, + "loss": 0.5397, + "step": 2921 + }, + { + "epoch": 3.2684563758389262, + "grad_norm": 0.5645486116409302, + "learning_rate": 2.217123737256794e-06, + "loss": 0.5332, + "step": 2922 + }, + { + "epoch": 3.2695749440715884, + "grad_norm": 0.5832722783088684, + "learning_rate": 2.215641351882069e-06, + "loss": 0.5454, + "step": 2923 + }, + { + "epoch": 3.2706935123042506, + "grad_norm": 0.5675692558288574, + "learning_rate": 2.2141590677898444e-06, + "loss": 0.5383, + "step": 2924 + }, + { + "epoch": 3.271812080536913, + "grad_norm": 0.5801993608474731, + "learning_rate": 2.212676885508078e-06, + "loss": 0.5585, + "step": 2925 + }, + { + "epoch": 3.272930648769575, + "grad_norm": 0.5799306035041809, + "learning_rate": 2.2111948055646904e-06, + "loss": 0.5333, + "step": 2926 + }, + { + "epoch": 3.2740492170022373, + "grad_norm": 0.6004437804222107, + "learning_rate": 2.209712828487569e-06, + "loss": 0.5433, + "step": 2927 + }, + { + "epoch": 3.2751677852348995, + "grad_norm": 0.5742294788360596, + "learning_rate": 2.2082309548045595e-06, + "loss": 0.545, + "step": 2928 + }, + { + "epoch": 3.2762863534675617, + "grad_norm": 0.5941585898399353, + "learning_rate": 2.2067491850434773e-06, + "loss": 0.5646, + "step": 2929 + }, + { + "epoch": 3.277404921700224, + "grad_norm": 0.5658648014068604, + "learning_rate": 2.205267519732095e-06, + "loss": 0.5451, + "step": 2930 + }, + { + "epoch": 3.278523489932886, + "grad_norm": 0.5727295875549316, + "learning_rate": 2.203785959398151e-06, + "loss": 0.5225, + "step": 2931 + }, + { + "epoch": 3.279642058165548, + "grad_norm": 0.5861002802848816, + "learning_rate": 2.202304504569345e-06, + "loss": 0.532, + "step": 2932 + }, + { + "epoch": 3.2807606263982105, + "grad_norm": 0.5914974808692932, + "learning_rate": 2.2008231557733407e-06, + "loss": 0.551, + "step": 2933 + }, + { + "epoch": 3.2818791946308723, + "grad_norm": 0.5633930563926697, + "learning_rate": 2.199341913537763e-06, + "loss": 0.519, + "step": 2934 + }, + { + "epoch": 3.2829977628635345, + "grad_norm": 0.5826354622840881, + "learning_rate": 2.1978607783901967e-06, + "loss": 0.5267, + "step": 2935 + }, + { + "epoch": 3.2841163310961967, + "grad_norm": 0.5788497924804688, + "learning_rate": 2.1963797508581937e-06, + "loss": 0.5577, + "step": 2936 + }, + { + "epoch": 3.285234899328859, + "grad_norm": 0.5772359371185303, + "learning_rate": 2.194898831469262e-06, + "loss": 0.5408, + "step": 2937 + }, + { + "epoch": 3.286353467561521, + "grad_norm": 0.5948665738105774, + "learning_rate": 2.193418020750875e-06, + "loss": 0.5188, + "step": 2938 + }, + { + "epoch": 3.2874720357941833, + "grad_norm": 0.5852000713348389, + "learning_rate": 2.1919373192304646e-06, + "loss": 0.5279, + "step": 2939 + }, + { + "epoch": 3.2885906040268456, + "grad_norm": 0.5882996916770935, + "learning_rate": 2.1904567274354273e-06, + "loss": 0.5194, + "step": 2940 + }, + { + "epoch": 3.2897091722595078, + "grad_norm": 0.5863009095191956, + "learning_rate": 2.188976245893117e-06, + "loss": 0.5348, + "step": 2941 + }, + { + "epoch": 3.29082774049217, + "grad_norm": 0.5813042521476746, + "learning_rate": 2.18749587513085e-06, + "loss": 0.5604, + "step": 2942 + }, + { + "epoch": 3.291946308724832, + "grad_norm": 0.5710304975509644, + "learning_rate": 2.186015615675902e-06, + "loss": 0.5222, + "step": 2943 + }, + { + "epoch": 3.2930648769574944, + "grad_norm": 0.5950871706008911, + "learning_rate": 2.184535468055512e-06, + "loss": 0.5387, + "step": 2944 + }, + { + "epoch": 3.2941834451901566, + "grad_norm": 0.5959584712982178, + "learning_rate": 2.1830554327968755e-06, + "loss": 0.5385, + "step": 2945 + }, + { + "epoch": 3.295302013422819, + "grad_norm": 0.5772879123687744, + "learning_rate": 2.1815755104271493e-06, + "loss": 0.5461, + "step": 2946 + }, + { + "epoch": 3.296420581655481, + "grad_norm": 0.5787171125411987, + "learning_rate": 2.1800957014734513e-06, + "loss": 0.5267, + "step": 2947 + }, + { + "epoch": 3.2975391498881432, + "grad_norm": 0.5778903365135193, + "learning_rate": 2.178616006462857e-06, + "loss": 0.5323, + "step": 2948 + }, + { + "epoch": 3.2986577181208054, + "grad_norm": 0.5893623232841492, + "learning_rate": 2.1771364259224034e-06, + "loss": 0.5335, + "step": 2949 + }, + { + "epoch": 3.2997762863534676, + "grad_norm": 0.5840058326721191, + "learning_rate": 2.1756569603790844e-06, + "loss": 0.5465, + "step": 2950 + }, + { + "epoch": 3.30089485458613, + "grad_norm": 0.5979543328285217, + "learning_rate": 2.1741776103598553e-06, + "loss": 0.5334, + "step": 2951 + }, + { + "epoch": 3.302013422818792, + "grad_norm": 0.5793008208274841, + "learning_rate": 2.172698376391628e-06, + "loss": 0.5305, + "step": 2952 + }, + { + "epoch": 3.3031319910514543, + "grad_norm": 0.5808400511741638, + "learning_rate": 2.1712192590012754e-06, + "loss": 0.5386, + "step": 2953 + }, + { + "epoch": 3.3042505592841165, + "grad_norm": 0.5705307126045227, + "learning_rate": 2.1697402587156262e-06, + "loss": 0.5441, + "step": 2954 + }, + { + "epoch": 3.3053691275167787, + "grad_norm": 0.5752269625663757, + "learning_rate": 2.1682613760614706e-06, + "loss": 0.5306, + "step": 2955 + }, + { + "epoch": 3.306487695749441, + "grad_norm": 0.5589823126792908, + "learning_rate": 2.1667826115655536e-06, + "loss": 0.5449, + "step": 2956 + }, + { + "epoch": 3.3076062639821027, + "grad_norm": 0.5624956488609314, + "learning_rate": 2.1653039657545794e-06, + "loss": 0.5191, + "step": 2957 + }, + { + "epoch": 3.3087248322147653, + "grad_norm": 0.5764420032501221, + "learning_rate": 2.163825439155212e-06, + "loss": 0.523, + "step": 2958 + }, + { + "epoch": 3.309843400447427, + "grad_norm": 0.5696988105773926, + "learning_rate": 2.1623470322940695e-06, + "loss": 0.563, + "step": 2959 + }, + { + "epoch": 3.3109619686800893, + "grad_norm": 0.5889440774917603, + "learning_rate": 2.1608687456977298e-06, + "loss": 0.561, + "step": 2960 + }, + { + "epoch": 3.3120805369127515, + "grad_norm": 0.5663565397262573, + "learning_rate": 2.1593905798927264e-06, + "loss": 0.5405, + "step": 2961 + }, + { + "epoch": 3.3131991051454137, + "grad_norm": 0.5894338488578796, + "learning_rate": 2.1579125354055517e-06, + "loss": 0.511, + "step": 2962 + }, + { + "epoch": 3.314317673378076, + "grad_norm": 0.5960800647735596, + "learning_rate": 2.156434612762652e-06, + "loss": 0.5476, + "step": 2963 + }, + { + "epoch": 3.315436241610738, + "grad_norm": 0.5952925086021423, + "learning_rate": 2.154956812490433e-06, + "loss": 0.5427, + "step": 2964 + }, + { + "epoch": 3.3165548098434003, + "grad_norm": 0.5822497606277466, + "learning_rate": 2.1534791351152562e-06, + "loss": 0.5526, + "step": 2965 + }, + { + "epoch": 3.3176733780760626, + "grad_norm": 0.5573047995567322, + "learning_rate": 2.152001581163438e-06, + "loss": 0.5328, + "step": 2966 + }, + { + "epoch": 3.3187919463087248, + "grad_norm": 0.5778958201408386, + "learning_rate": 2.1505241511612522e-06, + "loss": 0.5415, + "step": 2967 + }, + { + "epoch": 3.319910514541387, + "grad_norm": 0.5854326486587524, + "learning_rate": 2.1490468456349262e-06, + "loss": 0.5483, + "step": 2968 + }, + { + "epoch": 3.321029082774049, + "grad_norm": 0.591769278049469, + "learning_rate": 2.147569665110648e-06, + "loss": 0.5223, + "step": 2969 + }, + { + "epoch": 3.3221476510067114, + "grad_norm": 0.5759871602058411, + "learning_rate": 2.146092610114555e-06, + "loss": 0.5407, + "step": 2970 + }, + { + "epoch": 3.3232662192393736, + "grad_norm": 0.5628697276115417, + "learning_rate": 2.144615681172744e-06, + "loss": 0.5482, + "step": 2971 + }, + { + "epoch": 3.324384787472036, + "grad_norm": 0.5733519792556763, + "learning_rate": 2.143138878811265e-06, + "loss": 0.551, + "step": 2972 + }, + { + "epoch": 3.325503355704698, + "grad_norm": 0.562671959400177, + "learning_rate": 2.1416622035561254e-06, + "loss": 0.5057, + "step": 2973 + }, + { + "epoch": 3.3266219239373602, + "grad_norm": 0.564473569393158, + "learning_rate": 2.140185655933283e-06, + "loss": 0.5289, + "step": 2974 + }, + { + "epoch": 3.3277404921700224, + "grad_norm": 0.573268473148346, + "learning_rate": 2.1387092364686544e-06, + "loss": 0.5517, + "step": 2975 + }, + { + "epoch": 3.3288590604026846, + "grad_norm": 0.5765975713729858, + "learning_rate": 2.1372329456881076e-06, + "loss": 0.5623, + "step": 2976 + }, + { + "epoch": 3.329977628635347, + "grad_norm": 0.5586515665054321, + "learning_rate": 2.1357567841174673e-06, + "loss": 0.5148, + "step": 2977 + }, + { + "epoch": 3.331096196868009, + "grad_norm": 0.5946812033653259, + "learning_rate": 2.13428075228251e-06, + "loss": 0.5733, + "step": 2978 + }, + { + "epoch": 3.3322147651006713, + "grad_norm": 0.5841031074523926, + "learning_rate": 2.1328048507089667e-06, + "loss": 0.5302, + "step": 2979 + }, + { + "epoch": 3.3333333333333335, + "grad_norm": 0.5731162428855896, + "learning_rate": 2.131329079922523e-06, + "loss": 0.5467, + "step": 2980 + }, + { + "epoch": 3.3344519015659957, + "grad_norm": 0.5872629284858704, + "learning_rate": 2.1298534404488155e-06, + "loss": 0.4947, + "step": 2981 + }, + { + "epoch": 3.335570469798658, + "grad_norm": 0.567255973815918, + "learning_rate": 2.1283779328134373e-06, + "loss": 0.5135, + "step": 2982 + }, + { + "epoch": 3.33668903803132, + "grad_norm": 0.5988405346870422, + "learning_rate": 2.1269025575419314e-06, + "loss": 0.538, + "step": 2983 + }, + { + "epoch": 3.337807606263982, + "grad_norm": 0.5989606976509094, + "learning_rate": 2.1254273151597967e-06, + "loss": 0.5427, + "step": 2984 + }, + { + "epoch": 3.3389261744966445, + "grad_norm": 0.5712903738021851, + "learning_rate": 2.123952206192481e-06, + "loss": 0.5568, + "step": 2985 + }, + { + "epoch": 3.3400447427293063, + "grad_norm": 0.571904718875885, + "learning_rate": 2.1224772311653892e-06, + "loss": 0.5345, + "step": 2986 + }, + { + "epoch": 3.3411633109619685, + "grad_norm": 0.5647782683372498, + "learning_rate": 2.1210023906038744e-06, + "loss": 0.5186, + "step": 2987 + }, + { + "epoch": 3.3422818791946307, + "grad_norm": 0.5909580588340759, + "learning_rate": 2.119527685033243e-06, + "loss": 0.585, + "step": 2988 + }, + { + "epoch": 3.343400447427293, + "grad_norm": 0.6060347557067871, + "learning_rate": 2.1180531149787547e-06, + "loss": 0.5453, + "step": 2989 + }, + { + "epoch": 3.344519015659955, + "grad_norm": 0.5901708602905273, + "learning_rate": 2.11657868096562e-06, + "loss": 0.5127, + "step": 2990 + }, + { + "epoch": 3.3456375838926173, + "grad_norm": 0.572715163230896, + "learning_rate": 2.115104383519001e-06, + "loss": 0.5385, + "step": 2991 + }, + { + "epoch": 3.3467561521252795, + "grad_norm": 0.5861586928367615, + "learning_rate": 2.1136302231640097e-06, + "loss": 0.5182, + "step": 2992 + }, + { + "epoch": 3.3478747203579418, + "grad_norm": 0.587785542011261, + "learning_rate": 2.112156200425713e-06, + "loss": 0.509, + "step": 2993 + }, + { + "epoch": 3.348993288590604, + "grad_norm": 0.5828824043273926, + "learning_rate": 2.1106823158291244e-06, + "loss": 0.5561, + "step": 2994 + }, + { + "epoch": 3.350111856823266, + "grad_norm": 0.6093428134918213, + "learning_rate": 2.1092085698992116e-06, + "loss": 0.5543, + "step": 2995 + }, + { + "epoch": 3.3512304250559284, + "grad_norm": 0.5749115347862244, + "learning_rate": 2.1077349631608897e-06, + "loss": 0.5407, + "step": 2996 + }, + { + "epoch": 3.3523489932885906, + "grad_norm": 0.5780017971992493, + "learning_rate": 2.1062614961390283e-06, + "loss": 0.5445, + "step": 2997 + }, + { + "epoch": 3.353467561521253, + "grad_norm": 0.5813626050949097, + "learning_rate": 2.1047881693584448e-06, + "loss": 0.537, + "step": 2998 + }, + { + "epoch": 3.354586129753915, + "grad_norm": 0.5969443917274475, + "learning_rate": 2.1033149833439044e-06, + "loss": 0.5429, + "step": 2999 + }, + { + "epoch": 3.3557046979865772, + "grad_norm": 0.5845437049865723, + "learning_rate": 2.101841938620128e-06, + "loss": 0.5323, + "step": 3000 + }, + { + "epoch": 3.3568232662192394, + "grad_norm": 0.5675383806228638, + "learning_rate": 2.100369035711779e-06, + "loss": 0.5197, + "step": 3001 + }, + { + "epoch": 3.3579418344519016, + "grad_norm": 0.594617486000061, + "learning_rate": 2.0988962751434773e-06, + "loss": 0.5743, + "step": 3002 + }, + { + "epoch": 3.359060402684564, + "grad_norm": 0.6038890480995178, + "learning_rate": 2.0974236574397877e-06, + "loss": 0.5472, + "step": 3003 + }, + { + "epoch": 3.360178970917226, + "grad_norm": 0.5878751873970032, + "learning_rate": 2.0959511831252253e-06, + "loss": 0.5321, + "step": 3004 + }, + { + "epoch": 3.3612975391498883, + "grad_norm": 0.5802765488624573, + "learning_rate": 2.0944788527242534e-06, + "loss": 0.551, + "step": 3005 + }, + { + "epoch": 3.3624161073825505, + "grad_norm": 0.5891462564468384, + "learning_rate": 2.0930066667612865e-06, + "loss": 0.539, + "step": 3006 + }, + { + "epoch": 3.3635346756152127, + "grad_norm": 0.582206130027771, + "learning_rate": 2.0915346257606846e-06, + "loss": 0.5267, + "step": 3007 + }, + { + "epoch": 3.364653243847875, + "grad_norm": 0.5985295176506042, + "learning_rate": 2.090062730246758e-06, + "loss": 0.5533, + "step": 3008 + }, + { + "epoch": 3.365771812080537, + "grad_norm": 0.5766201019287109, + "learning_rate": 2.0885909807437636e-06, + "loss": 0.5396, + "step": 3009 + }, + { + "epoch": 3.3668903803131993, + "grad_norm": 0.572158932685852, + "learning_rate": 2.0871193777759074e-06, + "loss": 0.5142, + "step": 3010 + }, + { + "epoch": 3.368008948545861, + "grad_norm": 0.5813375115394592, + "learning_rate": 2.0856479218673453e-06, + "loss": 0.5396, + "step": 3011 + }, + { + "epoch": 3.3691275167785237, + "grad_norm": 0.5880120992660522, + "learning_rate": 2.0841766135421753e-06, + "loss": 0.5494, + "step": 3012 + }, + { + "epoch": 3.3702460850111855, + "grad_norm": 0.5889708995819092, + "learning_rate": 2.0827054533244485e-06, + "loss": 0.5333, + "step": 3013 + }, + { + "epoch": 3.3713646532438477, + "grad_norm": 0.5861124992370605, + "learning_rate": 2.0812344417381595e-06, + "loss": 0.5407, + "step": 3014 + }, + { + "epoch": 3.37248322147651, + "grad_norm": 0.5954474210739136, + "learning_rate": 2.0797635793072524e-06, + "loss": 0.5201, + "step": 3015 + }, + { + "epoch": 3.373601789709172, + "grad_norm": 0.599679172039032, + "learning_rate": 2.0782928665556153e-06, + "loss": 0.5534, + "step": 3016 + }, + { + "epoch": 3.3747203579418343, + "grad_norm": 0.5633800625801086, + "learning_rate": 2.0768223040070873e-06, + "loss": 0.5232, + "step": 3017 + }, + { + "epoch": 3.3758389261744965, + "grad_norm": 0.5800679922103882, + "learning_rate": 2.07535189218545e-06, + "loss": 0.5209, + "step": 3018 + }, + { + "epoch": 3.3769574944071588, + "grad_norm": 0.6014657616615295, + "learning_rate": 2.073881631614433e-06, + "loss": 0.5419, + "step": 3019 + }, + { + "epoch": 3.378076062639821, + "grad_norm": 0.576155424118042, + "learning_rate": 2.072411522817712e-06, + "loss": 0.5202, + "step": 3020 + }, + { + "epoch": 3.379194630872483, + "grad_norm": 0.5844408273696899, + "learning_rate": 2.0709415663189076e-06, + "loss": 0.557, + "step": 3021 + }, + { + "epoch": 3.3803131991051454, + "grad_norm": 0.5831338167190552, + "learning_rate": 2.0694717626415886e-06, + "loss": 0.5697, + "step": 3022 + }, + { + "epoch": 3.3814317673378076, + "grad_norm": 0.5777684450149536, + "learning_rate": 2.0680021123092665e-06, + "loss": 0.5178, + "step": 3023 + }, + { + "epoch": 3.38255033557047, + "grad_norm": 0.5882266163825989, + "learning_rate": 2.0665326158454006e-06, + "loss": 0.5374, + "step": 3024 + }, + { + "epoch": 3.383668903803132, + "grad_norm": 0.5824880599975586, + "learning_rate": 2.0650632737733923e-06, + "loss": 0.5307, + "step": 3025 + }, + { + "epoch": 3.384787472035794, + "grad_norm": 0.5969476103782654, + "learning_rate": 2.0635940866165925e-06, + "loss": 0.5514, + "step": 3026 + }, + { + "epoch": 3.3859060402684564, + "grad_norm": 0.5888137221336365, + "learning_rate": 2.0621250548982926e-06, + "loss": 0.5384, + "step": 3027 + }, + { + "epoch": 3.3870246085011186, + "grad_norm": 0.5858564376831055, + "learning_rate": 2.060656179141732e-06, + "loss": 0.5352, + "step": 3028 + }, + { + "epoch": 3.388143176733781, + "grad_norm": 0.6135839223861694, + "learning_rate": 2.0591874598700905e-06, + "loss": 0.544, + "step": 3029 + }, + { + "epoch": 3.389261744966443, + "grad_norm": 0.6021990180015564, + "learning_rate": 2.0577188976064972e-06, + "loss": 0.5393, + "step": 3030 + }, + { + "epoch": 3.3903803131991053, + "grad_norm": 0.5606169104576111, + "learning_rate": 2.0562504928740216e-06, + "loss": 0.5073, + "step": 3031 + }, + { + "epoch": 3.3914988814317675, + "grad_norm": 0.5875310897827148, + "learning_rate": 2.054782246195678e-06, + "loss": 0.5407, + "step": 3032 + }, + { + "epoch": 3.3926174496644297, + "grad_norm": 0.5954650044441223, + "learning_rate": 2.0533141580944258e-06, + "loss": 0.5656, + "step": 3033 + }, + { + "epoch": 3.393736017897092, + "grad_norm": 0.6038705110549927, + "learning_rate": 2.0518462290931646e-06, + "loss": 0.5606, + "step": 3034 + }, + { + "epoch": 3.394854586129754, + "grad_norm": 0.5860815048217773, + "learning_rate": 2.0503784597147423e-06, + "loss": 0.5485, + "step": 3035 + }, + { + "epoch": 3.395973154362416, + "grad_norm": 0.5871373414993286, + "learning_rate": 2.048910850481944e-06, + "loss": 0.5602, + "step": 3036 + }, + { + "epoch": 3.3970917225950785, + "grad_norm": 0.5743905901908875, + "learning_rate": 2.047443401917504e-06, + "loss": 0.5354, + "step": 3037 + }, + { + "epoch": 3.3982102908277403, + "grad_norm": 0.5994888544082642, + "learning_rate": 2.0459761145440932e-06, + "loss": 0.5455, + "step": 3038 + }, + { + "epoch": 3.3993288590604025, + "grad_norm": 0.5735637545585632, + "learning_rate": 2.0445089888843313e-06, + "loss": 0.5264, + "step": 3039 + }, + { + "epoch": 3.4004474272930647, + "grad_norm": 0.5801488161087036, + "learning_rate": 2.043042025460775e-06, + "loss": 0.5292, + "step": 3040 + }, + { + "epoch": 3.401565995525727, + "grad_norm": 0.5982528924942017, + "learning_rate": 2.041575224795925e-06, + "loss": 0.5655, + "step": 3041 + }, + { + "epoch": 3.402684563758389, + "grad_norm": 0.6030799746513367, + "learning_rate": 2.040108587412227e-06, + "loss": 0.5627, + "step": 3042 + }, + { + "epoch": 3.4038031319910513, + "grad_norm": 0.5627343058586121, + "learning_rate": 2.038642113832064e-06, + "loss": 0.5008, + "step": 3043 + }, + { + "epoch": 3.4049217002237135, + "grad_norm": 0.5961188673973083, + "learning_rate": 2.0371758045777634e-06, + "loss": 0.5437, + "step": 3044 + }, + { + "epoch": 3.4060402684563758, + "grad_norm": 0.5721891522407532, + "learning_rate": 2.035709660171592e-06, + "loss": 0.526, + "step": 3045 + }, + { + "epoch": 3.407158836689038, + "grad_norm": 0.5916818380355835, + "learning_rate": 2.0342436811357623e-06, + "loss": 0.5466, + "step": 3046 + }, + { + "epoch": 3.4082774049217, + "grad_norm": 0.5888948440551758, + "learning_rate": 2.032777867992422e-06, + "loss": 0.5715, + "step": 3047 + }, + { + "epoch": 3.4093959731543624, + "grad_norm": 0.5998460650444031, + "learning_rate": 2.0313122212636645e-06, + "loss": 0.5482, + "step": 3048 + }, + { + "epoch": 3.4105145413870246, + "grad_norm": 0.6015839576721191, + "learning_rate": 2.0298467414715196e-06, + "loss": 0.547, + "step": 3049 + }, + { + "epoch": 3.411633109619687, + "grad_norm": 0.5714446902275085, + "learning_rate": 2.028381429137963e-06, + "loss": 0.5196, + "step": 3050 + }, + { + "epoch": 3.412751677852349, + "grad_norm": 0.5919701457023621, + "learning_rate": 2.0269162847849054e-06, + "loss": 0.5423, + "step": 3051 + }, + { + "epoch": 3.413870246085011, + "grad_norm": 0.5841549038887024, + "learning_rate": 2.025451308934201e-06, + "loss": 0.5404, + "step": 3052 + }, + { + "epoch": 3.4149888143176734, + "grad_norm": 0.622018575668335, + "learning_rate": 2.0239865021076435e-06, + "loss": 0.5254, + "step": 3053 + }, + { + "epoch": 3.4161073825503356, + "grad_norm": 0.5861012935638428, + "learning_rate": 2.0225218648269636e-06, + "loss": 0.5307, + "step": 3054 + }, + { + "epoch": 3.417225950782998, + "grad_norm": 0.6065356135368347, + "learning_rate": 2.021057397613837e-06, + "loss": 0.5608, + "step": 3055 + }, + { + "epoch": 3.41834451901566, + "grad_norm": 0.5714085102081299, + "learning_rate": 2.0195931009898745e-06, + "loss": 0.5272, + "step": 3056 + }, + { + "epoch": 3.4194630872483223, + "grad_norm": 0.5810417532920837, + "learning_rate": 2.018128975476627e-06, + "loss": 0.5342, + "step": 3057 + }, + { + "epoch": 3.4205816554809845, + "grad_norm": 0.5855004191398621, + "learning_rate": 2.016665021595585e-06, + "loss": 0.5001, + "step": 3058 + }, + { + "epoch": 3.4217002237136467, + "grad_norm": 0.6063441634178162, + "learning_rate": 2.0152012398681784e-06, + "loss": 0.5415, + "step": 3059 + }, + { + "epoch": 3.422818791946309, + "grad_norm": 0.5679510831832886, + "learning_rate": 2.0137376308157743e-06, + "loss": 0.5053, + "step": 3060 + }, + { + "epoch": 3.423937360178971, + "grad_norm": 0.6010608673095703, + "learning_rate": 2.01227419495968e-06, + "loss": 0.5549, + "step": 3061 + }, + { + "epoch": 3.4250559284116333, + "grad_norm": 0.5789581537246704, + "learning_rate": 2.0108109328211393e-06, + "loss": 0.5408, + "step": 3062 + }, + { + "epoch": 3.426174496644295, + "grad_norm": 0.5854089856147766, + "learning_rate": 2.009347844921335e-06, + "loss": 0.5366, + "step": 3063 + }, + { + "epoch": 3.4272930648769577, + "grad_norm": 0.5768666863441467, + "learning_rate": 2.0078849317813887e-06, + "loss": 0.5304, + "step": 3064 + }, + { + "epoch": 3.4284116331096195, + "grad_norm": 0.5701066255569458, + "learning_rate": 2.006422193922357e-06, + "loss": 0.5266, + "step": 3065 + }, + { + "epoch": 3.4295302013422817, + "grad_norm": 0.5916892290115356, + "learning_rate": 2.004959631865239e-06, + "loss": 0.524, + "step": 3066 + }, + { + "epoch": 3.430648769574944, + "grad_norm": 0.5868574380874634, + "learning_rate": 2.0034972461309665e-06, + "loss": 0.5483, + "step": 3067 + }, + { + "epoch": 3.431767337807606, + "grad_norm": 0.5739445686340332, + "learning_rate": 2.0020350372404104e-06, + "loss": 0.531, + "step": 3068 + }, + { + "epoch": 3.4328859060402683, + "grad_norm": 0.5930995941162109, + "learning_rate": 2.0005730057143776e-06, + "loss": 0.5484, + "step": 3069 + }, + { + "epoch": 3.4340044742729305, + "grad_norm": 0.579956591129303, + "learning_rate": 1.999111152073614e-06, + "loss": 0.5302, + "step": 3070 + }, + { + "epoch": 3.4351230425055927, + "grad_norm": 0.5914560556411743, + "learning_rate": 1.9976494768388e-06, + "loss": 0.5468, + "step": 3071 + }, + { + "epoch": 3.436241610738255, + "grad_norm": 0.5825742483139038, + "learning_rate": 1.9961879805305536e-06, + "loss": 0.5281, + "step": 3072 + }, + { + "epoch": 3.437360178970917, + "grad_norm": 0.5924240946769714, + "learning_rate": 1.9947266636694283e-06, + "loss": 0.5415, + "step": 3073 + }, + { + "epoch": 3.4384787472035794, + "grad_norm": 0.5900630950927734, + "learning_rate": 1.9932655267759132e-06, + "loss": 0.5394, + "step": 3074 + }, + { + "epoch": 3.4395973154362416, + "grad_norm": 0.6008167862892151, + "learning_rate": 1.9918045703704363e-06, + "loss": 0.5426, + "step": 3075 + }, + { + "epoch": 3.440715883668904, + "grad_norm": 0.6024075150489807, + "learning_rate": 1.9903437949733574e-06, + "loss": 0.5524, + "step": 3076 + }, + { + "epoch": 3.441834451901566, + "grad_norm": 0.588410496711731, + "learning_rate": 1.988883201104975e-06, + "loss": 0.5264, + "step": 3077 + }, + { + "epoch": 3.442953020134228, + "grad_norm": 0.5900465250015259, + "learning_rate": 1.9874227892855198e-06, + "loss": 0.5243, + "step": 3078 + }, + { + "epoch": 3.4440715883668904, + "grad_norm": 0.6028493642807007, + "learning_rate": 1.985962560035161e-06, + "loss": 0.5779, + "step": 3079 + }, + { + "epoch": 3.4451901565995526, + "grad_norm": 0.5902518630027771, + "learning_rate": 1.9845025138740007e-06, + "loss": 0.5272, + "step": 3080 + }, + { + "epoch": 3.446308724832215, + "grad_norm": 0.5870692729949951, + "learning_rate": 1.9830426513220764e-06, + "loss": 0.5337, + "step": 3081 + }, + { + "epoch": 3.447427293064877, + "grad_norm": 0.5896787047386169, + "learning_rate": 1.981582972899359e-06, + "loss": 0.5407, + "step": 3082 + }, + { + "epoch": 3.4485458612975393, + "grad_norm": 0.5881497859954834, + "learning_rate": 1.980123479125757e-06, + "loss": 0.5346, + "step": 3083 + }, + { + "epoch": 3.4496644295302015, + "grad_norm": 0.5895851850509644, + "learning_rate": 1.978664170521109e-06, + "loss": 0.5416, + "step": 3084 + }, + { + "epoch": 3.4507829977628637, + "grad_norm": 0.5882962346076965, + "learning_rate": 1.9772050476051907e-06, + "loss": 0.535, + "step": 3085 + }, + { + "epoch": 3.451901565995526, + "grad_norm": 0.5909952521324158, + "learning_rate": 1.975746110897711e-06, + "loss": 0.5779, + "step": 3086 + }, + { + "epoch": 3.453020134228188, + "grad_norm": 0.5988467335700989, + "learning_rate": 1.97428736091831e-06, + "loss": 0.5515, + "step": 3087 + }, + { + "epoch": 3.45413870246085, + "grad_norm": 0.5785988569259644, + "learning_rate": 1.9728287981865667e-06, + "loss": 0.553, + "step": 3088 + }, + { + "epoch": 3.4552572706935125, + "grad_norm": 0.5952177047729492, + "learning_rate": 1.971370423221987e-06, + "loss": 0.5342, + "step": 3089 + }, + { + "epoch": 3.4563758389261743, + "grad_norm": 0.595833420753479, + "learning_rate": 1.9699122365440153e-06, + "loss": 0.5092, + "step": 3090 + }, + { + "epoch": 3.457494407158837, + "grad_norm": 0.5858739614486694, + "learning_rate": 1.968454238672025e-06, + "loss": 0.5395, + "step": 3091 + }, + { + "epoch": 3.4586129753914987, + "grad_norm": 0.5878705978393555, + "learning_rate": 1.9669964301253252e-06, + "loss": 0.5523, + "step": 3092 + }, + { + "epoch": 3.459731543624161, + "grad_norm": 0.5905231833457947, + "learning_rate": 1.9655388114231556e-06, + "loss": 0.5422, + "step": 3093 + }, + { + "epoch": 3.460850111856823, + "grad_norm": 0.5753354430198669, + "learning_rate": 1.9640813830846883e-06, + "loss": 0.5417, + "step": 3094 + }, + { + "epoch": 3.4619686800894853, + "grad_norm": 0.592563807964325, + "learning_rate": 1.9626241456290303e-06, + "loss": 0.5565, + "step": 3095 + }, + { + "epoch": 3.4630872483221475, + "grad_norm": 0.5797046422958374, + "learning_rate": 1.9611670995752164e-06, + "loss": 0.5283, + "step": 3096 + }, + { + "epoch": 3.4642058165548097, + "grad_norm": 0.60841965675354, + "learning_rate": 1.959710245442217e-06, + "loss": 0.514, + "step": 3097 + }, + { + "epoch": 3.465324384787472, + "grad_norm": 0.5961316823959351, + "learning_rate": 1.9582535837489307e-06, + "loss": 0.5548, + "step": 3098 + }, + { + "epoch": 3.466442953020134, + "grad_norm": 0.5709353089332581, + "learning_rate": 1.956797115014192e-06, + "loss": 0.5287, + "step": 3099 + }, + { + "epoch": 3.4675615212527964, + "grad_norm": 0.5842140913009644, + "learning_rate": 1.955340839756762e-06, + "loss": 0.5595, + "step": 3100 + }, + { + "epoch": 3.4686800894854586, + "grad_norm": 0.6045351028442383, + "learning_rate": 1.953884758495336e-06, + "loss": 0.5457, + "step": 3101 + }, + { + "epoch": 3.469798657718121, + "grad_norm": 0.5748609304428101, + "learning_rate": 1.9524288717485378e-06, + "loss": 0.5257, + "step": 3102 + }, + { + "epoch": 3.470917225950783, + "grad_norm": 0.5759230852127075, + "learning_rate": 1.9509731800349254e-06, + "loss": 0.5451, + "step": 3103 + }, + { + "epoch": 3.472035794183445, + "grad_norm": 0.6004695892333984, + "learning_rate": 1.949517683872984e-06, + "loss": 0.5441, + "step": 3104 + }, + { + "epoch": 3.4731543624161074, + "grad_norm": 0.5948774218559265, + "learning_rate": 1.94806238378113e-06, + "loss": 0.5319, + "step": 3105 + }, + { + "epoch": 3.4742729306487696, + "grad_norm": 0.5795431733131409, + "learning_rate": 1.9466072802777115e-06, + "loss": 0.5328, + "step": 3106 + }, + { + "epoch": 3.475391498881432, + "grad_norm": 0.5903091430664062, + "learning_rate": 1.945152373881003e-06, + "loss": 0.544, + "step": 3107 + }, + { + "epoch": 3.476510067114094, + "grad_norm": 0.5926859974861145, + "learning_rate": 1.9436976651092143e-06, + "loss": 0.5375, + "step": 3108 + }, + { + "epoch": 3.4776286353467563, + "grad_norm": 0.5743698477745056, + "learning_rate": 1.9422431544804797e-06, + "loss": 0.5407, + "step": 3109 + }, + { + "epoch": 3.4787472035794185, + "grad_norm": 0.5736781358718872, + "learning_rate": 1.9407888425128663e-06, + "loss": 0.5332, + "step": 3110 + }, + { + "epoch": 3.4798657718120807, + "grad_norm": 0.5804797410964966, + "learning_rate": 1.939334729724367e-06, + "loss": 0.5131, + "step": 3111 + }, + { + "epoch": 3.480984340044743, + "grad_norm": 0.5829744338989258, + "learning_rate": 1.9378808166329085e-06, + "loss": 0.5518, + "step": 3112 + }, + { + "epoch": 3.482102908277405, + "grad_norm": 0.6032028794288635, + "learning_rate": 1.936427103756341e-06, + "loss": 0.5632, + "step": 3113 + }, + { + "epoch": 3.4832214765100673, + "grad_norm": 0.615389347076416, + "learning_rate": 1.9349735916124486e-06, + "loss": 0.5583, + "step": 3114 + }, + { + "epoch": 3.484340044742729, + "grad_norm": 0.5674188733100891, + "learning_rate": 1.93352028071894e-06, + "loss": 0.5155, + "step": 3115 + }, + { + "epoch": 3.4854586129753917, + "grad_norm": 0.5841509699821472, + "learning_rate": 1.9320671715934534e-06, + "loss": 0.535, + "step": 3116 + }, + { + "epoch": 3.4865771812080535, + "grad_norm": 0.5757308006286621, + "learning_rate": 1.9306142647535564e-06, + "loss": 0.5329, + "step": 3117 + }, + { + "epoch": 3.4876957494407157, + "grad_norm": 0.6070057153701782, + "learning_rate": 1.929161560716742e-06, + "loss": 0.5431, + "step": 3118 + }, + { + "epoch": 3.488814317673378, + "grad_norm": 0.5943608283996582, + "learning_rate": 1.927709060000434e-06, + "loss": 0.5612, + "step": 3119 + }, + { + "epoch": 3.48993288590604, + "grad_norm": 0.6025094985961914, + "learning_rate": 1.9262567631219813e-06, + "loss": 0.5284, + "step": 3120 + }, + { + "epoch": 3.4910514541387023, + "grad_norm": 0.5863309502601624, + "learning_rate": 1.924804670598662e-06, + "loss": 0.5434, + "step": 3121 + }, + { + "epoch": 3.4921700223713645, + "grad_norm": 0.5863771438598633, + "learning_rate": 1.923352782947679e-06, + "loss": 0.5344, + "step": 3122 + }, + { + "epoch": 3.4932885906040267, + "grad_norm": 0.606030285358429, + "learning_rate": 1.921901100686166e-06, + "loss": 0.5652, + "step": 3123 + }, + { + "epoch": 3.494407158836689, + "grad_norm": 0.5807695984840393, + "learning_rate": 1.920449624331179e-06, + "loss": 0.512, + "step": 3124 + }, + { + "epoch": 3.495525727069351, + "grad_norm": 0.5898115038871765, + "learning_rate": 1.9189983543997055e-06, + "loss": 0.5143, + "step": 3125 + }, + { + "epoch": 3.4966442953020134, + "grad_norm": 0.5925139784812927, + "learning_rate": 1.9175472914086553e-06, + "loss": 0.5529, + "step": 3126 + }, + { + "epoch": 3.4977628635346756, + "grad_norm": 0.5879681706428528, + "learning_rate": 1.9160964358748652e-06, + "loss": 0.5486, + "step": 3127 + }, + { + "epoch": 3.498881431767338, + "grad_norm": 0.5888491272926331, + "learning_rate": 1.9146457883151017e-06, + "loss": 0.5419, + "step": 3128 + }, + { + "epoch": 3.5, + "grad_norm": 0.5992128849029541, + "learning_rate": 1.9131953492460526e-06, + "loss": 0.5342, + "step": 3129 + }, + { + "epoch": 3.501118568232662, + "grad_norm": 0.5754707455635071, + "learning_rate": 1.9117451191843344e-06, + "loss": 0.529, + "step": 3130 + }, + { + "epoch": 3.5022371364653244, + "grad_norm": 0.5758741497993469, + "learning_rate": 1.910295098646487e-06, + "loss": 0.521, + "step": 3131 + }, + { + "epoch": 3.5033557046979866, + "grad_norm": 0.5902168154716492, + "learning_rate": 1.908845288148979e-06, + "loss": 0.5208, + "step": 3132 + }, + { + "epoch": 3.504474272930649, + "grad_norm": 0.6037560105323792, + "learning_rate": 1.9073956882081996e-06, + "loss": 0.5517, + "step": 3133 + }, + { + "epoch": 3.505592841163311, + "grad_norm": 0.588798999786377, + "learning_rate": 1.9059462993404676e-06, + "loss": 0.5362, + "step": 3134 + }, + { + "epoch": 3.5067114093959733, + "grad_norm": 0.5682362914085388, + "learning_rate": 1.9044971220620224e-06, + "loss": 0.5369, + "step": 3135 + }, + { + "epoch": 3.5078299776286355, + "grad_norm": 0.597952663898468, + "learning_rate": 1.9030481568890322e-06, + "loss": 0.5497, + "step": 3136 + }, + { + "epoch": 3.5089485458612977, + "grad_norm": 0.6021576523780823, + "learning_rate": 1.9015994043375872e-06, + "loss": 0.5771, + "step": 3137 + }, + { + "epoch": 3.51006711409396, + "grad_norm": 0.5880240797996521, + "learning_rate": 1.9001508649237e-06, + "loss": 0.5608, + "step": 3138 + }, + { + "epoch": 3.511185682326622, + "grad_norm": 0.595424234867096, + "learning_rate": 1.898702539163312e-06, + "loss": 0.5453, + "step": 3139 + }, + { + "epoch": 3.512304250559284, + "grad_norm": 0.5903159976005554, + "learning_rate": 1.8972544275722849e-06, + "loss": 0.5654, + "step": 3140 + }, + { + "epoch": 3.5134228187919465, + "grad_norm": 0.5835764408111572, + "learning_rate": 1.8958065306664056e-06, + "loss": 0.5236, + "step": 3141 + }, + { + "epoch": 3.5145413870246083, + "grad_norm": 0.5918243527412415, + "learning_rate": 1.894358848961383e-06, + "loss": 0.5412, + "step": 3142 + }, + { + "epoch": 3.515659955257271, + "grad_norm": 0.6037173271179199, + "learning_rate": 1.892911382972853e-06, + "loss": 0.537, + "step": 3143 + }, + { + "epoch": 3.5167785234899327, + "grad_norm": 0.5999971628189087, + "learning_rate": 1.8914641332163702e-06, + "loss": 0.5596, + "step": 3144 + }, + { + "epoch": 3.5178970917225953, + "grad_norm": 0.5568059682846069, + "learning_rate": 1.890017100207415e-06, + "loss": 0.5095, + "step": 3145 + }, + { + "epoch": 3.519015659955257, + "grad_norm": 0.5705674886703491, + "learning_rate": 1.8885702844613896e-06, + "loss": 0.5277, + "step": 3146 + }, + { + "epoch": 3.5201342281879193, + "grad_norm": 0.59906005859375, + "learning_rate": 1.8871236864936176e-06, + "loss": 0.5466, + "step": 3147 + }, + { + "epoch": 3.5212527964205815, + "grad_norm": 0.5792226195335388, + "learning_rate": 1.885677306819349e-06, + "loss": 0.5412, + "step": 3148 + }, + { + "epoch": 3.5223713646532437, + "grad_norm": 0.5757958292961121, + "learning_rate": 1.8842311459537516e-06, + "loss": 0.5244, + "step": 3149 + }, + { + "epoch": 3.523489932885906, + "grad_norm": 0.5833378434181213, + "learning_rate": 1.8827852044119182e-06, + "loss": 0.5622, + "step": 3150 + }, + { + "epoch": 3.524608501118568, + "grad_norm": 0.5563960075378418, + "learning_rate": 1.881339482708861e-06, + "loss": 0.5234, + "step": 3151 + }, + { + "epoch": 3.5257270693512304, + "grad_norm": 0.5982696413993835, + "learning_rate": 1.8798939813595169e-06, + "loss": 0.5463, + "step": 3152 + }, + { + "epoch": 3.5268456375838926, + "grad_norm": 0.5711531639099121, + "learning_rate": 1.8784487008787416e-06, + "loss": 0.5494, + "step": 3153 + }, + { + "epoch": 3.527964205816555, + "grad_norm": 0.5683625936508179, + "learning_rate": 1.8770036417813142e-06, + "loss": 0.5445, + "step": 3154 + }, + { + "epoch": 3.529082774049217, + "grad_norm": 0.5892717242240906, + "learning_rate": 1.8755588045819325e-06, + "loss": 0.5511, + "step": 3155 + }, + { + "epoch": 3.530201342281879, + "grad_norm": 0.59117192029953, + "learning_rate": 1.874114189795219e-06, + "loss": 0.5285, + "step": 3156 + }, + { + "epoch": 3.5313199105145414, + "grad_norm": 0.5776271224021912, + "learning_rate": 1.8726697979357134e-06, + "loss": 0.5561, + "step": 3157 + }, + { + "epoch": 3.5324384787472036, + "grad_norm": 0.5786055326461792, + "learning_rate": 1.8712256295178767e-06, + "loss": 0.54, + "step": 3158 + }, + { + "epoch": 3.533557046979866, + "grad_norm": 0.5960258841514587, + "learning_rate": 1.8697816850560925e-06, + "loss": 0.5295, + "step": 3159 + }, + { + "epoch": 3.534675615212528, + "grad_norm": 0.588821291923523, + "learning_rate": 1.8683379650646607e-06, + "loss": 0.5325, + "step": 3160 + }, + { + "epoch": 3.5357941834451903, + "grad_norm": 0.5906977653503418, + "learning_rate": 1.8668944700578067e-06, + "loss": 0.5619, + "step": 3161 + }, + { + "epoch": 3.5369127516778525, + "grad_norm": 0.594870924949646, + "learning_rate": 1.8654512005496706e-06, + "loss": 0.515, + "step": 3162 + }, + { + "epoch": 3.5380313199105147, + "grad_norm": 0.5749700665473938, + "learning_rate": 1.8640081570543157e-06, + "loss": 0.5304, + "step": 3163 + }, + { + "epoch": 3.539149888143177, + "grad_norm": 0.6075375080108643, + "learning_rate": 1.8625653400857219e-06, + "loss": 0.5391, + "step": 3164 + }, + { + "epoch": 3.540268456375839, + "grad_norm": 0.5842822194099426, + "learning_rate": 1.861122750157791e-06, + "loss": 0.5269, + "step": 3165 + }, + { + "epoch": 3.5413870246085013, + "grad_norm": 0.5921899676322937, + "learning_rate": 1.8596803877843418e-06, + "loss": 0.5266, + "step": 3166 + }, + { + "epoch": 3.542505592841163, + "grad_norm": 0.6025802493095398, + "learning_rate": 1.8582382534791145e-06, + "loss": 0.5334, + "step": 3167 + }, + { + "epoch": 3.5436241610738257, + "grad_norm": 0.5829567313194275, + "learning_rate": 1.856796347755766e-06, + "loss": 0.534, + "step": 3168 + }, + { + "epoch": 3.5447427293064875, + "grad_norm": 0.579674243927002, + "learning_rate": 1.8553546711278718e-06, + "loss": 0.5397, + "step": 3169 + }, + { + "epoch": 3.54586129753915, + "grad_norm": 0.597607433795929, + "learning_rate": 1.8539132241089274e-06, + "loss": 0.5284, + "step": 3170 + }, + { + "epoch": 3.546979865771812, + "grad_norm": 0.5941487550735474, + "learning_rate": 1.8524720072123442e-06, + "loss": 0.5415, + "step": 3171 + }, + { + "epoch": 3.548098434004474, + "grad_norm": 0.5870916247367859, + "learning_rate": 1.8510310209514548e-06, + "loss": 0.5196, + "step": 3172 + }, + { + "epoch": 3.5492170022371363, + "grad_norm": 0.5694733262062073, + "learning_rate": 1.8495902658395064e-06, + "loss": 0.523, + "step": 3173 + }, + { + "epoch": 3.5503355704697985, + "grad_norm": 0.5773245692253113, + "learning_rate": 1.8481497423896669e-06, + "loss": 0.5219, + "step": 3174 + }, + { + "epoch": 3.5514541387024607, + "grad_norm": 0.5973610281944275, + "learning_rate": 1.8467094511150177e-06, + "loss": 0.533, + "step": 3175 + }, + { + "epoch": 3.552572706935123, + "grad_norm": 0.5735256671905518, + "learning_rate": 1.8452693925285626e-06, + "loss": 0.5365, + "step": 3176 + }, + { + "epoch": 3.553691275167785, + "grad_norm": 0.5804485082626343, + "learning_rate": 1.8438295671432176e-06, + "loss": 0.5404, + "step": 3177 + }, + { + "epoch": 3.5548098434004474, + "grad_norm": 0.5760172605514526, + "learning_rate": 1.8423899754718195e-06, + "loss": 0.5161, + "step": 3178 + }, + { + "epoch": 3.5559284116331096, + "grad_norm": 0.6126583814620972, + "learning_rate": 1.8409506180271195e-06, + "loss": 0.5345, + "step": 3179 + }, + { + "epoch": 3.557046979865772, + "grad_norm": 0.6027931571006775, + "learning_rate": 1.8395114953217853e-06, + "loss": 0.5549, + "step": 3180 + }, + { + "epoch": 3.558165548098434, + "grad_norm": 0.5928865671157837, + "learning_rate": 1.838072607868403e-06, + "loss": 0.5458, + "step": 3181 + }, + { + "epoch": 3.559284116331096, + "grad_norm": 0.5712553262710571, + "learning_rate": 1.8366339561794732e-06, + "loss": 0.5316, + "step": 3182 + }, + { + "epoch": 3.5604026845637584, + "grad_norm": 0.5772141814231873, + "learning_rate": 1.8351955407674137e-06, + "loss": 0.5242, + "step": 3183 + }, + { + "epoch": 3.5615212527964206, + "grad_norm": 0.5941332578659058, + "learning_rate": 1.8337573621445555e-06, + "loss": 0.5362, + "step": 3184 + }, + { + "epoch": 3.562639821029083, + "grad_norm": 0.5654463768005371, + "learning_rate": 1.8323194208231497e-06, + "loss": 0.5366, + "step": 3185 + }, + { + "epoch": 3.563758389261745, + "grad_norm": 0.574494481086731, + "learning_rate": 1.8308817173153592e-06, + "loss": 0.5381, + "step": 3186 + }, + { + "epoch": 3.5648769574944073, + "grad_norm": 0.5847533345222473, + "learning_rate": 1.8294442521332637e-06, + "loss": 0.5607, + "step": 3187 + }, + { + "epoch": 3.5659955257270695, + "grad_norm": 0.5897619724273682, + "learning_rate": 1.8280070257888565e-06, + "loss": 0.5622, + "step": 3188 + }, + { + "epoch": 3.5671140939597317, + "grad_norm": 0.5827111601829529, + "learning_rate": 1.8265700387940493e-06, + "loss": 0.5234, + "step": 3189 + }, + { + "epoch": 3.568232662192394, + "grad_norm": 0.5691210627555847, + "learning_rate": 1.825133291660665e-06, + "loss": 0.5419, + "step": 3190 + }, + { + "epoch": 3.569351230425056, + "grad_norm": 0.5886090993881226, + "learning_rate": 1.8236967849004411e-06, + "loss": 0.5281, + "step": 3191 + }, + { + "epoch": 3.570469798657718, + "grad_norm": 0.5840694904327393, + "learning_rate": 1.8222605190250331e-06, + "loss": 0.5263, + "step": 3192 + }, + { + "epoch": 3.5715883668903805, + "grad_norm": 0.60052490234375, + "learning_rate": 1.8208244945460069e-06, + "loss": 0.5381, + "step": 3193 + }, + { + "epoch": 3.5727069351230423, + "grad_norm": 0.5887637734413147, + "learning_rate": 1.8193887119748446e-06, + "loss": 0.5361, + "step": 3194 + }, + { + "epoch": 3.573825503355705, + "grad_norm": 0.5854399800300598, + "learning_rate": 1.8179531718229398e-06, + "loss": 0.5314, + "step": 3195 + }, + { + "epoch": 3.5749440715883667, + "grad_norm": 0.5906941890716553, + "learning_rate": 1.8165178746016042e-06, + "loss": 0.5153, + "step": 3196 + }, + { + "epoch": 3.5760626398210293, + "grad_norm": 0.6009395122528076, + "learning_rate": 1.8150828208220577e-06, + "loss": 0.5434, + "step": 3197 + }, + { + "epoch": 3.577181208053691, + "grad_norm": 0.5883455276489258, + "learning_rate": 1.8136480109954371e-06, + "loss": 0.5238, + "step": 3198 + }, + { + "epoch": 3.5782997762863533, + "grad_norm": 0.6168086528778076, + "learning_rate": 1.8122134456327912e-06, + "loss": 0.5567, + "step": 3199 + }, + { + "epoch": 3.5794183445190155, + "grad_norm": 0.5873827934265137, + "learning_rate": 1.8107791252450795e-06, + "loss": 0.556, + "step": 3200 + }, + { + "epoch": 3.5805369127516777, + "grad_norm": 0.5931955575942993, + "learning_rate": 1.8093450503431798e-06, + "loss": 0.5443, + "step": 3201 + }, + { + "epoch": 3.58165548098434, + "grad_norm": 0.5920387506484985, + "learning_rate": 1.8079112214378769e-06, + "loss": 0.4938, + "step": 3202 + }, + { + "epoch": 3.582774049217002, + "grad_norm": 0.6016706824302673, + "learning_rate": 1.8064776390398708e-06, + "loss": 0.5635, + "step": 3203 + }, + { + "epoch": 3.5838926174496644, + "grad_norm": 0.5818331837654114, + "learning_rate": 1.805044303659772e-06, + "loss": 0.5549, + "step": 3204 + }, + { + "epoch": 3.5850111856823266, + "grad_norm": 0.5680695176124573, + "learning_rate": 1.8036112158081062e-06, + "loss": 0.4904, + "step": 3205 + }, + { + "epoch": 3.586129753914989, + "grad_norm": 0.6140836477279663, + "learning_rate": 1.802178375995307e-06, + "loss": 0.567, + "step": 3206 + }, + { + "epoch": 3.587248322147651, + "grad_norm": 0.6097182035446167, + "learning_rate": 1.8007457847317222e-06, + "loss": 0.5758, + "step": 3207 + }, + { + "epoch": 3.588366890380313, + "grad_norm": 0.5881671905517578, + "learning_rate": 1.7993134425276095e-06, + "loss": 0.5331, + "step": 3208 + }, + { + "epoch": 3.5894854586129754, + "grad_norm": 0.576551079750061, + "learning_rate": 1.7978813498931403e-06, + "loss": 0.519, + "step": 3209 + }, + { + "epoch": 3.5906040268456376, + "grad_norm": 0.6039683818817139, + "learning_rate": 1.7964495073383947e-06, + "loss": 0.5351, + "step": 3210 + }, + { + "epoch": 3.5917225950783, + "grad_norm": 0.5941099524497986, + "learning_rate": 1.7950179153733637e-06, + "loss": 0.5673, + "step": 3211 + }, + { + "epoch": 3.592841163310962, + "grad_norm": 0.5799036026000977, + "learning_rate": 1.7935865745079511e-06, + "loss": 0.4867, + "step": 3212 + }, + { + "epoch": 3.5939597315436242, + "grad_norm": 0.5967704057693481, + "learning_rate": 1.7921554852519685e-06, + "loss": 0.5574, + "step": 3213 + }, + { + "epoch": 3.5950782997762865, + "grad_norm": 0.57224041223526, + "learning_rate": 1.7907246481151424e-06, + "loss": 0.5035, + "step": 3214 + }, + { + "epoch": 3.5961968680089487, + "grad_norm": 0.5907691717147827, + "learning_rate": 1.789294063607103e-06, + "loss": 0.5302, + "step": 3215 + }, + { + "epoch": 3.597315436241611, + "grad_norm": 0.6033266186714172, + "learning_rate": 1.7878637322373965e-06, + "loss": 0.578, + "step": 3216 + }, + { + "epoch": 3.598434004474273, + "grad_norm": 0.5928179025650024, + "learning_rate": 1.7864336545154757e-06, + "loss": 0.5417, + "step": 3217 + }, + { + "epoch": 3.5995525727069353, + "grad_norm": 0.5765846967697144, + "learning_rate": 1.785003830950704e-06, + "loss": 0.5391, + "step": 3218 + }, + { + "epoch": 3.600671140939597, + "grad_norm": 0.5763617753982544, + "learning_rate": 1.7835742620523534e-06, + "loss": 0.5375, + "step": 3219 + }, + { + "epoch": 3.6017897091722597, + "grad_norm": 0.6030755639076233, + "learning_rate": 1.7821449483296073e-06, + "loss": 0.5336, + "step": 3220 + }, + { + "epoch": 3.6029082774049215, + "grad_norm": 0.5732738971710205, + "learning_rate": 1.7807158902915561e-06, + "loss": 0.5241, + "step": 3221 + }, + { + "epoch": 3.604026845637584, + "grad_norm": 0.5835790634155273, + "learning_rate": 1.7792870884471991e-06, + "loss": 0.5498, + "step": 3222 + }, + { + "epoch": 3.605145413870246, + "grad_norm": 0.586104691028595, + "learning_rate": 1.7778585433054469e-06, + "loss": 0.5244, + "step": 3223 + }, + { + "epoch": 3.6062639821029085, + "grad_norm": 0.5874592661857605, + "learning_rate": 1.7764302553751145e-06, + "loss": 0.5293, + "step": 3224 + }, + { + "epoch": 3.6073825503355703, + "grad_norm": 0.5961647033691406, + "learning_rate": 1.77500222516493e-06, + "loss": 0.5552, + "step": 3225 + }, + { + "epoch": 3.6085011185682325, + "grad_norm": 0.6169877648353577, + "learning_rate": 1.7735744531835258e-06, + "loss": 0.531, + "step": 3226 + }, + { + "epoch": 3.6096196868008947, + "grad_norm": 0.6003921031951904, + "learning_rate": 1.7721469399394454e-06, + "loss": 0.5645, + "step": 3227 + }, + { + "epoch": 3.610738255033557, + "grad_norm": 0.5881280303001404, + "learning_rate": 1.7707196859411364e-06, + "loss": 0.549, + "step": 3228 + }, + { + "epoch": 3.611856823266219, + "grad_norm": 0.6043563485145569, + "learning_rate": 1.7692926916969588e-06, + "loss": 0.5384, + "step": 3229 + }, + { + "epoch": 3.6129753914988814, + "grad_norm": 0.5928142070770264, + "learning_rate": 1.7678659577151759e-06, + "loss": 0.5226, + "step": 3230 + }, + { + "epoch": 3.6140939597315436, + "grad_norm": 0.603988528251648, + "learning_rate": 1.766439484503961e-06, + "loss": 0.5471, + "step": 3231 + }, + { + "epoch": 3.615212527964206, + "grad_norm": 0.6013253331184387, + "learning_rate": 1.765013272571393e-06, + "loss": 0.5563, + "step": 3232 + }, + { + "epoch": 3.616331096196868, + "grad_norm": 0.592201292514801, + "learning_rate": 1.7635873224254571e-06, + "loss": 0.5475, + "step": 3233 + }, + { + "epoch": 3.61744966442953, + "grad_norm": 0.5934378504753113, + "learning_rate": 1.7621616345740488e-06, + "loss": 0.5444, + "step": 3234 + }, + { + "epoch": 3.6185682326621924, + "grad_norm": 0.5830172896385193, + "learning_rate": 1.760736209524966e-06, + "loss": 0.5209, + "step": 3235 + }, + { + "epoch": 3.6196868008948546, + "grad_norm": 0.6052092909812927, + "learning_rate": 1.7593110477859155e-06, + "loss": 0.5404, + "step": 3236 + }, + { + "epoch": 3.620805369127517, + "grad_norm": 0.591371476650238, + "learning_rate": 1.757886149864509e-06, + "loss": 0.5158, + "step": 3237 + }, + { + "epoch": 3.621923937360179, + "grad_norm": 0.5898125767707825, + "learning_rate": 1.7564615162682661e-06, + "loss": 0.5404, + "step": 3238 + }, + { + "epoch": 3.6230425055928412, + "grad_norm": 0.5633562207221985, + "learning_rate": 1.7550371475046088e-06, + "loss": 0.5189, + "step": 3239 + }, + { + "epoch": 3.6241610738255035, + "grad_norm": 0.592021107673645, + "learning_rate": 1.7536130440808693e-06, + "loss": 0.5388, + "step": 3240 + }, + { + "epoch": 3.6252796420581657, + "grad_norm": 0.5834406018257141, + "learning_rate": 1.7521892065042812e-06, + "loss": 0.5332, + "step": 3241 + }, + { + "epoch": 3.626398210290828, + "grad_norm": 0.5873613357543945, + "learning_rate": 1.7507656352819866e-06, + "loss": 0.5377, + "step": 3242 + }, + { + "epoch": 3.62751677852349, + "grad_norm": 0.5920371413230896, + "learning_rate": 1.7493423309210305e-06, + "loss": 0.541, + "step": 3243 + }, + { + "epoch": 3.6286353467561523, + "grad_norm": 0.5716673135757446, + "learning_rate": 1.7479192939283623e-06, + "loss": 0.5165, + "step": 3244 + }, + { + "epoch": 3.6297539149888145, + "grad_norm": 0.5592018365859985, + "learning_rate": 1.74649652481084e-06, + "loss": 0.5396, + "step": 3245 + }, + { + "epoch": 3.6308724832214763, + "grad_norm": 0.5928216576576233, + "learning_rate": 1.7450740240752223e-06, + "loss": 0.5241, + "step": 3246 + }, + { + "epoch": 3.631991051454139, + "grad_norm": 0.6018883585929871, + "learning_rate": 1.743651792228174e-06, + "loss": 0.5304, + "step": 3247 + }, + { + "epoch": 3.6331096196868007, + "grad_norm": 0.6035928726196289, + "learning_rate": 1.742229829776263e-06, + "loss": 0.5413, + "step": 3248 + }, + { + "epoch": 3.6342281879194633, + "grad_norm": 0.5754979848861694, + "learning_rate": 1.7408081372259633e-06, + "loss": 0.5257, + "step": 3249 + }, + { + "epoch": 3.635346756152125, + "grad_norm": 0.5860834121704102, + "learning_rate": 1.739386715083651e-06, + "loss": 0.5468, + "step": 3250 + }, + { + "epoch": 3.6364653243847873, + "grad_norm": 0.5843851566314697, + "learning_rate": 1.737965563855607e-06, + "loss": 0.5007, + "step": 3251 + }, + { + "epoch": 3.6375838926174495, + "grad_norm": 0.6038687229156494, + "learning_rate": 1.7365446840480133e-06, + "loss": 0.5398, + "step": 3252 + }, + { + "epoch": 3.6387024608501117, + "grad_norm": 0.5902573466300964, + "learning_rate": 1.7351240761669597e-06, + "loss": 0.5571, + "step": 3253 + }, + { + "epoch": 3.639821029082774, + "grad_norm": 0.5991767048835754, + "learning_rate": 1.7337037407184347e-06, + "loss": 0.5844, + "step": 3254 + }, + { + "epoch": 3.640939597315436, + "grad_norm": 0.6047876477241516, + "learning_rate": 1.7322836782083318e-06, + "loss": 0.5544, + "step": 3255 + }, + { + "epoch": 3.6420581655480984, + "grad_norm": 0.5845467448234558, + "learning_rate": 1.7308638891424476e-06, + "loss": 0.5197, + "step": 3256 + }, + { + "epoch": 3.6431767337807606, + "grad_norm": 0.5900623202323914, + "learning_rate": 1.7294443740264795e-06, + "loss": 0.5349, + "step": 3257 + }, + { + "epoch": 3.6442953020134228, + "grad_norm": 0.5952261686325073, + "learning_rate": 1.7280251333660303e-06, + "loss": 0.5344, + "step": 3258 + }, + { + "epoch": 3.645413870246085, + "grad_norm": 0.600703775882721, + "learning_rate": 1.7266061676666024e-06, + "loss": 0.553, + "step": 3259 + }, + { + "epoch": 3.646532438478747, + "grad_norm": 0.597102165222168, + "learning_rate": 1.7251874774336015e-06, + "loss": 0.5394, + "step": 3260 + }, + { + "epoch": 3.6476510067114094, + "grad_norm": 0.5843111872673035, + "learning_rate": 1.7237690631723335e-06, + "loss": 0.5429, + "step": 3261 + }, + { + "epoch": 3.6487695749440716, + "grad_norm": 0.5957069993019104, + "learning_rate": 1.7223509253880099e-06, + "loss": 0.5612, + "step": 3262 + }, + { + "epoch": 3.649888143176734, + "grad_norm": 0.5827816128730774, + "learning_rate": 1.7209330645857402e-06, + "loss": 0.5311, + "step": 3263 + }, + { + "epoch": 3.651006711409396, + "grad_norm": 0.5804867744445801, + "learning_rate": 1.7195154812705344e-06, + "loss": 0.5138, + "step": 3264 + }, + { + "epoch": 3.6521252796420582, + "grad_norm": 0.5843037366867065, + "learning_rate": 1.718098175947308e-06, + "loss": 0.5294, + "step": 3265 + }, + { + "epoch": 3.6532438478747205, + "grad_norm": 0.5882619619369507, + "learning_rate": 1.7166811491208736e-06, + "loss": 0.5607, + "step": 3266 + }, + { + "epoch": 3.6543624161073827, + "grad_norm": 0.6183314323425293, + "learning_rate": 1.7152644012959468e-06, + "loss": 0.5622, + "step": 3267 + }, + { + "epoch": 3.655480984340045, + "grad_norm": 0.6009407043457031, + "learning_rate": 1.7138479329771419e-06, + "loss": 0.5569, + "step": 3268 + }, + { + "epoch": 3.656599552572707, + "grad_norm": 0.6010070443153381, + "learning_rate": 1.7124317446689765e-06, + "loss": 0.5243, + "step": 3269 + }, + { + "epoch": 3.6577181208053693, + "grad_norm": 0.5998405814170837, + "learning_rate": 1.711015836875865e-06, + "loss": 0.5321, + "step": 3270 + }, + { + "epoch": 3.658836689038031, + "grad_norm": 0.5975531935691833, + "learning_rate": 1.709600210102125e-06, + "loss": 0.5377, + "step": 3271 + }, + { + "epoch": 3.6599552572706937, + "grad_norm": 0.6024014353752136, + "learning_rate": 1.708184864851971e-06, + "loss": 0.5588, + "step": 3272 + }, + { + "epoch": 3.6610738255033555, + "grad_norm": 0.5895260572433472, + "learning_rate": 1.706769801629521e-06, + "loss": 0.5242, + "step": 3273 + }, + { + "epoch": 3.662192393736018, + "grad_norm": 0.5740838050842285, + "learning_rate": 1.7053550209387892e-06, + "loss": 0.5426, + "step": 3274 + }, + { + "epoch": 3.66331096196868, + "grad_norm": 0.5972879528999329, + "learning_rate": 1.7039405232836902e-06, + "loss": 0.5239, + "step": 3275 + }, + { + "epoch": 3.6644295302013425, + "grad_norm": 0.5864456295967102, + "learning_rate": 1.7025263091680387e-06, + "loss": 0.5342, + "step": 3276 + }, + { + "epoch": 3.6655480984340043, + "grad_norm": 0.6112748980522156, + "learning_rate": 1.7011123790955458e-06, + "loss": 0.5411, + "step": 3277 + }, + { + "epoch": 3.6666666666666665, + "grad_norm": 0.6074910163879395, + "learning_rate": 1.6996987335698261e-06, + "loss": 0.5169, + "step": 3278 + }, + { + "epoch": 3.6677852348993287, + "grad_norm": 0.5817574858665466, + "learning_rate": 1.6982853730943882e-06, + "loss": 0.5399, + "step": 3279 + }, + { + "epoch": 3.668903803131991, + "grad_norm": 0.597766637802124, + "learning_rate": 1.6968722981726421e-06, + "loss": 0.5509, + "step": 3280 + }, + { + "epoch": 3.670022371364653, + "grad_norm": 0.5937582850456238, + "learning_rate": 1.695459509307893e-06, + "loss": 0.5551, + "step": 3281 + }, + { + "epoch": 3.6711409395973154, + "grad_norm": 0.577810525894165, + "learning_rate": 1.6940470070033491e-06, + "loss": 0.5252, + "step": 3282 + }, + { + "epoch": 3.6722595078299776, + "grad_norm": 0.590491771697998, + "learning_rate": 1.6926347917621118e-06, + "loss": 0.535, + "step": 3283 + }, + { + "epoch": 3.6733780760626398, + "grad_norm": 0.5885805487632751, + "learning_rate": 1.6912228640871825e-06, + "loss": 0.554, + "step": 3284 + }, + { + "epoch": 3.674496644295302, + "grad_norm": 0.5971885323524475, + "learning_rate": 1.6898112244814603e-06, + "loss": 0.5338, + "step": 3285 + }, + { + "epoch": 3.675615212527964, + "grad_norm": 0.5775821805000305, + "learning_rate": 1.6883998734477398e-06, + "loss": 0.5278, + "step": 3286 + }, + { + "epoch": 3.6767337807606264, + "grad_norm": 0.5998420119285583, + "learning_rate": 1.6869888114887156e-06, + "loss": 0.5226, + "step": 3287 + }, + { + "epoch": 3.6778523489932886, + "grad_norm": 0.6010130047798157, + "learning_rate": 1.6855780391069776e-06, + "loss": 0.5476, + "step": 3288 + }, + { + "epoch": 3.678970917225951, + "grad_norm": 0.5722376704216003, + "learning_rate": 1.684167556805013e-06, + "loss": 0.5362, + "step": 3289 + }, + { + "epoch": 3.680089485458613, + "grad_norm": 0.5952475070953369, + "learning_rate": 1.682757365085205e-06, + "loss": 0.5506, + "step": 3290 + }, + { + "epoch": 3.6812080536912752, + "grad_norm": 0.5949262976646423, + "learning_rate": 1.6813474644498345e-06, + "loss": 0.5487, + "step": 3291 + }, + { + "epoch": 3.6823266219239374, + "grad_norm": 0.5895739197731018, + "learning_rate": 1.6799378554010773e-06, + "loss": 0.5252, + "step": 3292 + }, + { + "epoch": 3.6834451901565997, + "grad_norm": 0.5880127549171448, + "learning_rate": 1.6785285384410074e-06, + "loss": 0.5473, + "step": 3293 + }, + { + "epoch": 3.684563758389262, + "grad_norm": 0.5945895314216614, + "learning_rate": 1.6771195140715923e-06, + "loss": 0.5638, + "step": 3294 + }, + { + "epoch": 3.685682326621924, + "grad_norm": 0.5986477732658386, + "learning_rate": 1.6757107827946978e-06, + "loss": 0.5517, + "step": 3295 + }, + { + "epoch": 3.6868008948545863, + "grad_norm": 0.6004531383514404, + "learning_rate": 1.6743023451120831e-06, + "loss": 0.5506, + "step": 3296 + }, + { + "epoch": 3.6879194630872485, + "grad_norm": 0.5891329050064087, + "learning_rate": 1.672894201525403e-06, + "loss": 0.5549, + "step": 3297 + }, + { + "epoch": 3.6890380313199103, + "grad_norm": 0.594777524471283, + "learning_rate": 1.6714863525362098e-06, + "loss": 0.5318, + "step": 3298 + }, + { + "epoch": 3.690156599552573, + "grad_norm": 0.5795633792877197, + "learning_rate": 1.6700787986459485e-06, + "loss": 0.527, + "step": 3299 + }, + { + "epoch": 3.6912751677852347, + "grad_norm": 0.6074640154838562, + "learning_rate": 1.6686715403559606e-06, + "loss": 0.5373, + "step": 3300 + }, + { + "epoch": 3.6923937360178973, + "grad_norm": 0.5706025958061218, + "learning_rate": 1.6672645781674797e-06, + "loss": 0.5131, + "step": 3301 + }, + { + "epoch": 3.693512304250559, + "grad_norm": 0.606348991394043, + "learning_rate": 1.6658579125816385e-06, + "loss": 0.5602, + "step": 3302 + }, + { + "epoch": 3.6946308724832218, + "grad_norm": 0.5676276683807373, + "learning_rate": 1.6644515440994591e-06, + "loss": 0.5545, + "step": 3303 + }, + { + "epoch": 3.6957494407158835, + "grad_norm": 0.5949615240097046, + "learning_rate": 1.6630454732218615e-06, + "loss": 0.5608, + "step": 3304 + }, + { + "epoch": 3.6968680089485457, + "grad_norm": 0.5761768817901611, + "learning_rate": 1.6616397004496569e-06, + "loss": 0.509, + "step": 3305 + }, + { + "epoch": 3.697986577181208, + "grad_norm": 0.5956002473831177, + "learning_rate": 1.660234226283553e-06, + "loss": 0.5533, + "step": 3306 + }, + { + "epoch": 3.69910514541387, + "grad_norm": 0.5927191972732544, + "learning_rate": 1.6588290512241494e-06, + "loss": 0.525, + "step": 3307 + }, + { + "epoch": 3.7002237136465324, + "grad_norm": 0.5845590233802795, + "learning_rate": 1.6574241757719387e-06, + "loss": 0.5556, + "step": 3308 + }, + { + "epoch": 3.7013422818791946, + "grad_norm": 0.6003521084785461, + "learning_rate": 1.6560196004273087e-06, + "loss": 0.5372, + "step": 3309 + }, + { + "epoch": 3.7024608501118568, + "grad_norm": 0.576077401638031, + "learning_rate": 1.654615325690538e-06, + "loss": 0.5488, + "step": 3310 + }, + { + "epoch": 3.703579418344519, + "grad_norm": 0.5685194134712219, + "learning_rate": 1.653211352061801e-06, + "loss": 0.5326, + "step": 3311 + }, + { + "epoch": 3.704697986577181, + "grad_norm": 0.5908631086349487, + "learning_rate": 1.651807680041162e-06, + "loss": 0.529, + "step": 3312 + }, + { + "epoch": 3.7058165548098434, + "grad_norm": 0.585866391658783, + "learning_rate": 1.6504043101285799e-06, + "loss": 0.5593, + "step": 3313 + }, + { + "epoch": 3.7069351230425056, + "grad_norm": 0.5837008953094482, + "learning_rate": 1.649001242823904e-06, + "loss": 0.5022, + "step": 3314 + }, + { + "epoch": 3.708053691275168, + "grad_norm": 0.5766224265098572, + "learning_rate": 1.6475984786268792e-06, + "loss": 0.524, + "step": 3315 + }, + { + "epoch": 3.70917225950783, + "grad_norm": 0.5825909972190857, + "learning_rate": 1.6461960180371385e-06, + "loss": 0.5348, + "step": 3316 + }, + { + "epoch": 3.7102908277404922, + "grad_norm": 0.5828694701194763, + "learning_rate": 1.6447938615542081e-06, + "loss": 0.5175, + "step": 3317 + }, + { + "epoch": 3.7114093959731544, + "grad_norm": 0.5782697796821594, + "learning_rate": 1.643392009677508e-06, + "loss": 0.5282, + "step": 3318 + }, + { + "epoch": 3.7125279642058167, + "grad_norm": 0.5827836394309998, + "learning_rate": 1.6419904629063465e-06, + "loss": 0.5236, + "step": 3319 + }, + { + "epoch": 3.713646532438479, + "grad_norm": 0.5921977162361145, + "learning_rate": 1.640589221739926e-06, + "loss": 0.56, + "step": 3320 + }, + { + "epoch": 3.714765100671141, + "grad_norm": 0.5894917249679565, + "learning_rate": 1.639188286677337e-06, + "loss": 0.5454, + "step": 3321 + }, + { + "epoch": 3.7158836689038033, + "grad_norm": 0.5997458696365356, + "learning_rate": 1.637787658217565e-06, + "loss": 0.5224, + "step": 3322 + }, + { + "epoch": 3.717002237136465, + "grad_norm": 0.5967844128608704, + "learning_rate": 1.636387336859483e-06, + "loss": 0.5433, + "step": 3323 + }, + { + "epoch": 3.7181208053691277, + "grad_norm": 0.5754695534706116, + "learning_rate": 1.6349873231018554e-06, + "loss": 0.5355, + "step": 3324 + }, + { + "epoch": 3.7192393736017895, + "grad_norm": 0.5959658622741699, + "learning_rate": 1.6335876174433368e-06, + "loss": 0.5177, + "step": 3325 + }, + { + "epoch": 3.720357941834452, + "grad_norm": 0.6039799451828003, + "learning_rate": 1.6321882203824746e-06, + "loss": 0.5236, + "step": 3326 + }, + { + "epoch": 3.721476510067114, + "grad_norm": 0.608328104019165, + "learning_rate": 1.6307891324177027e-06, + "loss": 0.5421, + "step": 3327 + }, + { + "epoch": 3.7225950782997765, + "grad_norm": 0.5950992703437805, + "learning_rate": 1.6293903540473466e-06, + "loss": 0.533, + "step": 3328 + }, + { + "epoch": 3.7237136465324383, + "grad_norm": 0.6009061932563782, + "learning_rate": 1.627991885769622e-06, + "loss": 0.5391, + "step": 3329 + }, + { + "epoch": 3.7248322147651005, + "grad_norm": 0.5911272764205933, + "learning_rate": 1.6265937280826316e-06, + "loss": 0.5431, + "step": 3330 + }, + { + "epoch": 3.7259507829977627, + "grad_norm": 0.576015293598175, + "learning_rate": 1.6251958814843723e-06, + "loss": 0.5175, + "step": 3331 + }, + { + "epoch": 3.727069351230425, + "grad_norm": 0.5802775621414185, + "learning_rate": 1.6237983464727252e-06, + "loss": 0.5442, + "step": 3332 + }, + { + "epoch": 3.728187919463087, + "grad_norm": 0.5840089917182922, + "learning_rate": 1.6224011235454643e-06, + "loss": 0.5092, + "step": 3333 + }, + { + "epoch": 3.7293064876957494, + "grad_norm": 0.614753246307373, + "learning_rate": 1.6210042132002483e-06, + "loss": 0.5213, + "step": 3334 + }, + { + "epoch": 3.7304250559284116, + "grad_norm": 0.6202266812324524, + "learning_rate": 1.6196076159346292e-06, + "loss": 0.5584, + "step": 3335 + }, + { + "epoch": 3.7315436241610738, + "grad_norm": 0.6043874025344849, + "learning_rate": 1.6182113322460439e-06, + "loss": 0.5615, + "step": 3336 + }, + { + "epoch": 3.732662192393736, + "grad_norm": 0.5924262404441833, + "learning_rate": 1.6168153626318198e-06, + "loss": 0.5253, + "step": 3337 + }, + { + "epoch": 3.733780760626398, + "grad_norm": 0.5696139335632324, + "learning_rate": 1.615419707589171e-06, + "loss": 0.5077, + "step": 3338 + }, + { + "epoch": 3.7348993288590604, + "grad_norm": 0.5858835577964783, + "learning_rate": 1.6140243676151995e-06, + "loss": 0.5321, + "step": 3339 + }, + { + "epoch": 3.7360178970917226, + "grad_norm": 0.5921154022216797, + "learning_rate": 1.6126293432068978e-06, + "loss": 0.5423, + "step": 3340 + }, + { + "epoch": 3.737136465324385, + "grad_norm": 0.5887474417686462, + "learning_rate": 1.6112346348611413e-06, + "loss": 0.5377, + "step": 3341 + }, + { + "epoch": 3.738255033557047, + "grad_norm": 0.5897992253303528, + "learning_rate": 1.6098402430746973e-06, + "loss": 0.5588, + "step": 3342 + }, + { + "epoch": 3.7393736017897092, + "grad_norm": 0.5925105810165405, + "learning_rate": 1.6084461683442176e-06, + "loss": 0.549, + "step": 3343 + }, + { + "epoch": 3.7404921700223714, + "grad_norm": 0.6138824224472046, + "learning_rate": 1.6070524111662428e-06, + "loss": 0.5555, + "step": 3344 + }, + { + "epoch": 3.7416107382550337, + "grad_norm": 0.5957280397415161, + "learning_rate": 1.6056589720371978e-06, + "loss": 0.5408, + "step": 3345 + }, + { + "epoch": 3.742729306487696, + "grad_norm": 0.5976058840751648, + "learning_rate": 1.6042658514533987e-06, + "loss": 0.5512, + "step": 3346 + }, + { + "epoch": 3.743847874720358, + "grad_norm": 0.5899811387062073, + "learning_rate": 1.602873049911043e-06, + "loss": 0.5231, + "step": 3347 + }, + { + "epoch": 3.7449664429530203, + "grad_norm": 0.60574871301651, + "learning_rate": 1.6014805679062185e-06, + "loss": 0.5232, + "step": 3348 + }, + { + "epoch": 3.7460850111856825, + "grad_norm": 0.5882735848426819, + "learning_rate": 1.6000884059348966e-06, + "loss": 0.5251, + "step": 3349 + }, + { + "epoch": 3.7472035794183443, + "grad_norm": 0.57975834608078, + "learning_rate": 1.5986965644929356e-06, + "loss": 0.5601, + "step": 3350 + }, + { + "epoch": 3.748322147651007, + "grad_norm": 0.5691300630569458, + "learning_rate": 1.5973050440760812e-06, + "loss": 0.5205, + "step": 3351 + }, + { + "epoch": 3.7494407158836687, + "grad_norm": 0.593222975730896, + "learning_rate": 1.595913845179962e-06, + "loss": 0.5446, + "step": 3352 + }, + { + "epoch": 3.7505592841163313, + "grad_norm": 0.5908300280570984, + "learning_rate": 1.594522968300095e-06, + "loss": 0.5503, + "step": 3353 + }, + { + "epoch": 3.751677852348993, + "grad_norm": 0.588367760181427, + "learning_rate": 1.5931324139318783e-06, + "loss": 0.5369, + "step": 3354 + }, + { + "epoch": 3.7527964205816557, + "grad_norm": 0.5896555781364441, + "learning_rate": 1.5917421825706008e-06, + "loss": 0.5403, + "step": 3355 + }, + { + "epoch": 3.7539149888143175, + "grad_norm": 0.5824311971664429, + "learning_rate": 1.5903522747114314e-06, + "loss": 0.5402, + "step": 3356 + }, + { + "epoch": 3.7550335570469797, + "grad_norm": 0.6074312925338745, + "learning_rate": 1.5889626908494266e-06, + "loss": 0.5381, + "step": 3357 + }, + { + "epoch": 3.756152125279642, + "grad_norm": 0.5748060345649719, + "learning_rate": 1.5875734314795254e-06, + "loss": 0.522, + "step": 3358 + }, + { + "epoch": 3.757270693512304, + "grad_norm": 0.609924852848053, + "learning_rate": 1.586184497096554e-06, + "loss": 0.5402, + "step": 3359 + }, + { + "epoch": 3.7583892617449663, + "grad_norm": 0.5816012620925903, + "learning_rate": 1.5847958881952207e-06, + "loss": 0.5363, + "step": 3360 + }, + { + "epoch": 3.7595078299776286, + "grad_norm": 0.6067857146263123, + "learning_rate": 1.583407605270118e-06, + "loss": 0.5358, + "step": 3361 + }, + { + "epoch": 3.7606263982102908, + "grad_norm": 0.6021775007247925, + "learning_rate": 1.5820196488157236e-06, + "loss": 0.544, + "step": 3362 + }, + { + "epoch": 3.761744966442953, + "grad_norm": 0.598644495010376, + "learning_rate": 1.5806320193263963e-06, + "loss": 0.5393, + "step": 3363 + }, + { + "epoch": 3.762863534675615, + "grad_norm": 0.5777289867401123, + "learning_rate": 1.5792447172963831e-06, + "loss": 0.525, + "step": 3364 + }, + { + "epoch": 3.7639821029082774, + "grad_norm": 0.6056726574897766, + "learning_rate": 1.5778577432198081e-06, + "loss": 0.558, + "step": 3365 + }, + { + "epoch": 3.7651006711409396, + "grad_norm": 0.5804301500320435, + "learning_rate": 1.5764710975906843e-06, + "loss": 0.535, + "step": 3366 + }, + { + "epoch": 3.766219239373602, + "grad_norm": 0.6163538098335266, + "learning_rate": 1.5750847809029045e-06, + "loss": 0.5254, + "step": 3367 + }, + { + "epoch": 3.767337807606264, + "grad_norm": 0.5800641775131226, + "learning_rate": 1.5736987936502463e-06, + "loss": 0.5248, + "step": 3368 + }, + { + "epoch": 3.7684563758389262, + "grad_norm": 0.5839651226997375, + "learning_rate": 1.5723131363263677e-06, + "loss": 0.5141, + "step": 3369 + }, + { + "epoch": 3.7695749440715884, + "grad_norm": 0.5862908959388733, + "learning_rate": 1.5709278094248093e-06, + "loss": 0.5391, + "step": 3370 + }, + { + "epoch": 3.7706935123042506, + "grad_norm": 0.5848096609115601, + "learning_rate": 1.5695428134389976e-06, + "loss": 0.5113, + "step": 3371 + }, + { + "epoch": 3.771812080536913, + "grad_norm": 0.6003323793411255, + "learning_rate": 1.5681581488622369e-06, + "loss": 0.5436, + "step": 3372 + }, + { + "epoch": 3.772930648769575, + "grad_norm": 0.601041316986084, + "learning_rate": 1.5667738161877165e-06, + "loss": 0.5462, + "step": 3373 + }, + { + "epoch": 3.7740492170022373, + "grad_norm": 0.5880104303359985, + "learning_rate": 1.5653898159085043e-06, + "loss": 0.5384, + "step": 3374 + }, + { + "epoch": 3.7751677852348995, + "grad_norm": 0.5900112390518188, + "learning_rate": 1.5640061485175543e-06, + "loss": 0.5306, + "step": 3375 + }, + { + "epoch": 3.7762863534675617, + "grad_norm": 0.5959150791168213, + "learning_rate": 1.5626228145076976e-06, + "loss": 0.554, + "step": 3376 + }, + { + "epoch": 3.7774049217002235, + "grad_norm": 0.5962737798690796, + "learning_rate": 1.5612398143716492e-06, + "loss": 0.5326, + "step": 3377 + }, + { + "epoch": 3.778523489932886, + "grad_norm": 0.5905041098594666, + "learning_rate": 1.5598571486020031e-06, + "loss": 0.5522, + "step": 3378 + }, + { + "epoch": 3.779642058165548, + "grad_norm": 0.6116909980773926, + "learning_rate": 1.5584748176912373e-06, + "loss": 0.558, + "step": 3379 + }, + { + "epoch": 3.7807606263982105, + "grad_norm": 0.6010723114013672, + "learning_rate": 1.5570928221317076e-06, + "loss": 0.5557, + "step": 3380 + }, + { + "epoch": 3.7818791946308723, + "grad_norm": 0.5796011090278625, + "learning_rate": 1.555711162415651e-06, + "loss": 0.5202, + "step": 3381 + }, + { + "epoch": 3.782997762863535, + "grad_norm": 0.598798930644989, + "learning_rate": 1.5543298390351864e-06, + "loss": 0.5276, + "step": 3382 + }, + { + "epoch": 3.7841163310961967, + "grad_norm": 0.5921764373779297, + "learning_rate": 1.5529488524823096e-06, + "loss": 0.5553, + "step": 3383 + }, + { + "epoch": 3.785234899328859, + "grad_norm": 0.594404399394989, + "learning_rate": 1.5515682032489018e-06, + "loss": 0.5475, + "step": 3384 + }, + { + "epoch": 3.786353467561521, + "grad_norm": 0.5731297731399536, + "learning_rate": 1.5501878918267186e-06, + "loss": 0.5128, + "step": 3385 + }, + { + "epoch": 3.7874720357941833, + "grad_norm": 0.5846068859100342, + "learning_rate": 1.5488079187073985e-06, + "loss": 0.5211, + "step": 3386 + }, + { + "epoch": 3.7885906040268456, + "grad_norm": 0.601236879825592, + "learning_rate": 1.5474282843824575e-06, + "loss": 0.533, + "step": 3387 + }, + { + "epoch": 3.7897091722595078, + "grad_norm": 0.6183696389198303, + "learning_rate": 1.546048989343294e-06, + "loss": 0.5664, + "step": 3388 + }, + { + "epoch": 3.79082774049217, + "grad_norm": 0.5790847539901733, + "learning_rate": 1.5446700340811815e-06, + "loss": 0.5183, + "step": 3389 + }, + { + "epoch": 3.791946308724832, + "grad_norm": 0.5779337286949158, + "learning_rate": 1.5432914190872757e-06, + "loss": 0.5193, + "step": 3390 + }, + { + "epoch": 3.7930648769574944, + "grad_norm": 0.6094821691513062, + "learning_rate": 1.5419131448526099e-06, + "loss": 0.5596, + "step": 3391 + }, + { + "epoch": 3.7941834451901566, + "grad_norm": 0.5740504860877991, + "learning_rate": 1.5405352118680953e-06, + "loss": 0.5077, + "step": 3392 + }, + { + "epoch": 3.795302013422819, + "grad_norm": 0.5931085348129272, + "learning_rate": 1.539157620624523e-06, + "loss": 0.5231, + "step": 3393 + }, + { + "epoch": 3.796420581655481, + "grad_norm": 0.5986872315406799, + "learning_rate": 1.5377803716125606e-06, + "loss": 0.5375, + "step": 3394 + }, + { + "epoch": 3.7975391498881432, + "grad_norm": 0.5834430456161499, + "learning_rate": 1.5364034653227564e-06, + "loss": 0.5032, + "step": 3395 + }, + { + "epoch": 3.7986577181208054, + "grad_norm": 0.5918521881103516, + "learning_rate": 1.5350269022455342e-06, + "loss": 0.5141, + "step": 3396 + }, + { + "epoch": 3.7997762863534676, + "grad_norm": 0.5843653082847595, + "learning_rate": 1.5336506828711972e-06, + "loss": 0.5151, + "step": 3397 + }, + { + "epoch": 3.80089485458613, + "grad_norm": 0.5750171542167664, + "learning_rate": 1.532274807689924e-06, + "loss": 0.5238, + "step": 3398 + }, + { + "epoch": 3.802013422818792, + "grad_norm": 0.6023558974266052, + "learning_rate": 1.5308992771917738e-06, + "loss": 0.5486, + "step": 3399 + }, + { + "epoch": 3.8031319910514543, + "grad_norm": 0.6220369338989258, + "learning_rate": 1.5295240918666805e-06, + "loss": 0.5348, + "step": 3400 + }, + { + "epoch": 3.8042505592841165, + "grad_norm": 0.5897588133811951, + "learning_rate": 1.5281492522044557e-06, + "loss": 0.5434, + "step": 3401 + }, + { + "epoch": 3.8053691275167782, + "grad_norm": 0.5919550657272339, + "learning_rate": 1.5267747586947888e-06, + "loss": 0.5445, + "step": 3402 + }, + { + "epoch": 3.806487695749441, + "grad_norm": 0.607524037361145, + "learning_rate": 1.5254006118272433e-06, + "loss": 0.533, + "step": 3403 + }, + { + "epoch": 3.8076062639821027, + "grad_norm": 0.5863617658615112, + "learning_rate": 1.5240268120912631e-06, + "loss": 0.5514, + "step": 3404 + }, + { + "epoch": 3.8087248322147653, + "grad_norm": 0.6031137704849243, + "learning_rate": 1.5226533599761656e-06, + "loss": 0.5297, + "step": 3405 + }, + { + "epoch": 3.809843400447427, + "grad_norm": 0.5941594839096069, + "learning_rate": 1.521280255971146e-06, + "loss": 0.5076, + "step": 3406 + }, + { + "epoch": 3.8109619686800897, + "grad_norm": 0.5763813257217407, + "learning_rate": 1.5199075005652725e-06, + "loss": 0.5325, + "step": 3407 + }, + { + "epoch": 3.8120805369127515, + "grad_norm": 0.5666414499282837, + "learning_rate": 1.5185350942474941e-06, + "loss": 0.5159, + "step": 3408 + }, + { + "epoch": 3.8131991051454137, + "grad_norm": 0.5939888954162598, + "learning_rate": 1.5171630375066309e-06, + "loss": 0.5355, + "step": 3409 + }, + { + "epoch": 3.814317673378076, + "grad_norm": 0.5711472034454346, + "learning_rate": 1.5157913308313815e-06, + "loss": 0.5019, + "step": 3410 + }, + { + "epoch": 3.815436241610738, + "grad_norm": 0.5917025208473206, + "learning_rate": 1.5144199747103173e-06, + "loss": 0.5398, + "step": 3411 + }, + { + "epoch": 3.8165548098434003, + "grad_norm": 0.5747054219245911, + "learning_rate": 1.5130489696318877e-06, + "loss": 0.5351, + "step": 3412 + }, + { + "epoch": 3.8176733780760626, + "grad_norm": 0.5856633186340332, + "learning_rate": 1.5116783160844144e-06, + "loss": 0.5234, + "step": 3413 + }, + { + "epoch": 3.8187919463087248, + "grad_norm": 0.5978276133537292, + "learning_rate": 1.5103080145560956e-06, + "loss": 0.5181, + "step": 3414 + }, + { + "epoch": 3.819910514541387, + "grad_norm": 0.5838809609413147, + "learning_rate": 1.5089380655350034e-06, + "loss": 0.5329, + "step": 3415 + }, + { + "epoch": 3.821029082774049, + "grad_norm": 0.5899587869644165, + "learning_rate": 1.5075684695090836e-06, + "loss": 0.5017, + "step": 3416 + }, + { + "epoch": 3.8221476510067114, + "grad_norm": 0.6024937033653259, + "learning_rate": 1.5061992269661597e-06, + "loss": 0.5566, + "step": 3417 + }, + { + "epoch": 3.8232662192393736, + "grad_norm": 0.6135915517807007, + "learning_rate": 1.504830338393923e-06, + "loss": 0.5485, + "step": 3418 + }, + { + "epoch": 3.824384787472036, + "grad_norm": 0.5840086936950684, + "learning_rate": 1.503461804279946e-06, + "loss": 0.5455, + "step": 3419 + }, + { + "epoch": 3.825503355704698, + "grad_norm": 0.5929518342018127, + "learning_rate": 1.502093625111669e-06, + "loss": 0.5364, + "step": 3420 + }, + { + "epoch": 3.8266219239373602, + "grad_norm": 0.6134499907493591, + "learning_rate": 1.5007258013764104e-06, + "loss": 0.5575, + "step": 3421 + }, + { + "epoch": 3.8277404921700224, + "grad_norm": 0.585550844669342, + "learning_rate": 1.499358333561358e-06, + "loss": 0.5152, + "step": 3422 + }, + { + "epoch": 3.8288590604026846, + "grad_norm": 0.5884019732475281, + "learning_rate": 1.4979912221535754e-06, + "loss": 0.5302, + "step": 3423 + }, + { + "epoch": 3.829977628635347, + "grad_norm": 0.6088367700576782, + "learning_rate": 1.4966244676399994e-06, + "loss": 0.5314, + "step": 3424 + }, + { + "epoch": 3.831096196868009, + "grad_norm": 0.5937127470970154, + "learning_rate": 1.4952580705074376e-06, + "loss": 0.5489, + "step": 3425 + }, + { + "epoch": 3.8322147651006713, + "grad_norm": 0.5937302708625793, + "learning_rate": 1.4938920312425724e-06, + "loss": 0.5471, + "step": 3426 + }, + { + "epoch": 3.8333333333333335, + "grad_norm": 0.5966449975967407, + "learning_rate": 1.492526350331957e-06, + "loss": 0.5269, + "step": 3427 + }, + { + "epoch": 3.8344519015659957, + "grad_norm": 0.5803402662277222, + "learning_rate": 1.4911610282620198e-06, + "loss": 0.5335, + "step": 3428 + }, + { + "epoch": 3.8355704697986575, + "grad_norm": 0.583142101764679, + "learning_rate": 1.4897960655190575e-06, + "loss": 0.5084, + "step": 3429 + }, + { + "epoch": 3.83668903803132, + "grad_norm": 0.6154366731643677, + "learning_rate": 1.488431462589242e-06, + "loss": 0.5517, + "step": 3430 + }, + { + "epoch": 3.837807606263982, + "grad_norm": 0.5960404276847839, + "learning_rate": 1.4870672199586144e-06, + "loss": 0.566, + "step": 3431 + }, + { + "epoch": 3.8389261744966445, + "grad_norm": 0.5882837772369385, + "learning_rate": 1.4857033381130912e-06, + "loss": 0.536, + "step": 3432 + }, + { + "epoch": 3.8400447427293063, + "grad_norm": 0.6045817136764526, + "learning_rate": 1.4843398175384563e-06, + "loss": 0.5544, + "step": 3433 + }, + { + "epoch": 3.841163310961969, + "grad_norm": 0.5830772519111633, + "learning_rate": 1.482976658720367e-06, + "loss": 0.5321, + "step": 3434 + }, + { + "epoch": 3.8422818791946307, + "grad_norm": 0.5983092784881592, + "learning_rate": 1.4816138621443518e-06, + "loss": 0.5394, + "step": 3435 + }, + { + "epoch": 3.843400447427293, + "grad_norm": 0.6113800406455994, + "learning_rate": 1.480251428295809e-06, + "loss": 0.5438, + "step": 3436 + }, + { + "epoch": 3.844519015659955, + "grad_norm": 0.5998823046684265, + "learning_rate": 1.47888935766001e-06, + "loss": 0.5449, + "step": 3437 + }, + { + "epoch": 3.8456375838926173, + "grad_norm": 0.5815863013267517, + "learning_rate": 1.4775276507220943e-06, + "loss": 0.525, + "step": 3438 + }, + { + "epoch": 3.8467561521252795, + "grad_norm": 0.5928709506988525, + "learning_rate": 1.4761663079670734e-06, + "loss": 0.535, + "step": 3439 + }, + { + "epoch": 3.8478747203579418, + "grad_norm": 0.5827286243438721, + "learning_rate": 1.4748053298798275e-06, + "loss": 0.5566, + "step": 3440 + }, + { + "epoch": 3.848993288590604, + "grad_norm": 0.5852411985397339, + "learning_rate": 1.473444716945111e-06, + "loss": 0.5414, + "step": 3441 + }, + { + "epoch": 3.850111856823266, + "grad_norm": 0.581245481967926, + "learning_rate": 1.4720844696475412e-06, + "loss": 0.534, + "step": 3442 + }, + { + "epoch": 3.8512304250559284, + "grad_norm": 0.5751507878303528, + "learning_rate": 1.4707245884716127e-06, + "loss": 0.5257, + "step": 3443 + }, + { + "epoch": 3.8523489932885906, + "grad_norm": 0.5914750695228577, + "learning_rate": 1.4693650739016847e-06, + "loss": 0.5368, + "step": 3444 + }, + { + "epoch": 3.853467561521253, + "grad_norm": 0.5936532616615295, + "learning_rate": 1.468005926421987e-06, + "loss": 0.5453, + "step": 3445 + }, + { + "epoch": 3.854586129753915, + "grad_norm": 0.5868202447891235, + "learning_rate": 1.4666471465166202e-06, + "loss": 0.5324, + "step": 3446 + }, + { + "epoch": 3.8557046979865772, + "grad_norm": 0.5847600698471069, + "learning_rate": 1.465288734669551e-06, + "loss": 0.5209, + "step": 3447 + }, + { + "epoch": 3.8568232662192394, + "grad_norm": 0.5762224793434143, + "learning_rate": 1.463930691364619e-06, + "loss": 0.5119, + "step": 3448 + }, + { + "epoch": 3.8579418344519016, + "grad_norm": 0.5749316811561584, + "learning_rate": 1.4625730170855286e-06, + "loss": 0.5337, + "step": 3449 + }, + { + "epoch": 3.859060402684564, + "grad_norm": 0.5961950421333313, + "learning_rate": 1.4612157123158557e-06, + "loss": 0.5417, + "step": 3450 + }, + { + "epoch": 3.860178970917226, + "grad_norm": 0.6332731246948242, + "learning_rate": 1.4598587775390419e-06, + "loss": 0.5502, + "step": 3451 + }, + { + "epoch": 3.8612975391498883, + "grad_norm": 0.5867600440979004, + "learning_rate": 1.4585022132384008e-06, + "loss": 0.5192, + "step": 3452 + }, + { + "epoch": 3.8624161073825505, + "grad_norm": 0.6151018142700195, + "learning_rate": 1.4571460198971089e-06, + "loss": 0.5481, + "step": 3453 + }, + { + "epoch": 3.8635346756152127, + "grad_norm": 0.6162075996398926, + "learning_rate": 1.4557901979982158e-06, + "loss": 0.5623, + "step": 3454 + }, + { + "epoch": 3.864653243847875, + "grad_norm": 0.6140363812446594, + "learning_rate": 1.4544347480246358e-06, + "loss": 0.531, + "step": 3455 + }, + { + "epoch": 3.8657718120805367, + "grad_norm": 0.6078040599822998, + "learning_rate": 1.4530796704591493e-06, + "loss": 0.5374, + "step": 3456 + }, + { + "epoch": 3.8668903803131993, + "grad_norm": 0.6000497937202454, + "learning_rate": 1.4517249657844095e-06, + "loss": 0.5687, + "step": 3457 + }, + { + "epoch": 3.868008948545861, + "grad_norm": 0.5983914136886597, + "learning_rate": 1.4503706344829304e-06, + "loss": 0.5095, + "step": 3458 + }, + { + "epoch": 3.8691275167785237, + "grad_norm": 0.5994406342506409, + "learning_rate": 1.4490166770370984e-06, + "loss": 0.5261, + "step": 3459 + }, + { + "epoch": 3.8702460850111855, + "grad_norm": 0.5991759896278381, + "learning_rate": 1.4476630939291631e-06, + "loss": 0.5312, + "step": 3460 + }, + { + "epoch": 3.8713646532438477, + "grad_norm": 0.6003211140632629, + "learning_rate": 1.4463098856412418e-06, + "loss": 0.547, + "step": 3461 + }, + { + "epoch": 3.87248322147651, + "grad_norm": 0.5779590606689453, + "learning_rate": 1.4449570526553183e-06, + "loss": 0.54, + "step": 3462 + }, + { + "epoch": 3.873601789709172, + "grad_norm": 0.5849114656448364, + "learning_rate": 1.4436045954532442e-06, + "loss": 0.5216, + "step": 3463 + }, + { + "epoch": 3.8747203579418343, + "grad_norm": 0.5999607443809509, + "learning_rate": 1.4422525145167344e-06, + "loss": 0.5514, + "step": 3464 + }, + { + "epoch": 3.8758389261744965, + "grad_norm": 0.5782378315925598, + "learning_rate": 1.440900810327373e-06, + "loss": 0.5294, + "step": 3465 + }, + { + "epoch": 3.8769574944071588, + "grad_norm": 0.5999947786331177, + "learning_rate": 1.4395494833666075e-06, + "loss": 0.5543, + "step": 3466 + }, + { + "epoch": 3.878076062639821, + "grad_norm": 0.6043230891227722, + "learning_rate": 1.4381985341157517e-06, + "loss": 0.5473, + "step": 3467 + }, + { + "epoch": 3.879194630872483, + "grad_norm": 0.6058630347251892, + "learning_rate": 1.436847963055985e-06, + "loss": 0.5517, + "step": 3468 + }, + { + "epoch": 3.8803131991051454, + "grad_norm": 0.5847921371459961, + "learning_rate": 1.4354977706683516e-06, + "loss": 0.5265, + "step": 3469 + }, + { + "epoch": 3.8814317673378076, + "grad_norm": 0.5842016339302063, + "learning_rate": 1.4341479574337621e-06, + "loss": 0.5174, + "step": 3470 + }, + { + "epoch": 3.88255033557047, + "grad_norm": 0.5968067646026611, + "learning_rate": 1.4327985238329903e-06, + "loss": 0.5313, + "step": 3471 + }, + { + "epoch": 3.883668903803132, + "grad_norm": 0.6129271388053894, + "learning_rate": 1.4314494703466775e-06, + "loss": 0.5641, + "step": 3472 + }, + { + "epoch": 3.884787472035794, + "grad_norm": 0.6046187281608582, + "learning_rate": 1.4301007974553254e-06, + "loss": 0.5176, + "step": 3473 + }, + { + "epoch": 3.8859060402684564, + "grad_norm": 0.5914602279663086, + "learning_rate": 1.428752505639305e-06, + "loss": 0.5381, + "step": 3474 + }, + { + "epoch": 3.8870246085011186, + "grad_norm": 0.6040394902229309, + "learning_rate": 1.4274045953788485e-06, + "loss": 0.4835, + "step": 3475 + }, + { + "epoch": 3.888143176733781, + "grad_norm": 0.6227742433547974, + "learning_rate": 1.426057067154052e-06, + "loss": 0.5482, + "step": 3476 + }, + { + "epoch": 3.889261744966443, + "grad_norm": 0.6004636287689209, + "learning_rate": 1.4247099214448767e-06, + "loss": 0.5329, + "step": 3477 + }, + { + "epoch": 3.8903803131991053, + "grad_norm": 0.5947349667549133, + "learning_rate": 1.423363158731147e-06, + "loss": 0.5271, + "step": 3478 + }, + { + "epoch": 3.8914988814317675, + "grad_norm": 0.6152440309524536, + "learning_rate": 1.4220167794925519e-06, + "loss": 0.5443, + "step": 3479 + }, + { + "epoch": 3.8926174496644297, + "grad_norm": 0.5924034714698792, + "learning_rate": 1.4206707842086417e-06, + "loss": 0.5238, + "step": 3480 + }, + { + "epoch": 3.8937360178970915, + "grad_norm": 0.5782044529914856, + "learning_rate": 1.419325173358833e-06, + "loss": 0.4965, + "step": 3481 + }, + { + "epoch": 3.894854586129754, + "grad_norm": 0.5839013457298279, + "learning_rate": 1.4179799474224026e-06, + "loss": 0.5267, + "step": 3482 + }, + { + "epoch": 3.895973154362416, + "grad_norm": 0.6051729321479797, + "learning_rate": 1.4166351068784922e-06, + "loss": 0.5418, + "step": 3483 + }, + { + "epoch": 3.8970917225950785, + "grad_norm": 0.596942126750946, + "learning_rate": 1.415290652206105e-06, + "loss": 0.5216, + "step": 3484 + }, + { + "epoch": 3.8982102908277403, + "grad_norm": 0.5890728831291199, + "learning_rate": 1.4139465838841072e-06, + "loss": 0.5426, + "step": 3485 + }, + { + "epoch": 3.899328859060403, + "grad_norm": 0.5977894067764282, + "learning_rate": 1.412602902391227e-06, + "loss": 0.5431, + "step": 3486 + }, + { + "epoch": 3.9004474272930647, + "grad_norm": 0.5922943353652954, + "learning_rate": 1.4112596082060543e-06, + "loss": 0.5341, + "step": 3487 + }, + { + "epoch": 3.901565995525727, + "grad_norm": 0.6063907146453857, + "learning_rate": 1.4099167018070436e-06, + "loss": 0.5531, + "step": 3488 + }, + { + "epoch": 3.902684563758389, + "grad_norm": 0.598052978515625, + "learning_rate": 1.4085741836725086e-06, + "loss": 0.5621, + "step": 3489 + }, + { + "epoch": 3.9038031319910513, + "grad_norm": 0.6031754612922668, + "learning_rate": 1.4072320542806267e-06, + "loss": 0.5219, + "step": 3490 + }, + { + "epoch": 3.9049217002237135, + "grad_norm": 0.6235234141349792, + "learning_rate": 1.405890314109434e-06, + "loss": 0.5811, + "step": 3491 + }, + { + "epoch": 3.9060402684563758, + "grad_norm": 0.5925267338752747, + "learning_rate": 1.4045489636368332e-06, + "loss": 0.5363, + "step": 3492 + }, + { + "epoch": 3.907158836689038, + "grad_norm": 0.603705644607544, + "learning_rate": 1.4032080033405804e-06, + "loss": 0.5414, + "step": 3493 + }, + { + "epoch": 3.9082774049217, + "grad_norm": 0.5979185700416565, + "learning_rate": 1.4018674336983e-06, + "loss": 0.5603, + "step": 3494 + }, + { + "epoch": 3.9093959731543624, + "grad_norm": 0.5962160229682922, + "learning_rate": 1.4005272551874732e-06, + "loss": 0.5587, + "step": 3495 + }, + { + "epoch": 3.9105145413870246, + "grad_norm": 0.5903746485710144, + "learning_rate": 1.3991874682854441e-06, + "loss": 0.5079, + "step": 3496 + }, + { + "epoch": 3.911633109619687, + "grad_norm": 0.5926321148872375, + "learning_rate": 1.3978480734694161e-06, + "loss": 0.5291, + "step": 3497 + }, + { + "epoch": 3.912751677852349, + "grad_norm": 0.5840973258018494, + "learning_rate": 1.3965090712164515e-06, + "loss": 0.5154, + "step": 3498 + }, + { + "epoch": 3.913870246085011, + "grad_norm": 0.6063098311424255, + "learning_rate": 1.3951704620034764e-06, + "loss": 0.5397, + "step": 3499 + }, + { + "epoch": 3.9149888143176734, + "grad_norm": 0.5906224846839905, + "learning_rate": 1.3938322463072745e-06, + "loss": 0.5608, + "step": 3500 + }, + { + "epoch": 3.9161073825503356, + "grad_norm": 0.5830900073051453, + "learning_rate": 1.3924944246044892e-06, + "loss": 0.5026, + "step": 3501 + }, + { + "epoch": 3.917225950782998, + "grad_norm": 0.5893346071243286, + "learning_rate": 1.3911569973716235e-06, + "loss": 0.5421, + "step": 3502 + }, + { + "epoch": 3.91834451901566, + "grad_norm": 0.5733920335769653, + "learning_rate": 1.3898199650850424e-06, + "loss": 0.5296, + "step": 3503 + }, + { + "epoch": 3.9194630872483223, + "grad_norm": 0.5772490501403809, + "learning_rate": 1.3884833282209663e-06, + "loss": 0.5489, + "step": 3504 + }, + { + "epoch": 3.9205816554809845, + "grad_norm": 0.6066833138465881, + "learning_rate": 1.3871470872554787e-06, + "loss": 0.5571, + "step": 3505 + }, + { + "epoch": 3.9217002237136467, + "grad_norm": 0.5913258790969849, + "learning_rate": 1.3858112426645184e-06, + "loss": 0.548, + "step": 3506 + }, + { + "epoch": 3.922818791946309, + "grad_norm": 0.5925204753875732, + "learning_rate": 1.384475794923887e-06, + "loss": 0.5418, + "step": 3507 + }, + { + "epoch": 3.9239373601789707, + "grad_norm": 0.6057487726211548, + "learning_rate": 1.383140744509241e-06, + "loss": 0.5546, + "step": 3508 + }, + { + "epoch": 3.9250559284116333, + "grad_norm": 0.6077185273170471, + "learning_rate": 1.3818060918960973e-06, + "loss": 0.5644, + "step": 3509 + }, + { + "epoch": 3.926174496644295, + "grad_norm": 0.6041961908340454, + "learning_rate": 1.380471837559831e-06, + "loss": 0.5402, + "step": 3510 + }, + { + "epoch": 3.9272930648769577, + "grad_norm": 0.604128897190094, + "learning_rate": 1.3791379819756734e-06, + "loss": 0.558, + "step": 3511 + }, + { + "epoch": 3.9284116331096195, + "grad_norm": 0.5926300883293152, + "learning_rate": 1.3778045256187181e-06, + "loss": 0.5137, + "step": 3512 + }, + { + "epoch": 3.929530201342282, + "grad_norm": 0.5913637280464172, + "learning_rate": 1.3764714689639116e-06, + "loss": 0.5442, + "step": 3513 + }, + { + "epoch": 3.930648769574944, + "grad_norm": 0.5825512409210205, + "learning_rate": 1.3751388124860626e-06, + "loss": 0.5341, + "step": 3514 + }, + { + "epoch": 3.931767337807606, + "grad_norm": 0.5853479504585266, + "learning_rate": 1.3738065566598327e-06, + "loss": 0.5567, + "step": 3515 + }, + { + "epoch": 3.9328859060402683, + "grad_norm": 0.6059838533401489, + "learning_rate": 1.372474701959745e-06, + "loss": 0.5397, + "step": 3516 + }, + { + "epoch": 3.9340044742729305, + "grad_norm": 0.5948166847229004, + "learning_rate": 1.3711432488601773e-06, + "loss": 0.5505, + "step": 3517 + }, + { + "epoch": 3.9351230425055927, + "grad_norm": 0.5900046825408936, + "learning_rate": 1.3698121978353645e-06, + "loss": 0.5523, + "step": 3518 + }, + { + "epoch": 3.936241610738255, + "grad_norm": 0.6191303730010986, + "learning_rate": 1.3684815493593986e-06, + "loss": 0.5568, + "step": 3519 + }, + { + "epoch": 3.937360178970917, + "grad_norm": 0.5993981957435608, + "learning_rate": 1.3671513039062273e-06, + "loss": 0.5225, + "step": 3520 + }, + { + "epoch": 3.9384787472035794, + "grad_norm": 0.6063231229782104, + "learning_rate": 1.365821461949658e-06, + "loss": 0.5469, + "step": 3521 + }, + { + "epoch": 3.9395973154362416, + "grad_norm": 0.5932417511940002, + "learning_rate": 1.3644920239633496e-06, + "loss": 0.5235, + "step": 3522 + }, + { + "epoch": 3.940715883668904, + "grad_norm": 0.5976962447166443, + "learning_rate": 1.3631629904208222e-06, + "loss": 0.5509, + "step": 3523 + }, + { + "epoch": 3.941834451901566, + "grad_norm": 0.595431923866272, + "learning_rate": 1.3618343617954476e-06, + "loss": 0.5274, + "step": 3524 + }, + { + "epoch": 3.942953020134228, + "grad_norm": 0.5923426747322083, + "learning_rate": 1.3605061385604556e-06, + "loss": 0.542, + "step": 3525 + }, + { + "epoch": 3.9440715883668904, + "grad_norm": 0.5865135788917542, + "learning_rate": 1.3591783211889304e-06, + "loss": 0.5209, + "step": 3526 + }, + { + "epoch": 3.9451901565995526, + "grad_norm": 0.5761022567749023, + "learning_rate": 1.357850910153813e-06, + "loss": 0.535, + "step": 3527 + }, + { + "epoch": 3.946308724832215, + "grad_norm": 0.615920901298523, + "learning_rate": 1.3565239059278992e-06, + "loss": 0.5498, + "step": 3528 + }, + { + "epoch": 3.947427293064877, + "grad_norm": 0.5963379144668579, + "learning_rate": 1.3551973089838382e-06, + "loss": 0.5357, + "step": 3529 + }, + { + "epoch": 3.9485458612975393, + "grad_norm": 0.5812051892280579, + "learning_rate": 1.3538711197941372e-06, + "loss": 0.5275, + "step": 3530 + }, + { + "epoch": 3.9496644295302015, + "grad_norm": 0.593917965888977, + "learning_rate": 1.3525453388311554e-06, + "loss": 0.5339, + "step": 3531 + }, + { + "epoch": 3.9507829977628637, + "grad_norm": 0.5994927883148193, + "learning_rate": 1.3512199665671094e-06, + "loss": 0.5236, + "step": 3532 + }, + { + "epoch": 3.951901565995526, + "grad_norm": 0.5868715643882751, + "learning_rate": 1.3498950034740676e-06, + "loss": 0.5712, + "step": 3533 + }, + { + "epoch": 3.953020134228188, + "grad_norm": 0.5731030702590942, + "learning_rate": 1.3485704500239533e-06, + "loss": 0.5167, + "step": 3534 + }, + { + "epoch": 3.95413870246085, + "grad_norm": 0.5860263705253601, + "learning_rate": 1.3472463066885438e-06, + "loss": 0.526, + "step": 3535 + }, + { + "epoch": 3.9552572706935125, + "grad_norm": 0.6025899052619934, + "learning_rate": 1.3459225739394728e-06, + "loss": 0.5392, + "step": 3536 + }, + { + "epoch": 3.9563758389261743, + "grad_norm": 0.5841599702835083, + "learning_rate": 1.3445992522482233e-06, + "loss": 0.5157, + "step": 3537 + }, + { + "epoch": 3.957494407158837, + "grad_norm": 0.5884062647819519, + "learning_rate": 1.3432763420861361e-06, + "loss": 0.5414, + "step": 3538 + }, + { + "epoch": 3.9586129753914987, + "grad_norm": 0.5740889310836792, + "learning_rate": 1.3419538439244035e-06, + "loss": 0.5136, + "step": 3539 + }, + { + "epoch": 3.959731543624161, + "grad_norm": 0.5774790048599243, + "learning_rate": 1.3406317582340694e-06, + "loss": 0.526, + "step": 3540 + }, + { + "epoch": 3.960850111856823, + "grad_norm": 0.5890363454818726, + "learning_rate": 1.3393100854860363e-06, + "loss": 0.5339, + "step": 3541 + }, + { + "epoch": 3.9619686800894853, + "grad_norm": 0.584784209728241, + "learning_rate": 1.3379888261510519e-06, + "loss": 0.5167, + "step": 3542 + }, + { + "epoch": 3.9630872483221475, + "grad_norm": 0.5787740349769592, + "learning_rate": 1.3366679806997228e-06, + "loss": 0.5248, + "step": 3543 + }, + { + "epoch": 3.9642058165548097, + "grad_norm": 0.6138972640037537, + "learning_rate": 1.3353475496025049e-06, + "loss": 0.5526, + "step": 3544 + }, + { + "epoch": 3.965324384787472, + "grad_norm": 0.6084880232810974, + "learning_rate": 1.3340275333297091e-06, + "loss": 0.5553, + "step": 3545 + }, + { + "epoch": 3.966442953020134, + "grad_norm": 0.6087905168533325, + "learning_rate": 1.3327079323514957e-06, + "loss": 0.5363, + "step": 3546 + }, + { + "epoch": 3.9675615212527964, + "grad_norm": 0.593187153339386, + "learning_rate": 1.33138874713788e-06, + "loss": 0.5307, + "step": 3547 + }, + { + "epoch": 3.9686800894854586, + "grad_norm": 0.5979828238487244, + "learning_rate": 1.3300699781587267e-06, + "loss": 0.5366, + "step": 3548 + }, + { + "epoch": 3.969798657718121, + "grad_norm": 0.6004894375801086, + "learning_rate": 1.3287516258837536e-06, + "loss": 0.5603, + "step": 3549 + }, + { + "epoch": 3.970917225950783, + "grad_norm": 0.5919094681739807, + "learning_rate": 1.327433690782529e-06, + "loss": 0.5232, + "step": 3550 + }, + { + "epoch": 3.972035794183445, + "grad_norm": 0.6052770018577576, + "learning_rate": 1.3261161733244738e-06, + "loss": 0.5254, + "step": 3551 + }, + { + "epoch": 3.9731543624161074, + "grad_norm": 0.6192354559898376, + "learning_rate": 1.3247990739788602e-06, + "loss": 0.5604, + "step": 3552 + }, + { + "epoch": 3.9742729306487696, + "grad_norm": 0.5797891616821289, + "learning_rate": 1.3234823932148093e-06, + "loss": 0.5238, + "step": 3553 + }, + { + "epoch": 3.975391498881432, + "grad_norm": 0.5921968221664429, + "learning_rate": 1.3221661315012973e-06, + "loss": 0.5288, + "step": 3554 + }, + { + "epoch": 3.976510067114094, + "grad_norm": 0.5861966609954834, + "learning_rate": 1.3208502893071461e-06, + "loss": 0.5243, + "step": 3555 + }, + { + "epoch": 3.9776286353467563, + "grad_norm": 0.5871561765670776, + "learning_rate": 1.3195348671010332e-06, + "loss": 0.5435, + "step": 3556 + }, + { + "epoch": 3.9787472035794185, + "grad_norm": 0.5934867858886719, + "learning_rate": 1.3182198653514828e-06, + "loss": 0.522, + "step": 3557 + }, + { + "epoch": 3.9798657718120807, + "grad_norm": 0.5910374522209167, + "learning_rate": 1.3169052845268701e-06, + "loss": 0.5231, + "step": 3558 + }, + { + "epoch": 3.980984340044743, + "grad_norm": 0.5925207734107971, + "learning_rate": 1.31559112509542e-06, + "loss": 0.5193, + "step": 3559 + }, + { + "epoch": 3.9821029082774047, + "grad_norm": 0.6060652732849121, + "learning_rate": 1.3142773875252107e-06, + "loss": 0.5538, + "step": 3560 + }, + { + "epoch": 3.9832214765100673, + "grad_norm": 0.6055449843406677, + "learning_rate": 1.312964072284166e-06, + "loss": 0.5456, + "step": 3561 + }, + { + "epoch": 3.984340044742729, + "grad_norm": 0.6018159985542297, + "learning_rate": 1.3116511798400599e-06, + "loss": 0.5299, + "step": 3562 + }, + { + "epoch": 3.9854586129753917, + "grad_norm": 0.6123597025871277, + "learning_rate": 1.3103387106605193e-06, + "loss": 0.5276, + "step": 3563 + }, + { + "epoch": 3.9865771812080535, + "grad_norm": 0.5759068131446838, + "learning_rate": 1.3090266652130145e-06, + "loss": 0.5055, + "step": 3564 + }, + { + "epoch": 3.987695749440716, + "grad_norm": 0.5802836418151855, + "learning_rate": 1.3077150439648718e-06, + "loss": 0.5487, + "step": 3565 + }, + { + "epoch": 3.988814317673378, + "grad_norm": 0.5954502820968628, + "learning_rate": 1.3064038473832608e-06, + "loss": 0.5464, + "step": 3566 + }, + { + "epoch": 3.98993288590604, + "grad_norm": 0.5886498689651489, + "learning_rate": 1.3050930759352019e-06, + "loss": 0.5513, + "step": 3567 + }, + { + "epoch": 3.9910514541387023, + "grad_norm": 0.5818016529083252, + "learning_rate": 1.303782730087563e-06, + "loss": 0.54, + "step": 3568 + }, + { + "epoch": 3.9921700223713645, + "grad_norm": 0.59755939245224, + "learning_rate": 1.3024728103070635e-06, + "loss": 0.536, + "step": 3569 + }, + { + "epoch": 3.9932885906040267, + "grad_norm": 0.594585120677948, + "learning_rate": 1.3011633170602672e-06, + "loss": 0.5435, + "step": 3570 + }, + { + "epoch": 3.994407158836689, + "grad_norm": 0.6277769804000854, + "learning_rate": 1.2998542508135894e-06, + "loss": 0.5543, + "step": 3571 + }, + { + "epoch": 3.995525727069351, + "grad_norm": 0.5899461507797241, + "learning_rate": 1.2985456120332907e-06, + "loss": 0.541, + "step": 3572 + }, + { + "epoch": 3.9966442953020134, + "grad_norm": 0.5964384078979492, + "learning_rate": 1.2972374011854804e-06, + "loss": 0.5262, + "step": 3573 + }, + { + "epoch": 3.9977628635346756, + "grad_norm": 0.616132378578186, + "learning_rate": 1.295929618736116e-06, + "loss": 0.5648, + "step": 3574 + }, + { + "epoch": 3.998881431767338, + "grad_norm": 0.5753785967826843, + "learning_rate": 1.2946222651509995e-06, + "loss": 0.497, + "step": 3575 + }, + { + "epoch": 4.0, + "grad_norm": 0.5872046947479248, + "learning_rate": 1.2933153408957855e-06, + "loss": 0.5146, + "step": 3576 + }, + { + "epoch": 4.001118568232662, + "grad_norm": 0.5730710029602051, + "learning_rate": 1.2920088464359697e-06, + "loss": 0.5077, + "step": 3577 + }, + { + "epoch": 4.002237136465324, + "grad_norm": 0.59812992811203, + "learning_rate": 1.2907027822369006e-06, + "loss": 0.5366, + "step": 3578 + }, + { + "epoch": 4.003355704697986, + "grad_norm": 0.6164752244949341, + "learning_rate": 1.2893971487637683e-06, + "loss": 0.5347, + "step": 3579 + }, + { + "epoch": 4.004474272930649, + "grad_norm": 0.6085970997810364, + "learning_rate": 1.288091946481613e-06, + "loss": 0.531, + "step": 3580 + }, + { + "epoch": 4.005592841163311, + "grad_norm": 0.5614831447601318, + "learning_rate": 1.2867871758553193e-06, + "loss": 0.4996, + "step": 3581 + }, + { + "epoch": 4.006711409395973, + "grad_norm": 0.5921034216880798, + "learning_rate": 1.2854828373496191e-06, + "loss": 0.5411, + "step": 3582 + }, + { + "epoch": 4.007829977628635, + "grad_norm": 0.5978662967681885, + "learning_rate": 1.2841789314290896e-06, + "loss": 0.5416, + "step": 3583 + }, + { + "epoch": 4.008948545861298, + "grad_norm": 0.5928497314453125, + "learning_rate": 1.2828754585581532e-06, + "loss": 0.5068, + "step": 3584 + }, + { + "epoch": 4.010067114093959, + "grad_norm": 0.5935963988304138, + "learning_rate": 1.281572419201082e-06, + "loss": 0.5025, + "step": 3585 + }, + { + "epoch": 4.011185682326622, + "grad_norm": 0.5959998369216919, + "learning_rate": 1.2802698138219882e-06, + "loss": 0.5647, + "step": 3586 + }, + { + "epoch": 4.012304250559284, + "grad_norm": 0.609431803226471, + "learning_rate": 1.278967642884834e-06, + "loss": 0.552, + "step": 3587 + }, + { + "epoch": 4.0134228187919465, + "grad_norm": 0.6250160932540894, + "learning_rate": 1.2776659068534237e-06, + "loss": 0.5274, + "step": 3588 + }, + { + "epoch": 4.014541387024608, + "grad_norm": 0.6044382452964783, + "learning_rate": 1.2763646061914092e-06, + "loss": 0.5169, + "step": 3589 + }, + { + "epoch": 4.015659955257271, + "grad_norm": 0.6135263442993164, + "learning_rate": 1.2750637413622857e-06, + "loss": 0.5369, + "step": 3590 + }, + { + "epoch": 4.016778523489933, + "grad_norm": 0.5983233451843262, + "learning_rate": 1.273763312829393e-06, + "loss": 0.5073, + "step": 3591 + }, + { + "epoch": 4.017897091722595, + "grad_norm": 0.6208228468894958, + "learning_rate": 1.2724633210559168e-06, + "loss": 0.5536, + "step": 3592 + }, + { + "epoch": 4.019015659955257, + "grad_norm": 0.6319078207015991, + "learning_rate": 1.2711637665048849e-06, + "loss": 0.5677, + "step": 3593 + }, + { + "epoch": 4.02013422818792, + "grad_norm": 0.6272683143615723, + "learning_rate": 1.269864649639173e-06, + "loss": 0.5413, + "step": 3594 + }, + { + "epoch": 4.0212527964205815, + "grad_norm": 0.6157651543617249, + "learning_rate": 1.2685659709214975e-06, + "loss": 0.5412, + "step": 3595 + }, + { + "epoch": 4.022371364653244, + "grad_norm": 0.5957029461860657, + "learning_rate": 1.2672677308144213e-06, + "loss": 0.5246, + "step": 3596 + }, + { + "epoch": 4.023489932885906, + "grad_norm": 0.5967620611190796, + "learning_rate": 1.2659699297803495e-06, + "loss": 0.5512, + "step": 3597 + }, + { + "epoch": 4.024608501118569, + "grad_norm": 0.6149760484695435, + "learning_rate": 1.2646725682815308e-06, + "loss": 0.5385, + "step": 3598 + }, + { + "epoch": 4.02572706935123, + "grad_norm": 0.6089226603507996, + "learning_rate": 1.2633756467800572e-06, + "loss": 0.5204, + "step": 3599 + }, + { + "epoch": 4.026845637583893, + "grad_norm": 0.6314104795455933, + "learning_rate": 1.2620791657378664e-06, + "loss": 0.5433, + "step": 3600 + }, + { + "epoch": 4.027964205816555, + "grad_norm": 0.5837777256965637, + "learning_rate": 1.2607831256167352e-06, + "loss": 0.5048, + "step": 3601 + }, + { + "epoch": 4.029082774049217, + "grad_norm": 0.5902408957481384, + "learning_rate": 1.2594875268782874e-06, + "loss": 0.5177, + "step": 3602 + }, + { + "epoch": 4.030201342281879, + "grad_norm": 0.617569088935852, + "learning_rate": 1.258192369983987e-06, + "loss": 0.5232, + "step": 3603 + }, + { + "epoch": 4.031319910514541, + "grad_norm": 0.6025773286819458, + "learning_rate": 1.2568976553951407e-06, + "loss": 0.5086, + "step": 3604 + }, + { + "epoch": 4.032438478747204, + "grad_norm": 0.6023662686347961, + "learning_rate": 1.2556033835728992e-06, + "loss": 0.5546, + "step": 3605 + }, + { + "epoch": 4.033557046979865, + "grad_norm": 0.6131324172019958, + "learning_rate": 1.2543095549782542e-06, + "loss": 0.5273, + "step": 3606 + }, + { + "epoch": 4.034675615212528, + "grad_norm": 0.6119228005409241, + "learning_rate": 1.25301617007204e-06, + "loss": 0.5479, + "step": 3607 + }, + { + "epoch": 4.03579418344519, + "grad_norm": 0.6121472716331482, + "learning_rate": 1.251723229314932e-06, + "loss": 0.5105, + "step": 3608 + }, + { + "epoch": 4.0369127516778525, + "grad_norm": 0.5955631732940674, + "learning_rate": 1.2504307331674499e-06, + "loss": 0.5288, + "step": 3609 + }, + { + "epoch": 4.038031319910514, + "grad_norm": 0.6060401797294617, + "learning_rate": 1.2491386820899508e-06, + "loss": 0.5216, + "step": 3610 + }, + { + "epoch": 4.039149888143177, + "grad_norm": 0.5963885188102722, + "learning_rate": 1.2478470765426383e-06, + "loss": 0.526, + "step": 3611 + }, + { + "epoch": 4.040268456375839, + "grad_norm": 0.6056948304176331, + "learning_rate": 1.2465559169855533e-06, + "loss": 0.5339, + "step": 3612 + }, + { + "epoch": 4.041387024608501, + "grad_norm": 0.6130921840667725, + "learning_rate": 1.2452652038785805e-06, + "loss": 0.5409, + "step": 3613 + }, + { + "epoch": 4.042505592841163, + "grad_norm": 0.6113458275794983, + "learning_rate": 1.2439749376814443e-06, + "loss": 0.5099, + "step": 3614 + }, + { + "epoch": 4.043624161073826, + "grad_norm": 0.6096323132514954, + "learning_rate": 1.2426851188537093e-06, + "loss": 0.5365, + "step": 3615 + }, + { + "epoch": 4.0447427293064875, + "grad_norm": 0.6119310855865479, + "learning_rate": 1.2413957478547816e-06, + "loss": 0.5152, + "step": 3616 + }, + { + "epoch": 4.04586129753915, + "grad_norm": 0.6114838719367981, + "learning_rate": 1.2401068251439072e-06, + "loss": 0.5129, + "step": 3617 + }, + { + "epoch": 4.046979865771812, + "grad_norm": 0.5982204675674438, + "learning_rate": 1.2388183511801746e-06, + "loss": 0.5131, + "step": 3618 + }, + { + "epoch": 4.0480984340044746, + "grad_norm": 0.5934600830078125, + "learning_rate": 1.2375303264225088e-06, + "loss": 0.5208, + "step": 3619 + }, + { + "epoch": 4.049217002237136, + "grad_norm": 0.6015812754631042, + "learning_rate": 1.236242751329679e-06, + "loss": 0.5622, + "step": 3620 + }, + { + "epoch": 4.050335570469799, + "grad_norm": 0.6197997331619263, + "learning_rate": 1.2349556263602908e-06, + "loss": 0.55, + "step": 3621 + }, + { + "epoch": 4.051454138702461, + "grad_norm": 0.5988110899925232, + "learning_rate": 1.233668951972791e-06, + "loss": 0.5275, + "step": 3622 + }, + { + "epoch": 4.052572706935123, + "grad_norm": 0.613101065158844, + "learning_rate": 1.2323827286254645e-06, + "loss": 0.5299, + "step": 3623 + }, + { + "epoch": 4.053691275167785, + "grad_norm": 0.6004343628883362, + "learning_rate": 1.2310969567764386e-06, + "loss": 0.5233, + "step": 3624 + }, + { + "epoch": 4.054809843400448, + "grad_norm": 0.6130960583686829, + "learning_rate": 1.2298116368836772e-06, + "loss": 0.5242, + "step": 3625 + }, + { + "epoch": 4.05592841163311, + "grad_norm": 0.5979886054992676, + "learning_rate": 1.2285267694049823e-06, + "loss": 0.5346, + "step": 3626 + }, + { + "epoch": 4.057046979865772, + "grad_norm": 0.6122968792915344, + "learning_rate": 1.2272423547979993e-06, + "loss": 0.5521, + "step": 3627 + }, + { + "epoch": 4.058165548098434, + "grad_norm": 0.6044282913208008, + "learning_rate": 1.2259583935202063e-06, + "loss": 0.5322, + "step": 3628 + }, + { + "epoch": 4.059284116331096, + "grad_norm": 0.5857721567153931, + "learning_rate": 1.2246748860289254e-06, + "loss": 0.5113, + "step": 3629 + }, + { + "epoch": 4.060402684563758, + "grad_norm": 0.615626871585846, + "learning_rate": 1.223391832781314e-06, + "loss": 0.5259, + "step": 3630 + }, + { + "epoch": 4.06152125279642, + "grad_norm": 0.5925611853599548, + "learning_rate": 1.222109234234368e-06, + "loss": 0.5282, + "step": 3631 + }, + { + "epoch": 4.062639821029083, + "grad_norm": 0.6028927564620972, + "learning_rate": 1.2208270908449207e-06, + "loss": 0.5055, + "step": 3632 + }, + { + "epoch": 4.063758389261745, + "grad_norm": 0.5996367335319519, + "learning_rate": 1.2195454030696466e-06, + "loss": 0.5385, + "step": 3633 + }, + { + "epoch": 4.064876957494407, + "grad_norm": 0.6218629479408264, + "learning_rate": 1.2182641713650534e-06, + "loss": 0.5437, + "step": 3634 + }, + { + "epoch": 4.065995525727069, + "grad_norm": 0.5944381356239319, + "learning_rate": 1.2169833961874902e-06, + "loss": 0.5232, + "step": 3635 + }, + { + "epoch": 4.067114093959732, + "grad_norm": 0.6154597401618958, + "learning_rate": 1.2157030779931418e-06, + "loss": 0.5417, + "step": 3636 + }, + { + "epoch": 4.068232662192393, + "grad_norm": 0.5996164679527283, + "learning_rate": 1.2144232172380283e-06, + "loss": 0.5134, + "step": 3637 + }, + { + "epoch": 4.069351230425056, + "grad_norm": 0.6204807162284851, + "learning_rate": 1.2131438143780113e-06, + "loss": 0.5466, + "step": 3638 + }, + { + "epoch": 4.070469798657718, + "grad_norm": 0.6035084128379822, + "learning_rate": 1.2118648698687862e-06, + "loss": 0.5192, + "step": 3639 + }, + { + "epoch": 4.0715883668903805, + "grad_norm": 0.6030504107475281, + "learning_rate": 1.210586384165885e-06, + "loss": 0.5279, + "step": 3640 + }, + { + "epoch": 4.072706935123042, + "grad_norm": 0.6018303632736206, + "learning_rate": 1.2093083577246773e-06, + "loss": 0.5254, + "step": 3641 + }, + { + "epoch": 4.073825503355705, + "grad_norm": 0.591963529586792, + "learning_rate": 1.20803079100037e-06, + "loss": 0.5338, + "step": 3642 + }, + { + "epoch": 4.074944071588367, + "grad_norm": 0.5986825227737427, + "learning_rate": 1.2067536844480033e-06, + "loss": 0.5045, + "step": 3643 + }, + { + "epoch": 4.076062639821029, + "grad_norm": 0.6163814663887024, + "learning_rate": 1.2054770385224574e-06, + "loss": 0.5074, + "step": 3644 + }, + { + "epoch": 4.077181208053691, + "grad_norm": 0.5927719473838806, + "learning_rate": 1.2042008536784455e-06, + "loss": 0.5148, + "step": 3645 + }, + { + "epoch": 4.078299776286354, + "grad_norm": 0.6080328822135925, + "learning_rate": 1.2029251303705177e-06, + "loss": 0.4989, + "step": 3646 + }, + { + "epoch": 4.0794183445190155, + "grad_norm": 0.626325786113739, + "learning_rate": 1.2016498690530592e-06, + "loss": 0.5483, + "step": 3647 + }, + { + "epoch": 4.080536912751678, + "grad_norm": 0.5998907089233398, + "learning_rate": 1.2003750701802903e-06, + "loss": 0.5311, + "step": 3648 + }, + { + "epoch": 4.08165548098434, + "grad_norm": 0.6005836725234985, + "learning_rate": 1.1991007342062686e-06, + "loss": 0.5172, + "step": 3649 + }, + { + "epoch": 4.082774049217003, + "grad_norm": 0.6273759007453918, + "learning_rate": 1.197826861584884e-06, + "loss": 0.5337, + "step": 3650 + }, + { + "epoch": 4.083892617449664, + "grad_norm": 0.5987786650657654, + "learning_rate": 1.1965534527698647e-06, + "loss": 0.5192, + "step": 3651 + }, + { + "epoch": 4.085011185682327, + "grad_norm": 0.6150439977645874, + "learning_rate": 1.1952805082147697e-06, + "loss": 0.5046, + "step": 3652 + }, + { + "epoch": 4.086129753914989, + "grad_norm": 0.6031131744384766, + "learning_rate": 1.194008028372997e-06, + "loss": 0.507, + "step": 3653 + }, + { + "epoch": 4.087248322147651, + "grad_norm": 0.6083060503005981, + "learning_rate": 1.1927360136977753e-06, + "loss": 0.5414, + "step": 3654 + }, + { + "epoch": 4.088366890380313, + "grad_norm": 0.5927894711494446, + "learning_rate": 1.1914644646421698e-06, + "loss": 0.5098, + "step": 3655 + }, + { + "epoch": 4.089485458612975, + "grad_norm": 0.6099792718887329, + "learning_rate": 1.1901933816590787e-06, + "loss": 0.5316, + "step": 3656 + }, + { + "epoch": 4.090604026845638, + "grad_norm": 0.5856950879096985, + "learning_rate": 1.1889227652012345e-06, + "loss": 0.5161, + "step": 3657 + }, + { + "epoch": 4.091722595078299, + "grad_norm": 0.5956772565841675, + "learning_rate": 1.1876526157212052e-06, + "loss": 0.5228, + "step": 3658 + }, + { + "epoch": 4.092841163310962, + "grad_norm": 0.598544716835022, + "learning_rate": 1.1863829336713886e-06, + "loss": 0.5252, + "step": 3659 + }, + { + "epoch": 4.093959731543624, + "grad_norm": 0.600497841835022, + "learning_rate": 1.185113719504021e-06, + "loss": 0.5063, + "step": 3660 + }, + { + "epoch": 4.0950782997762865, + "grad_norm": 0.5894132256507874, + "learning_rate": 1.1838449736711677e-06, + "loss": 0.5146, + "step": 3661 + }, + { + "epoch": 4.096196868008948, + "grad_norm": 0.5936020612716675, + "learning_rate": 1.1825766966247305e-06, + "loss": 0.5202, + "step": 3662 + }, + { + "epoch": 4.097315436241611, + "grad_norm": 0.5978924036026001, + "learning_rate": 1.1813088888164414e-06, + "loss": 0.5015, + "step": 3663 + }, + { + "epoch": 4.098434004474273, + "grad_norm": 0.611702024936676, + "learning_rate": 1.1800415506978674e-06, + "loss": 0.5102, + "step": 3664 + }, + { + "epoch": 4.099552572706935, + "grad_norm": 0.6160632967948914, + "learning_rate": 1.1787746827204059e-06, + "loss": 0.5526, + "step": 3665 + }, + { + "epoch": 4.100671140939597, + "grad_norm": 0.5955978035926819, + "learning_rate": 1.1775082853352902e-06, + "loss": 0.5179, + "step": 3666 + }, + { + "epoch": 4.10178970917226, + "grad_norm": 0.6225089430809021, + "learning_rate": 1.1762423589935834e-06, + "loss": 0.5481, + "step": 3667 + }, + { + "epoch": 4.1029082774049215, + "grad_norm": 0.6037780046463013, + "learning_rate": 1.1749769041461804e-06, + "loss": 0.5355, + "step": 3668 + }, + { + "epoch": 4.104026845637584, + "grad_norm": 0.6028552651405334, + "learning_rate": 1.1737119212438115e-06, + "loss": 0.5186, + "step": 3669 + }, + { + "epoch": 4.105145413870246, + "grad_norm": 0.5955883860588074, + "learning_rate": 1.1724474107370352e-06, + "loss": 0.4915, + "step": 3670 + }, + { + "epoch": 4.1062639821029085, + "grad_norm": 0.5754148960113525, + "learning_rate": 1.1711833730762434e-06, + "loss": 0.4892, + "step": 3671 + }, + { + "epoch": 4.10738255033557, + "grad_norm": 0.6185005307197571, + "learning_rate": 1.169919808711659e-06, + "loss": 0.564, + "step": 3672 + }, + { + "epoch": 4.108501118568233, + "grad_norm": 0.6019536256790161, + "learning_rate": 1.1686567180933378e-06, + "loss": 0.5276, + "step": 3673 + }, + { + "epoch": 4.109619686800895, + "grad_norm": 0.5994640588760376, + "learning_rate": 1.1673941016711645e-06, + "loss": 0.5231, + "step": 3674 + }, + { + "epoch": 4.110738255033557, + "grad_norm": 0.6197410225868225, + "learning_rate": 1.166131959894858e-06, + "loss": 0.5634, + "step": 3675 + }, + { + "epoch": 4.111856823266219, + "grad_norm": 0.5986242294311523, + "learning_rate": 1.1648702932139647e-06, + "loss": 0.5177, + "step": 3676 + }, + { + "epoch": 4.112975391498882, + "grad_norm": 0.6270274519920349, + "learning_rate": 1.1636091020778645e-06, + "loss": 0.5263, + "step": 3677 + }, + { + "epoch": 4.114093959731544, + "grad_norm": 0.6105303168296814, + "learning_rate": 1.1623483869357665e-06, + "loss": 0.5197, + "step": 3678 + }, + { + "epoch": 4.115212527964206, + "grad_norm": 0.6222716569900513, + "learning_rate": 1.1610881482367105e-06, + "loss": 0.5368, + "step": 3679 + }, + { + "epoch": 4.116331096196868, + "grad_norm": 0.6289597153663635, + "learning_rate": 1.159828386429567e-06, + "loss": 0.5417, + "step": 3680 + }, + { + "epoch": 4.117449664429531, + "grad_norm": 0.6015812754631042, + "learning_rate": 1.158569101963035e-06, + "loss": 0.5428, + "step": 3681 + }, + { + "epoch": 4.118568232662192, + "grad_norm": 0.6269590258598328, + "learning_rate": 1.157310295285647e-06, + "loss": 0.5266, + "step": 3682 + }, + { + "epoch": 4.119686800894854, + "grad_norm": 0.6014243960380554, + "learning_rate": 1.1560519668457606e-06, + "loss": 0.5414, + "step": 3683 + }, + { + "epoch": 4.120805369127517, + "grad_norm": 0.6185159087181091, + "learning_rate": 1.1547941170915686e-06, + "loss": 0.4965, + "step": 3684 + }, + { + "epoch": 4.121923937360179, + "grad_norm": 0.6223366260528564, + "learning_rate": 1.1535367464710875e-06, + "loss": 0.5151, + "step": 3685 + }, + { + "epoch": 4.123042505592841, + "grad_norm": 0.6197161674499512, + "learning_rate": 1.1522798554321685e-06, + "loss": 0.5546, + "step": 3686 + }, + { + "epoch": 4.124161073825503, + "grad_norm": 0.6214160919189453, + "learning_rate": 1.151023444422488e-06, + "loss": 0.5598, + "step": 3687 + }, + { + "epoch": 4.125279642058166, + "grad_norm": 0.5989544987678528, + "learning_rate": 1.149767513889553e-06, + "loss": 0.5276, + "step": 3688 + }, + { + "epoch": 4.126398210290827, + "grad_norm": 0.6019754409790039, + "learning_rate": 1.1485120642806993e-06, + "loss": 0.5265, + "step": 3689 + }, + { + "epoch": 4.12751677852349, + "grad_norm": 0.6040080785751343, + "learning_rate": 1.1472570960430903e-06, + "loss": 0.5489, + "step": 3690 + }, + { + "epoch": 4.128635346756152, + "grad_norm": 0.605163037776947, + "learning_rate": 1.1460026096237211e-06, + "loss": 0.5081, + "step": 3691 + }, + { + "epoch": 4.1297539149888145, + "grad_norm": 0.5949017405509949, + "learning_rate": 1.1447486054694113e-06, + "loss": 0.5286, + "step": 3692 + }, + { + "epoch": 4.130872483221476, + "grad_norm": 0.5885686278343201, + "learning_rate": 1.1434950840268119e-06, + "loss": 0.5174, + "step": 3693 + }, + { + "epoch": 4.131991051454139, + "grad_norm": 0.5962170958518982, + "learning_rate": 1.1422420457423988e-06, + "loss": 0.5121, + "step": 3694 + }, + { + "epoch": 4.133109619686801, + "grad_norm": 0.6187509894371033, + "learning_rate": 1.1409894910624808e-06, + "loss": 0.5351, + "step": 3695 + }, + { + "epoch": 4.134228187919463, + "grad_norm": 0.6415494084358215, + "learning_rate": 1.1397374204331867e-06, + "loss": 0.5477, + "step": 3696 + }, + { + "epoch": 4.135346756152125, + "grad_norm": 0.6038553714752197, + "learning_rate": 1.1384858343004812e-06, + "loss": 0.5634, + "step": 3697 + }, + { + "epoch": 4.136465324384788, + "grad_norm": 0.6118021011352539, + "learning_rate": 1.1372347331101511e-06, + "loss": 0.5088, + "step": 3698 + }, + { + "epoch": 4.1375838926174495, + "grad_norm": 0.5945549607276917, + "learning_rate": 1.135984117307811e-06, + "loss": 0.5067, + "step": 3699 + }, + { + "epoch": 4.138702460850112, + "grad_norm": 0.6121227741241455, + "learning_rate": 1.1347339873389058e-06, + "loss": 0.5357, + "step": 3700 + }, + { + "epoch": 4.139821029082774, + "grad_norm": 0.6096861362457275, + "learning_rate": 1.1334843436487035e-06, + "loss": 0.521, + "step": 3701 + }, + { + "epoch": 4.140939597315437, + "grad_norm": 0.6067557334899902, + "learning_rate": 1.1322351866823016e-06, + "loss": 0.5176, + "step": 3702 + }, + { + "epoch": 4.142058165548098, + "grad_norm": 0.611638605594635, + "learning_rate": 1.1309865168846235e-06, + "loss": 0.5433, + "step": 3703 + }, + { + "epoch": 4.143176733780761, + "grad_norm": 0.6068941354751587, + "learning_rate": 1.1297383347004178e-06, + "loss": 0.5398, + "step": 3704 + }, + { + "epoch": 4.144295302013423, + "grad_norm": 0.6006292104721069, + "learning_rate": 1.1284906405742602e-06, + "loss": 0.5103, + "step": 3705 + }, + { + "epoch": 4.145413870246085, + "grad_norm": 0.6096678972244263, + "learning_rate": 1.1272434349505543e-06, + "loss": 0.5438, + "step": 3706 + }, + { + "epoch": 4.146532438478747, + "grad_norm": 0.6165948510169983, + "learning_rate": 1.1259967182735263e-06, + "loss": 0.485, + "step": 3707 + }, + { + "epoch": 4.14765100671141, + "grad_norm": 0.601341962814331, + "learning_rate": 1.1247504909872323e-06, + "loss": 0.5315, + "step": 3708 + }, + { + "epoch": 4.148769574944072, + "grad_norm": 0.6077728867530823, + "learning_rate": 1.1235047535355507e-06, + "loss": 0.5537, + "step": 3709 + }, + { + "epoch": 4.149888143176733, + "grad_norm": 0.586830198764801, + "learning_rate": 1.1222595063621857e-06, + "loss": 0.5175, + "step": 3710 + }, + { + "epoch": 4.151006711409396, + "grad_norm": 0.636084794998169, + "learning_rate": 1.1210147499106703e-06, + "loss": 0.5411, + "step": 3711 + }, + { + "epoch": 4.152125279642058, + "grad_norm": 0.6106093525886536, + "learning_rate": 1.1197704846243587e-06, + "loss": 0.5226, + "step": 3712 + }, + { + "epoch": 4.1532438478747205, + "grad_norm": 0.6054192781448364, + "learning_rate": 1.1185267109464321e-06, + "loss": 0.5392, + "step": 3713 + }, + { + "epoch": 4.154362416107382, + "grad_norm": 0.5832464098930359, + "learning_rate": 1.1172834293198951e-06, + "loss": 0.4762, + "step": 3714 + }, + { + "epoch": 4.155480984340045, + "grad_norm": 0.60902339220047, + "learning_rate": 1.1160406401875797e-06, + "loss": 0.5314, + "step": 3715 + }, + { + "epoch": 4.156599552572707, + "grad_norm": 0.603844165802002, + "learning_rate": 1.11479834399214e-06, + "loss": 0.5404, + "step": 3716 + }, + { + "epoch": 4.157718120805369, + "grad_norm": 0.6323402523994446, + "learning_rate": 1.1135565411760565e-06, + "loss": 0.5427, + "step": 3717 + }, + { + "epoch": 4.158836689038031, + "grad_norm": 0.6196385025978088, + "learning_rate": 1.1123152321816313e-06, + "loss": 0.524, + "step": 3718 + }, + { + "epoch": 4.159955257270694, + "grad_norm": 0.6259109973907471, + "learning_rate": 1.1110744174509952e-06, + "loss": 0.5224, + "step": 3719 + }, + { + "epoch": 4.1610738255033555, + "grad_norm": 0.6295533776283264, + "learning_rate": 1.109834097426097e-06, + "loss": 0.5455, + "step": 3720 + }, + { + "epoch": 4.162192393736018, + "grad_norm": 0.6082515716552734, + "learning_rate": 1.1085942725487126e-06, + "loss": 0.541, + "step": 3721 + }, + { + "epoch": 4.16331096196868, + "grad_norm": 0.5940147638320923, + "learning_rate": 1.1073549432604428e-06, + "loss": 0.5143, + "step": 3722 + }, + { + "epoch": 4.1644295302013425, + "grad_norm": 0.6370109915733337, + "learning_rate": 1.1061161100027085e-06, + "loss": 0.554, + "step": 3723 + }, + { + "epoch": 4.165548098434004, + "grad_norm": 0.5989547371864319, + "learning_rate": 1.1048777732167576e-06, + "loss": 0.5245, + "step": 3724 + }, + { + "epoch": 4.166666666666667, + "grad_norm": 0.5975155830383301, + "learning_rate": 1.1036399333436578e-06, + "loss": 0.5157, + "step": 3725 + }, + { + "epoch": 4.167785234899329, + "grad_norm": 0.5963829755783081, + "learning_rate": 1.1024025908243027e-06, + "loss": 0.5069, + "step": 3726 + }, + { + "epoch": 4.168903803131991, + "grad_norm": 0.5934597849845886, + "learning_rate": 1.101165746099407e-06, + "loss": 0.5214, + "step": 3727 + }, + { + "epoch": 4.170022371364653, + "grad_norm": 0.6068410873413086, + "learning_rate": 1.0999293996095076e-06, + "loss": 0.5271, + "step": 3728 + }, + { + "epoch": 4.171140939597316, + "grad_norm": 0.6195031404495239, + "learning_rate": 1.0986935517949647e-06, + "loss": 0.5464, + "step": 3729 + }, + { + "epoch": 4.172259507829978, + "grad_norm": 0.6107401847839355, + "learning_rate": 1.0974582030959627e-06, + "loss": 0.5374, + "step": 3730 + }, + { + "epoch": 4.17337807606264, + "grad_norm": 0.5975446105003357, + "learning_rate": 1.0962233539525052e-06, + "loss": 0.5359, + "step": 3731 + }, + { + "epoch": 4.174496644295302, + "grad_norm": 0.6143819689750671, + "learning_rate": 1.0949890048044187e-06, + "loss": 0.5332, + "step": 3732 + }, + { + "epoch": 4.175615212527964, + "grad_norm": 0.6155346632003784, + "learning_rate": 1.0937551560913537e-06, + "loss": 0.5333, + "step": 3733 + }, + { + "epoch": 4.176733780760626, + "grad_norm": 0.604322612285614, + "learning_rate": 1.0925218082527792e-06, + "loss": 0.5048, + "step": 3734 + }, + { + "epoch": 4.177852348993288, + "grad_norm": 0.6096341013908386, + "learning_rate": 1.0912889617279894e-06, + "loss": 0.5179, + "step": 3735 + }, + { + "epoch": 4.178970917225951, + "grad_norm": 0.6114840507507324, + "learning_rate": 1.0900566169560964e-06, + "loss": 0.5311, + "step": 3736 + }, + { + "epoch": 4.180089485458613, + "grad_norm": 0.5926511287689209, + "learning_rate": 1.0888247743760358e-06, + "loss": 0.5135, + "step": 3737 + }, + { + "epoch": 4.181208053691275, + "grad_norm": 0.6032086610794067, + "learning_rate": 1.087593434426563e-06, + "loss": 0.5179, + "step": 3738 + }, + { + "epoch": 4.182326621923937, + "grad_norm": 0.6060828566551208, + "learning_rate": 1.086362597546256e-06, + "loss": 0.5373, + "step": 3739 + }, + { + "epoch": 4.1834451901566, + "grad_norm": 0.6098891496658325, + "learning_rate": 1.0851322641735119e-06, + "loss": 0.5216, + "step": 3740 + }, + { + "epoch": 4.184563758389261, + "grad_norm": 0.6017407774925232, + "learning_rate": 1.0839024347465505e-06, + "loss": 0.5248, + "step": 3741 + }, + { + "epoch": 4.185682326621924, + "grad_norm": 0.599614679813385, + "learning_rate": 1.0826731097034102e-06, + "loss": 0.5347, + "step": 3742 + }, + { + "epoch": 4.186800894854586, + "grad_norm": 0.618355929851532, + "learning_rate": 1.081444289481949e-06, + "loss": 0.5313, + "step": 3743 + }, + { + "epoch": 4.1879194630872485, + "grad_norm": 0.5960415601730347, + "learning_rate": 1.0802159745198501e-06, + "loss": 0.5347, + "step": 3744 + }, + { + "epoch": 4.18903803131991, + "grad_norm": 0.6079975962638855, + "learning_rate": 1.0789881652546091e-06, + "loss": 0.5309, + "step": 3745 + }, + { + "epoch": 4.190156599552573, + "grad_norm": 0.57371985912323, + "learning_rate": 1.0777608621235482e-06, + "loss": 0.5131, + "step": 3746 + }, + { + "epoch": 4.191275167785235, + "grad_norm": 0.6361687779426575, + "learning_rate": 1.0765340655638048e-06, + "loss": 0.5627, + "step": 3747 + }, + { + "epoch": 4.192393736017897, + "grad_norm": 0.5936689972877502, + "learning_rate": 1.0753077760123397e-06, + "loss": 0.5174, + "step": 3748 + }, + { + "epoch": 4.193512304250559, + "grad_norm": 0.6075806021690369, + "learning_rate": 1.074081993905929e-06, + "loss": 0.5286, + "step": 3749 + }, + { + "epoch": 4.194630872483222, + "grad_norm": 0.6172950863838196, + "learning_rate": 1.0728567196811728e-06, + "loss": 0.5396, + "step": 3750 + }, + { + "epoch": 4.1957494407158835, + "grad_norm": 0.6089175939559937, + "learning_rate": 1.071631953774486e-06, + "loss": 0.535, + "step": 3751 + }, + { + "epoch": 4.196868008948546, + "grad_norm": 0.61447674036026, + "learning_rate": 1.0704076966221043e-06, + "loss": 0.5188, + "step": 3752 + }, + { + "epoch": 4.197986577181208, + "grad_norm": 0.5935482382774353, + "learning_rate": 1.0691839486600825e-06, + "loss": 0.5051, + "step": 3753 + }, + { + "epoch": 4.199105145413871, + "grad_norm": 0.6007702350616455, + "learning_rate": 1.0679607103242923e-06, + "loss": 0.5231, + "step": 3754 + }, + { + "epoch": 4.200223713646532, + "grad_norm": 0.6344751715660095, + "learning_rate": 1.066737982050427e-06, + "loss": 0.512, + "step": 3755 + }, + { + "epoch": 4.201342281879195, + "grad_norm": 0.6222809553146362, + "learning_rate": 1.0655157642739944e-06, + "loss": 0.5511, + "step": 3756 + }, + { + "epoch": 4.202460850111857, + "grad_norm": 0.618102490901947, + "learning_rate": 1.0642940574303248e-06, + "loss": 0.5481, + "step": 3757 + }, + { + "epoch": 4.203579418344519, + "grad_norm": 0.6104963421821594, + "learning_rate": 1.0630728619545618e-06, + "loss": 0.5363, + "step": 3758 + }, + { + "epoch": 4.204697986577181, + "grad_norm": 0.5959570407867432, + "learning_rate": 1.0618521782816718e-06, + "loss": 0.5519, + "step": 3759 + }, + { + "epoch": 4.205816554809843, + "grad_norm": 0.6085075736045837, + "learning_rate": 1.0606320068464346e-06, + "loss": 0.5373, + "step": 3760 + }, + { + "epoch": 4.206935123042506, + "grad_norm": 0.6132450103759766, + "learning_rate": 1.0594123480834498e-06, + "loss": 0.528, + "step": 3761 + }, + { + "epoch": 4.208053691275167, + "grad_norm": 0.6066333055496216, + "learning_rate": 1.0581932024271337e-06, + "loss": 0.5057, + "step": 3762 + }, + { + "epoch": 4.20917225950783, + "grad_norm": 0.623799741268158, + "learning_rate": 1.0569745703117192e-06, + "loss": 0.5315, + "step": 3763 + }, + { + "epoch": 4.210290827740492, + "grad_norm": 0.6040028929710388, + "learning_rate": 1.0557564521712594e-06, + "loss": 0.4975, + "step": 3764 + }, + { + "epoch": 4.2114093959731544, + "grad_norm": 0.6223567724227905, + "learning_rate": 1.0545388484396193e-06, + "loss": 0.5354, + "step": 3765 + }, + { + "epoch": 4.212527964205816, + "grad_norm": 0.6043722033500671, + "learning_rate": 1.0533217595504859e-06, + "loss": 0.5303, + "step": 3766 + }, + { + "epoch": 4.213646532438479, + "grad_norm": 0.6073420643806458, + "learning_rate": 1.0521051859373585e-06, + "loss": 0.5241, + "step": 3767 + }, + { + "epoch": 4.214765100671141, + "grad_norm": 0.6065641641616821, + "learning_rate": 1.0508891280335562e-06, + "loss": 0.5258, + "step": 3768 + }, + { + "epoch": 4.215883668903803, + "grad_norm": 0.5999026894569397, + "learning_rate": 1.0496735862722127e-06, + "loss": 0.505, + "step": 3769 + }, + { + "epoch": 4.217002237136465, + "grad_norm": 0.6005153656005859, + "learning_rate": 1.0484585610862774e-06, + "loss": 0.498, + "step": 3770 + }, + { + "epoch": 4.218120805369128, + "grad_norm": 0.5878973007202148, + "learning_rate": 1.047244052908516e-06, + "loss": 0.518, + "step": 3771 + }, + { + "epoch": 4.2192393736017895, + "grad_norm": 0.6193377375602722, + "learning_rate": 1.046030062171512e-06, + "loss": 0.5103, + "step": 3772 + }, + { + "epoch": 4.220357941834452, + "grad_norm": 0.5966192483901978, + "learning_rate": 1.0448165893076623e-06, + "loss": 0.5125, + "step": 3773 + }, + { + "epoch": 4.221476510067114, + "grad_norm": 0.6138927340507507, + "learning_rate": 1.0436036347491794e-06, + "loss": 0.5431, + "step": 3774 + }, + { + "epoch": 4.2225950782997765, + "grad_norm": 0.5923686027526855, + "learning_rate": 1.0423911989280932e-06, + "loss": 0.5221, + "step": 3775 + }, + { + "epoch": 4.223713646532438, + "grad_norm": 0.63111811876297, + "learning_rate": 1.041179282276247e-06, + "loss": 0.5559, + "step": 3776 + }, + { + "epoch": 4.224832214765101, + "grad_norm": 0.6234198808670044, + "learning_rate": 1.0399678852252998e-06, + "loss": 0.5125, + "step": 3777 + }, + { + "epoch": 4.225950782997763, + "grad_norm": 0.5993960499763489, + "learning_rate": 1.0387570082067241e-06, + "loss": 0.4888, + "step": 3778 + }, + { + "epoch": 4.227069351230425, + "grad_norm": 0.5968877077102661, + "learning_rate": 1.0375466516518109e-06, + "loss": 0.5377, + "step": 3779 + }, + { + "epoch": 4.228187919463087, + "grad_norm": 0.5986246466636658, + "learning_rate": 1.0363368159916615e-06, + "loss": 0.5351, + "step": 3780 + }, + { + "epoch": 4.22930648769575, + "grad_norm": 0.6331195831298828, + "learning_rate": 1.0351275016571953e-06, + "loss": 0.549, + "step": 3781 + }, + { + "epoch": 4.230425055928412, + "grad_norm": 0.6150019764900208, + "learning_rate": 1.0339187090791423e-06, + "loss": 0.5411, + "step": 3782 + }, + { + "epoch": 4.231543624161074, + "grad_norm": 0.613914430141449, + "learning_rate": 1.0327104386880506e-06, + "loss": 0.5139, + "step": 3783 + }, + { + "epoch": 4.232662192393736, + "grad_norm": 0.6459356546401978, + "learning_rate": 1.03150269091428e-06, + "loss": 0.5566, + "step": 3784 + }, + { + "epoch": 4.233780760626399, + "grad_norm": 0.6135070323944092, + "learning_rate": 1.030295466188004e-06, + "loss": 0.4999, + "step": 3785 + }, + { + "epoch": 4.23489932885906, + "grad_norm": 0.6170519590377808, + "learning_rate": 1.0290887649392108e-06, + "loss": 0.5236, + "step": 3786 + }, + { + "epoch": 4.236017897091722, + "grad_norm": 0.6268102526664734, + "learning_rate": 1.0278825875977003e-06, + "loss": 0.5307, + "step": 3787 + }, + { + "epoch": 4.237136465324385, + "grad_norm": 0.6062012910842896, + "learning_rate": 1.0266769345930893e-06, + "loss": 0.508, + "step": 3788 + }, + { + "epoch": 4.238255033557047, + "grad_norm": 0.6067014932632446, + "learning_rate": 1.025471806354804e-06, + "loss": 0.5275, + "step": 3789 + }, + { + "epoch": 4.239373601789709, + "grad_norm": 0.5911723971366882, + "learning_rate": 1.0242672033120874e-06, + "loss": 0.518, + "step": 3790 + }, + { + "epoch": 4.240492170022371, + "grad_norm": 0.5972678065299988, + "learning_rate": 1.0230631258939914e-06, + "loss": 0.5412, + "step": 3791 + }, + { + "epoch": 4.241610738255034, + "grad_norm": 0.6093435287475586, + "learning_rate": 1.0218595745293842e-06, + "loss": 0.5343, + "step": 3792 + }, + { + "epoch": 4.242729306487695, + "grad_norm": 0.6260213255882263, + "learning_rate": 1.0206565496469448e-06, + "loss": 0.5498, + "step": 3793 + }, + { + "epoch": 4.243847874720358, + "grad_norm": 0.6062767505645752, + "learning_rate": 1.019454051675165e-06, + "loss": 0.5379, + "step": 3794 + }, + { + "epoch": 4.24496644295302, + "grad_norm": 0.6090009212493896, + "learning_rate": 1.018252081042349e-06, + "loss": 0.5142, + "step": 3795 + }, + { + "epoch": 4.2460850111856825, + "grad_norm": 0.597719669342041, + "learning_rate": 1.0170506381766121e-06, + "loss": 0.5169, + "step": 3796 + }, + { + "epoch": 4.247203579418344, + "grad_norm": 0.6165065169334412, + "learning_rate": 1.0158497235058844e-06, + "loss": 0.5314, + "step": 3797 + }, + { + "epoch": 4.248322147651007, + "grad_norm": 0.6042012572288513, + "learning_rate": 1.014649337457905e-06, + "loss": 0.5182, + "step": 3798 + }, + { + "epoch": 4.249440715883669, + "grad_norm": 0.5833631753921509, + "learning_rate": 1.013449480460227e-06, + "loss": 0.5047, + "step": 3799 + }, + { + "epoch": 4.250559284116331, + "grad_norm": 0.6215090751647949, + "learning_rate": 1.0122501529402132e-06, + "loss": 0.5092, + "step": 3800 + }, + { + "epoch": 4.251677852348993, + "grad_norm": 0.6076421737670898, + "learning_rate": 1.0110513553250385e-06, + "loss": 0.5269, + "step": 3801 + }, + { + "epoch": 4.252796420581656, + "grad_norm": 0.5998140573501587, + "learning_rate": 1.0098530880416881e-06, + "loss": 0.5149, + "step": 3802 + }, + { + "epoch": 4.2539149888143175, + "grad_norm": 0.6134610176086426, + "learning_rate": 1.0086553515169617e-06, + "loss": 0.5397, + "step": 3803 + }, + { + "epoch": 4.25503355704698, + "grad_norm": 0.6159098744392395, + "learning_rate": 1.007458146177466e-06, + "loss": 0.561, + "step": 3804 + }, + { + "epoch": 4.256152125279642, + "grad_norm": 0.6095084547996521, + "learning_rate": 1.0062614724496195e-06, + "loss": 0.531, + "step": 3805 + }, + { + "epoch": 4.257270693512305, + "grad_norm": 0.6020951867103577, + "learning_rate": 1.0050653307596536e-06, + "loss": 0.5314, + "step": 3806 + }, + { + "epoch": 4.258389261744966, + "grad_norm": 0.6136798858642578, + "learning_rate": 1.0038697215336068e-06, + "loss": 0.5175, + "step": 3807 + }, + { + "epoch": 4.259507829977629, + "grad_norm": 0.5957618951797485, + "learning_rate": 1.0026746451973313e-06, + "loss": 0.5346, + "step": 3808 + }, + { + "epoch": 4.260626398210291, + "grad_norm": 0.6040836572647095, + "learning_rate": 1.0014801021764872e-06, + "loss": 0.5383, + "step": 3809 + }, + { + "epoch": 4.261744966442953, + "grad_norm": 0.5815736651420593, + "learning_rate": 1.0002860928965451e-06, + "loss": 0.5234, + "step": 3810 + }, + { + "epoch": 4.262863534675615, + "grad_norm": 0.5883612632751465, + "learning_rate": 9.99092617782785e-07, + "loss": 0.5103, + "step": 3811 + }, + { + "epoch": 4.263982102908278, + "grad_norm": 0.6072722673416138, + "learning_rate": 9.97899677260299e-07, + "loss": 0.5373, + "step": 3812 + }, + { + "epoch": 4.26510067114094, + "grad_norm": 0.5919036269187927, + "learning_rate": 9.967072717539852e-07, + "loss": 0.5351, + "step": 3813 + }, + { + "epoch": 4.266219239373601, + "grad_norm": 0.6038205027580261, + "learning_rate": 9.95515401688555e-07, + "loss": 0.539, + "step": 3814 + }, + { + "epoch": 4.267337807606264, + "grad_norm": 0.6071666479110718, + "learning_rate": 9.943240674885268e-07, + "loss": 0.5097, + "step": 3815 + }, + { + "epoch": 4.268456375838926, + "grad_norm": 0.5986416339874268, + "learning_rate": 9.931332695782268e-07, + "loss": 0.5228, + "step": 3816 + }, + { + "epoch": 4.269574944071588, + "grad_norm": 0.6139152646064758, + "learning_rate": 9.919430083817939e-07, + "loss": 0.5453, + "step": 3817 + }, + { + "epoch": 4.27069351230425, + "grad_norm": 0.6048949360847473, + "learning_rate": 9.90753284323173e-07, + "loss": 0.5173, + "step": 3818 + }, + { + "epoch": 4.271812080536913, + "grad_norm": 0.6093915104866028, + "learning_rate": 9.895640978261181e-07, + "loss": 0.5435, + "step": 3819 + }, + { + "epoch": 4.272930648769575, + "grad_norm": 0.5934566855430603, + "learning_rate": 9.88375449314192e-07, + "loss": 0.497, + "step": 3820 + }, + { + "epoch": 4.274049217002237, + "grad_norm": 0.6141425967216492, + "learning_rate": 9.871873392107672e-07, + "loss": 0.5261, + "step": 3821 + }, + { + "epoch": 4.275167785234899, + "grad_norm": 0.6290242075920105, + "learning_rate": 9.85999767939022e-07, + "loss": 0.5319, + "step": 3822 + }, + { + "epoch": 4.276286353467562, + "grad_norm": 0.6037012934684753, + "learning_rate": 9.84812735921945e-07, + "loss": 0.5123, + "step": 3823 + }, + { + "epoch": 4.2774049217002235, + "grad_norm": 0.6119054555892944, + "learning_rate": 9.836262435823316e-07, + "loss": 0.5269, + "step": 3824 + }, + { + "epoch": 4.278523489932886, + "grad_norm": 0.605689287185669, + "learning_rate": 9.824402913427848e-07, + "loss": 0.5161, + "step": 3825 + }, + { + "epoch": 4.279642058165548, + "grad_norm": 0.6387097239494324, + "learning_rate": 9.812548796257155e-07, + "loss": 0.5287, + "step": 3826 + }, + { + "epoch": 4.2807606263982105, + "grad_norm": 0.6267847418785095, + "learning_rate": 9.800700088533412e-07, + "loss": 0.5388, + "step": 3827 + }, + { + "epoch": 4.281879194630872, + "grad_norm": 0.6022135019302368, + "learning_rate": 9.788856794476898e-07, + "loss": 0.5531, + "step": 3828 + }, + { + "epoch": 4.282997762863535, + "grad_norm": 0.609603226184845, + "learning_rate": 9.777018918305922e-07, + "loss": 0.514, + "step": 3829 + }, + { + "epoch": 4.284116331096197, + "grad_norm": 0.6278068423271179, + "learning_rate": 9.7651864642369e-07, + "loss": 0.5408, + "step": 3830 + }, + { + "epoch": 4.285234899328859, + "grad_norm": 0.6040200591087341, + "learning_rate": 9.753359436484285e-07, + "loss": 0.4857, + "step": 3831 + }, + { + "epoch": 4.286353467561521, + "grad_norm": 0.6248037219047546, + "learning_rate": 9.74153783926063e-07, + "loss": 0.5166, + "step": 3832 + }, + { + "epoch": 4.287472035794184, + "grad_norm": 0.6191821694374084, + "learning_rate": 9.729721676776526e-07, + "loss": 0.5294, + "step": 3833 + }, + { + "epoch": 4.2885906040268456, + "grad_norm": 0.6170260310173035, + "learning_rate": 9.71791095324064e-07, + "loss": 0.5224, + "step": 3834 + }, + { + "epoch": 4.289709172259508, + "grad_norm": 0.6297686696052551, + "learning_rate": 9.70610567285969e-07, + "loss": 0.5518, + "step": 3835 + }, + { + "epoch": 4.29082774049217, + "grad_norm": 0.627439558506012, + "learning_rate": 9.694305839838485e-07, + "loss": 0.552, + "step": 3836 + }, + { + "epoch": 4.291946308724833, + "grad_norm": 0.6032774448394775, + "learning_rate": 9.682511458379867e-07, + "loss": 0.5127, + "step": 3837 + }, + { + "epoch": 4.293064876957494, + "grad_norm": 0.6123248338699341, + "learning_rate": 9.67072253268473e-07, + "loss": 0.5327, + "step": 3838 + }, + { + "epoch": 4.294183445190157, + "grad_norm": 0.6246888637542725, + "learning_rate": 9.658939066952064e-07, + "loss": 0.5471, + "step": 3839 + }, + { + "epoch": 4.295302013422819, + "grad_norm": 0.5911756157875061, + "learning_rate": 9.647161065378863e-07, + "loss": 0.5102, + "step": 3840 + }, + { + "epoch": 4.296420581655481, + "grad_norm": 0.6359622478485107, + "learning_rate": 9.635388532160222e-07, + "loss": 0.5269, + "step": 3841 + }, + { + "epoch": 4.297539149888143, + "grad_norm": 0.606510579586029, + "learning_rate": 9.623621471489259e-07, + "loss": 0.5372, + "step": 3842 + }, + { + "epoch": 4.298657718120805, + "grad_norm": 0.5866826772689819, + "learning_rate": 9.611859887557147e-07, + "loss": 0.517, + "step": 3843 + }, + { + "epoch": 4.299776286353468, + "grad_norm": 0.6106289625167847, + "learning_rate": 9.600103784553108e-07, + "loss": 0.5378, + "step": 3844 + }, + { + "epoch": 4.300894854586129, + "grad_norm": 0.608625054359436, + "learning_rate": 9.588353166664428e-07, + "loss": 0.5315, + "step": 3845 + }, + { + "epoch": 4.302013422818792, + "grad_norm": 0.5995439887046814, + "learning_rate": 9.576608038076412e-07, + "loss": 0.508, + "step": 3846 + }, + { + "epoch": 4.303131991051454, + "grad_norm": 0.618381142616272, + "learning_rate": 9.564868402972444e-07, + "loss": 0.5249, + "step": 3847 + }, + { + "epoch": 4.3042505592841165, + "grad_norm": 0.5940191745758057, + "learning_rate": 9.553134265533922e-07, + "loss": 0.5201, + "step": 3848 + }, + { + "epoch": 4.305369127516778, + "grad_norm": 0.6101189255714417, + "learning_rate": 9.541405629940299e-07, + "loss": 0.5399, + "step": 3849 + }, + { + "epoch": 4.306487695749441, + "grad_norm": 0.6200927495956421, + "learning_rate": 9.529682500369056e-07, + "loss": 0.5632, + "step": 3850 + }, + { + "epoch": 4.307606263982103, + "grad_norm": 0.5787659883499146, + "learning_rate": 9.517964880995725e-07, + "loss": 0.4896, + "step": 3851 + }, + { + "epoch": 4.308724832214765, + "grad_norm": 0.6133612394332886, + "learning_rate": 9.506252775993882e-07, + "loss": 0.5431, + "step": 3852 + }, + { + "epoch": 4.309843400447427, + "grad_norm": 0.6298347115516663, + "learning_rate": 9.49454618953512e-07, + "loss": 0.5318, + "step": 3853 + }, + { + "epoch": 4.31096196868009, + "grad_norm": 0.5947498083114624, + "learning_rate": 9.482845125789086e-07, + "loss": 0.4977, + "step": 3854 + }, + { + "epoch": 4.3120805369127515, + "grad_norm": 0.6170173287391663, + "learning_rate": 9.471149588923434e-07, + "loss": 0.5388, + "step": 3855 + }, + { + "epoch": 4.313199105145414, + "grad_norm": 0.6093713045120239, + "learning_rate": 9.459459583103886e-07, + "loss": 0.5054, + "step": 3856 + }, + { + "epoch": 4.314317673378076, + "grad_norm": 0.6108666658401489, + "learning_rate": 9.447775112494161e-07, + "loss": 0.511, + "step": 3857 + }, + { + "epoch": 4.315436241610739, + "grad_norm": 0.6149061918258667, + "learning_rate": 9.436096181256024e-07, + "loss": 0.5129, + "step": 3858 + }, + { + "epoch": 4.3165548098434, + "grad_norm": 0.616568386554718, + "learning_rate": 9.424422793549257e-07, + "loss": 0.5319, + "step": 3859 + }, + { + "epoch": 4.317673378076063, + "grad_norm": 0.6054441928863525, + "learning_rate": 9.412754953531664e-07, + "loss": 0.5136, + "step": 3860 + }, + { + "epoch": 4.318791946308725, + "grad_norm": 0.6090465188026428, + "learning_rate": 9.401092665359102e-07, + "loss": 0.5189, + "step": 3861 + }, + { + "epoch": 4.319910514541387, + "grad_norm": 0.6332055926322937, + "learning_rate": 9.389435933185406e-07, + "loss": 0.5487, + "step": 3862 + }, + { + "epoch": 4.321029082774049, + "grad_norm": 0.5949379801750183, + "learning_rate": 9.377784761162481e-07, + "loss": 0.5251, + "step": 3863 + }, + { + "epoch": 4.322147651006711, + "grad_norm": 0.6190198659896851, + "learning_rate": 9.366139153440207e-07, + "loss": 0.5295, + "step": 3864 + }, + { + "epoch": 4.323266219239374, + "grad_norm": 0.5971733927726746, + "learning_rate": 9.354499114166516e-07, + "loss": 0.5098, + "step": 3865 + }, + { + "epoch": 4.324384787472036, + "grad_norm": 0.5813844799995422, + "learning_rate": 9.342864647487334e-07, + "loss": 0.4878, + "step": 3866 + }, + { + "epoch": 4.325503355704698, + "grad_norm": 0.6124801635742188, + "learning_rate": 9.331235757546614e-07, + "loss": 0.5235, + "step": 3867 + }, + { + "epoch": 4.32662192393736, + "grad_norm": 0.6303718686103821, + "learning_rate": 9.319612448486315e-07, + "loss": 0.5383, + "step": 3868 + }, + { + "epoch": 4.327740492170022, + "grad_norm": 0.6017159223556519, + "learning_rate": 9.307994724446409e-07, + "loss": 0.5123, + "step": 3869 + }, + { + "epoch": 4.328859060402684, + "grad_norm": 0.6197109222412109, + "learning_rate": 9.296382589564892e-07, + "loss": 0.4879, + "step": 3870 + }, + { + "epoch": 4.329977628635347, + "grad_norm": 0.5964914560317993, + "learning_rate": 9.284776047977751e-07, + "loss": 0.4893, + "step": 3871 + }, + { + "epoch": 4.331096196868009, + "grad_norm": 0.602350652217865, + "learning_rate": 9.273175103818998e-07, + "loss": 0.5322, + "step": 3872 + }, + { + "epoch": 4.332214765100671, + "grad_norm": 0.6152085661888123, + "learning_rate": 9.261579761220641e-07, + "loss": 0.5375, + "step": 3873 + }, + { + "epoch": 4.333333333333333, + "grad_norm": 0.6156829595565796, + "learning_rate": 9.249990024312689e-07, + "loss": 0.5211, + "step": 3874 + }, + { + "epoch": 4.334451901565996, + "grad_norm": 0.6165547370910645, + "learning_rate": 9.238405897223152e-07, + "loss": 0.5208, + "step": 3875 + }, + { + "epoch": 4.3355704697986575, + "grad_norm": 0.607903003692627, + "learning_rate": 9.226827384078069e-07, + "loss": 0.5389, + "step": 3876 + }, + { + "epoch": 4.33668903803132, + "grad_norm": 0.611706018447876, + "learning_rate": 9.215254489001441e-07, + "loss": 0.5435, + "step": 3877 + }, + { + "epoch": 4.337807606263982, + "grad_norm": 0.6131669878959656, + "learning_rate": 9.203687216115303e-07, + "loss": 0.541, + "step": 3878 + }, + { + "epoch": 4.3389261744966445, + "grad_norm": 0.5940380692481995, + "learning_rate": 9.192125569539667e-07, + "loss": 0.5237, + "step": 3879 + }, + { + "epoch": 4.340044742729306, + "grad_norm": 0.6364268064498901, + "learning_rate": 9.180569553392535e-07, + "loss": 0.5188, + "step": 3880 + }, + { + "epoch": 4.341163310961969, + "grad_norm": 0.5978094935417175, + "learning_rate": 9.16901917178993e-07, + "loss": 0.5239, + "step": 3881 + }, + { + "epoch": 4.342281879194631, + "grad_norm": 0.6130208969116211, + "learning_rate": 9.157474428845845e-07, + "loss": 0.498, + "step": 3882 + }, + { + "epoch": 4.343400447427293, + "grad_norm": 0.6137877106666565, + "learning_rate": 9.14593532867227e-07, + "loss": 0.5091, + "step": 3883 + }, + { + "epoch": 4.344519015659955, + "grad_norm": 0.6067939400672913, + "learning_rate": 9.134401875379179e-07, + "loss": 0.5368, + "step": 3884 + }, + { + "epoch": 4.345637583892618, + "grad_norm": 0.6217033267021179, + "learning_rate": 9.122874073074561e-07, + "loss": 0.5529, + "step": 3885 + }, + { + "epoch": 4.3467561521252795, + "grad_norm": 0.6201391220092773, + "learning_rate": 9.111351925864357e-07, + "loss": 0.5367, + "step": 3886 + }, + { + "epoch": 4.347874720357942, + "grad_norm": 0.5970144271850586, + "learning_rate": 9.099835437852525e-07, + "loss": 0.5008, + "step": 3887 + }, + { + "epoch": 4.348993288590604, + "grad_norm": 0.6054550409317017, + "learning_rate": 9.088324613140978e-07, + "loss": 0.5087, + "step": 3888 + }, + { + "epoch": 4.350111856823267, + "grad_norm": 0.5941537618637085, + "learning_rate": 9.076819455829644e-07, + "loss": 0.5195, + "step": 3889 + }, + { + "epoch": 4.351230425055928, + "grad_norm": 0.5920141935348511, + "learning_rate": 9.065319970016406e-07, + "loss": 0.5161, + "step": 3890 + }, + { + "epoch": 4.35234899328859, + "grad_norm": 0.5996435284614563, + "learning_rate": 9.053826159797139e-07, + "loss": 0.52, + "step": 3891 + }, + { + "epoch": 4.353467561521253, + "grad_norm": 0.6157770752906799, + "learning_rate": 9.042338029265688e-07, + "loss": 0.5144, + "step": 3892 + }, + { + "epoch": 4.3545861297539155, + "grad_norm": 0.604799747467041, + "learning_rate": 9.030855582513875e-07, + "loss": 0.4977, + "step": 3893 + }, + { + "epoch": 4.355704697986577, + "grad_norm": 0.603688657283783, + "learning_rate": 9.019378823631522e-07, + "loss": 0.524, + "step": 3894 + }, + { + "epoch": 4.356823266219239, + "grad_norm": 0.6087855100631714, + "learning_rate": 9.007907756706389e-07, + "loss": 0.5227, + "step": 3895 + }, + { + "epoch": 4.357941834451902, + "grad_norm": 0.6192216873168945, + "learning_rate": 8.996442385824239e-07, + "loss": 0.5051, + "step": 3896 + }, + { + "epoch": 4.359060402684563, + "grad_norm": 0.6199573874473572, + "learning_rate": 8.98498271506878e-07, + "loss": 0.5154, + "step": 3897 + }, + { + "epoch": 4.360178970917226, + "grad_norm": 0.6346080899238586, + "learning_rate": 8.973528748521729e-07, + "loss": 0.534, + "step": 3898 + }, + { + "epoch": 4.361297539149888, + "grad_norm": 0.613844096660614, + "learning_rate": 8.962080490262706e-07, + "loss": 0.516, + "step": 3899 + }, + { + "epoch": 4.3624161073825505, + "grad_norm": 0.5869024395942688, + "learning_rate": 8.950637944369369e-07, + "loss": 0.4974, + "step": 3900 + }, + { + "epoch": 4.363534675615212, + "grad_norm": 0.6002348065376282, + "learning_rate": 8.939201114917296e-07, + "loss": 0.5151, + "step": 3901 + }, + { + "epoch": 4.364653243847875, + "grad_norm": 0.6245166063308716, + "learning_rate": 8.927770005980035e-07, + "loss": 0.5467, + "step": 3902 + }, + { + "epoch": 4.365771812080537, + "grad_norm": 0.6147286891937256, + "learning_rate": 8.916344621629125e-07, + "loss": 0.5253, + "step": 3903 + }, + { + "epoch": 4.366890380313199, + "grad_norm": 0.6307237148284912, + "learning_rate": 8.904924965934019e-07, + "loss": 0.5387, + "step": 3904 + }, + { + "epoch": 4.368008948545861, + "grad_norm": 0.6227885484695435, + "learning_rate": 8.893511042962183e-07, + "loss": 0.53, + "step": 3905 + }, + { + "epoch": 4.369127516778524, + "grad_norm": 0.6208213567733765, + "learning_rate": 8.882102856778996e-07, + "loss": 0.524, + "step": 3906 + }, + { + "epoch": 4.3702460850111855, + "grad_norm": 0.6289359331130981, + "learning_rate": 8.870700411447817e-07, + "loss": 0.5535, + "step": 3907 + }, + { + "epoch": 4.371364653243848, + "grad_norm": 0.601539134979248, + "learning_rate": 8.85930371102994e-07, + "loss": 0.5178, + "step": 3908 + }, + { + "epoch": 4.37248322147651, + "grad_norm": 0.6087766289710999, + "learning_rate": 8.847912759584651e-07, + "loss": 0.5453, + "step": 3909 + }, + { + "epoch": 4.373601789709173, + "grad_norm": 0.6072549819946289, + "learning_rate": 8.836527561169142e-07, + "loss": 0.5183, + "step": 3910 + }, + { + "epoch": 4.374720357941834, + "grad_norm": 0.6419822573661804, + "learning_rate": 8.825148119838601e-07, + "loss": 0.5458, + "step": 3911 + }, + { + "epoch": 4.375838926174497, + "grad_norm": 0.604444146156311, + "learning_rate": 8.813774439646128e-07, + "loss": 0.5139, + "step": 3912 + }, + { + "epoch": 4.376957494407159, + "grad_norm": 0.6098402142524719, + "learning_rate": 8.802406524642782e-07, + "loss": 0.5176, + "step": 3913 + }, + { + "epoch": 4.378076062639821, + "grad_norm": 0.6146360635757446, + "learning_rate": 8.791044378877589e-07, + "loss": 0.5306, + "step": 3914 + }, + { + "epoch": 4.379194630872483, + "grad_norm": 0.5843731760978699, + "learning_rate": 8.779688006397491e-07, + "loss": 0.4852, + "step": 3915 + }, + { + "epoch": 4.380313199105146, + "grad_norm": 0.6277266144752502, + "learning_rate": 8.768337411247391e-07, + "loss": 0.5156, + "step": 3916 + }, + { + "epoch": 4.381431767337808, + "grad_norm": 0.6229064464569092, + "learning_rate": 8.756992597470121e-07, + "loss": 0.5238, + "step": 3917 + }, + { + "epoch": 4.382550335570469, + "grad_norm": 0.6058744192123413, + "learning_rate": 8.745653569106474e-07, + "loss": 0.5257, + "step": 3918 + }, + { + "epoch": 4.383668903803132, + "grad_norm": 0.6255224347114563, + "learning_rate": 8.734320330195159e-07, + "loss": 0.5045, + "step": 3919 + }, + { + "epoch": 4.384787472035794, + "grad_norm": 0.6120726466178894, + "learning_rate": 8.722992884772849e-07, + "loss": 0.5461, + "step": 3920 + }, + { + "epoch": 4.385906040268456, + "grad_norm": 0.6208629608154297, + "learning_rate": 8.711671236874134e-07, + "loss": 0.5547, + "step": 3921 + }, + { + "epoch": 4.387024608501118, + "grad_norm": 0.6123802661895752, + "learning_rate": 8.700355390531536e-07, + "loss": 0.5105, + "step": 3922 + }, + { + "epoch": 4.388143176733781, + "grad_norm": 0.5829566121101379, + "learning_rate": 8.689045349775527e-07, + "loss": 0.5129, + "step": 3923 + }, + { + "epoch": 4.389261744966443, + "grad_norm": 0.6066691875457764, + "learning_rate": 8.677741118634492e-07, + "loss": 0.5224, + "step": 3924 + }, + { + "epoch": 4.390380313199105, + "grad_norm": 0.5989576578140259, + "learning_rate": 8.666442701134773e-07, + "loss": 0.509, + "step": 3925 + }, + { + "epoch": 4.391498881431767, + "grad_norm": 0.6142332553863525, + "learning_rate": 8.655150101300611e-07, + "loss": 0.5169, + "step": 3926 + }, + { + "epoch": 4.39261744966443, + "grad_norm": 0.6166988611221313, + "learning_rate": 8.643863323154203e-07, + "loss": 0.5313, + "step": 3927 + }, + { + "epoch": 4.3937360178970915, + "grad_norm": 0.6153817176818848, + "learning_rate": 8.632582370715647e-07, + "loss": 0.5487, + "step": 3928 + }, + { + "epoch": 4.394854586129754, + "grad_norm": 0.6128860116004944, + "learning_rate": 8.621307248002994e-07, + "loss": 0.514, + "step": 3929 + }, + { + "epoch": 4.395973154362416, + "grad_norm": 0.6158210039138794, + "learning_rate": 8.610037959032192e-07, + "loss": 0.5408, + "step": 3930 + }, + { + "epoch": 4.3970917225950785, + "grad_norm": 0.6069280505180359, + "learning_rate": 8.59877450781712e-07, + "loss": 0.53, + "step": 3931 + }, + { + "epoch": 4.39821029082774, + "grad_norm": 0.6040332913398743, + "learning_rate": 8.587516898369588e-07, + "loss": 0.5066, + "step": 3932 + }, + { + "epoch": 4.399328859060403, + "grad_norm": 0.5886071920394897, + "learning_rate": 8.576265134699302e-07, + "loss": 0.5204, + "step": 3933 + }, + { + "epoch": 4.400447427293065, + "grad_norm": 0.6206260919570923, + "learning_rate": 8.565019220813917e-07, + "loss": 0.5403, + "step": 3934 + }, + { + "epoch": 4.401565995525727, + "grad_norm": 0.6197776794433594, + "learning_rate": 8.553779160718976e-07, + "loss": 0.5066, + "step": 3935 + }, + { + "epoch": 4.402684563758389, + "grad_norm": 0.6027390360832214, + "learning_rate": 8.542544958417962e-07, + "loss": 0.5461, + "step": 3936 + }, + { + "epoch": 4.403803131991052, + "grad_norm": 0.6143651008605957, + "learning_rate": 8.531316617912244e-07, + "loss": 0.5167, + "step": 3937 + }, + { + "epoch": 4.4049217002237135, + "grad_norm": 0.6299238204956055, + "learning_rate": 8.520094143201136e-07, + "loss": 0.5393, + "step": 3938 + }, + { + "epoch": 4.406040268456376, + "grad_norm": 0.5989980697631836, + "learning_rate": 8.508877538281834e-07, + "loss": 0.5032, + "step": 3939 + }, + { + "epoch": 4.407158836689038, + "grad_norm": 0.6179407835006714, + "learning_rate": 8.497666807149454e-07, + "loss": 0.5265, + "step": 3940 + }, + { + "epoch": 4.408277404921701, + "grad_norm": 0.6034483909606934, + "learning_rate": 8.486461953797012e-07, + "loss": 0.5154, + "step": 3941 + }, + { + "epoch": 4.409395973154362, + "grad_norm": 0.5913670659065247, + "learning_rate": 8.475262982215454e-07, + "loss": 0.5098, + "step": 3942 + }, + { + "epoch": 4.410514541387025, + "grad_norm": 0.6124094128608704, + "learning_rate": 8.464069896393612e-07, + "loss": 0.5298, + "step": 3943 + }, + { + "epoch": 4.411633109619687, + "grad_norm": 0.6141130924224854, + "learning_rate": 8.45288270031821e-07, + "loss": 0.5472, + "step": 3944 + }, + { + "epoch": 4.412751677852349, + "grad_norm": 0.6199144721031189, + "learning_rate": 8.441701397973909e-07, + "loss": 0.5433, + "step": 3945 + }, + { + "epoch": 4.413870246085011, + "grad_norm": 0.597920298576355, + "learning_rate": 8.430525993343236e-07, + "loss": 0.513, + "step": 3946 + }, + { + "epoch": 4.414988814317673, + "grad_norm": 0.5946189761161804, + "learning_rate": 8.419356490406652e-07, + "loss": 0.4946, + "step": 3947 + }, + { + "epoch": 4.416107382550336, + "grad_norm": 0.6173261404037476, + "learning_rate": 8.408192893142469e-07, + "loss": 0.5095, + "step": 3948 + }, + { + "epoch": 4.417225950782997, + "grad_norm": 0.6021565794944763, + "learning_rate": 8.397035205526947e-07, + "loss": 0.5194, + "step": 3949 + }, + { + "epoch": 4.41834451901566, + "grad_norm": 0.6614506244659424, + "learning_rate": 8.385883431534194e-07, + "loss": 0.5341, + "step": 3950 + }, + { + "epoch": 4.419463087248322, + "grad_norm": 0.6061006784439087, + "learning_rate": 8.374737575136261e-07, + "loss": 0.5129, + "step": 3951 + }, + { + "epoch": 4.4205816554809845, + "grad_norm": 0.6158464550971985, + "learning_rate": 8.36359764030304e-07, + "loss": 0.5244, + "step": 3952 + }, + { + "epoch": 4.421700223713646, + "grad_norm": 0.6081213355064392, + "learning_rate": 8.352463631002359e-07, + "loss": 0.5129, + "step": 3953 + }, + { + "epoch": 4.422818791946309, + "grad_norm": 0.601655900478363, + "learning_rate": 8.341335551199903e-07, + "loss": 0.5242, + "step": 3954 + }, + { + "epoch": 4.423937360178971, + "grad_norm": 0.609257698059082, + "learning_rate": 8.330213404859264e-07, + "loss": 0.5515, + "step": 3955 + }, + { + "epoch": 4.425055928411633, + "grad_norm": 0.6002370715141296, + "learning_rate": 8.319097195941911e-07, + "loss": 0.5189, + "step": 3956 + }, + { + "epoch": 4.426174496644295, + "grad_norm": 0.6211472749710083, + "learning_rate": 8.307986928407188e-07, + "loss": 0.5481, + "step": 3957 + }, + { + "epoch": 4.427293064876958, + "grad_norm": 0.6105425953865051, + "learning_rate": 8.296882606212361e-07, + "loss": 0.5369, + "step": 3958 + }, + { + "epoch": 4.4284116331096195, + "grad_norm": 0.6063711047172546, + "learning_rate": 8.28578423331253e-07, + "loss": 0.5344, + "step": 3959 + }, + { + "epoch": 4.429530201342282, + "grad_norm": 0.6273483037948608, + "learning_rate": 8.274691813660721e-07, + "loss": 0.52, + "step": 3960 + }, + { + "epoch": 4.430648769574944, + "grad_norm": 0.614701509475708, + "learning_rate": 8.263605351207796e-07, + "loss": 0.5182, + "step": 3961 + }, + { + "epoch": 4.431767337807607, + "grad_norm": 0.6111344695091248, + "learning_rate": 8.252524849902535e-07, + "loss": 0.5152, + "step": 3962 + }, + { + "epoch": 4.432885906040268, + "grad_norm": 0.6242409348487854, + "learning_rate": 8.241450313691573e-07, + "loss": 0.5575, + "step": 3963 + }, + { + "epoch": 4.434004474272931, + "grad_norm": 0.6164456009864807, + "learning_rate": 8.23038174651942e-07, + "loss": 0.5327, + "step": 3964 + }, + { + "epoch": 4.435123042505593, + "grad_norm": 0.6016994118690491, + "learning_rate": 8.219319152328461e-07, + "loss": 0.5426, + "step": 3965 + }, + { + "epoch": 4.436241610738255, + "grad_norm": 0.6227725744247437, + "learning_rate": 8.208262535058956e-07, + "loss": 0.55, + "step": 3966 + }, + { + "epoch": 4.437360178970917, + "grad_norm": 0.6207326054573059, + "learning_rate": 8.197211898649049e-07, + "loss": 0.5518, + "step": 3967 + }, + { + "epoch": 4.43847874720358, + "grad_norm": 0.6259524822235107, + "learning_rate": 8.186167247034726e-07, + "loss": 0.5366, + "step": 3968 + }, + { + "epoch": 4.439597315436242, + "grad_norm": 0.5974075198173523, + "learning_rate": 8.175128584149871e-07, + "loss": 0.5282, + "step": 3969 + }, + { + "epoch": 4.440715883668904, + "grad_norm": 0.6197651624679565, + "learning_rate": 8.164095913926209e-07, + "loss": 0.5221, + "step": 3970 + }, + { + "epoch": 4.441834451901566, + "grad_norm": 0.5975695252418518, + "learning_rate": 8.153069240293354e-07, + "loss": 0.5289, + "step": 3971 + }, + { + "epoch": 4.442953020134228, + "grad_norm": 0.6176166534423828, + "learning_rate": 8.142048567178767e-07, + "loss": 0.531, + "step": 3972 + }, + { + "epoch": 4.44407158836689, + "grad_norm": 0.5977227091789246, + "learning_rate": 8.131033898507779e-07, + "loss": 0.5121, + "step": 3973 + }, + { + "epoch": 4.445190156599552, + "grad_norm": 0.6163278222084045, + "learning_rate": 8.120025238203577e-07, + "loss": 0.5345, + "step": 3974 + }, + { + "epoch": 4.446308724832215, + "grad_norm": 0.6143608093261719, + "learning_rate": 8.109022590187205e-07, + "loss": 0.5253, + "step": 3975 + }, + { + "epoch": 4.447427293064877, + "grad_norm": 0.6125954389572144, + "learning_rate": 8.098025958377586e-07, + "loss": 0.5404, + "step": 3976 + }, + { + "epoch": 4.448545861297539, + "grad_norm": 0.6025015711784363, + "learning_rate": 8.087035346691474e-07, + "loss": 0.5156, + "step": 3977 + }, + { + "epoch": 4.449664429530201, + "grad_norm": 0.6312769651412964, + "learning_rate": 8.076050759043505e-07, + "loss": 0.5241, + "step": 3978 + }, + { + "epoch": 4.450782997762864, + "grad_norm": 0.6082600355148315, + "learning_rate": 8.065072199346149e-07, + "loss": 0.5277, + "step": 3979 + }, + { + "epoch": 4.4519015659955254, + "grad_norm": 0.6163889169692993, + "learning_rate": 8.054099671509732e-07, + "loss": 0.5426, + "step": 3980 + }, + { + "epoch": 4.453020134228188, + "grad_norm": 0.6325088143348694, + "learning_rate": 8.043133179442433e-07, + "loss": 0.5331, + "step": 3981 + }, + { + "epoch": 4.45413870246085, + "grad_norm": 0.6002582907676697, + "learning_rate": 8.032172727050291e-07, + "loss": 0.5119, + "step": 3982 + }, + { + "epoch": 4.4552572706935125, + "grad_norm": 0.6001783609390259, + "learning_rate": 8.021218318237175e-07, + "loss": 0.5371, + "step": 3983 + }, + { + "epoch": 4.456375838926174, + "grad_norm": 0.6225818395614624, + "learning_rate": 8.010269956904829e-07, + "loss": 0.496, + "step": 3984 + }, + { + "epoch": 4.457494407158837, + "grad_norm": 0.6112980246543884, + "learning_rate": 7.999327646952817e-07, + "loss": 0.5165, + "step": 3985 + }, + { + "epoch": 4.458612975391499, + "grad_norm": 0.6192248463630676, + "learning_rate": 7.988391392278547e-07, + "loss": 0.557, + "step": 3986 + }, + { + "epoch": 4.459731543624161, + "grad_norm": 0.6079258322715759, + "learning_rate": 7.977461196777297e-07, + "loss": 0.5222, + "step": 3987 + }, + { + "epoch": 4.460850111856823, + "grad_norm": 0.6217797994613647, + "learning_rate": 7.966537064342166e-07, + "loss": 0.5253, + "step": 3988 + }, + { + "epoch": 4.461968680089486, + "grad_norm": 0.6180599331855774, + "learning_rate": 7.955618998864092e-07, + "loss": 0.5101, + "step": 3989 + }, + { + "epoch": 4.4630872483221475, + "grad_norm": 0.6233153343200684, + "learning_rate": 7.944707004231855e-07, + "loss": 0.5293, + "step": 3990 + }, + { + "epoch": 4.46420581655481, + "grad_norm": 0.6290706992149353, + "learning_rate": 7.933801084332085e-07, + "loss": 0.5266, + "step": 3991 + }, + { + "epoch": 4.465324384787472, + "grad_norm": 0.6180533170700073, + "learning_rate": 7.922901243049231e-07, + "loss": 0.5089, + "step": 3992 + }, + { + "epoch": 4.466442953020135, + "grad_norm": 0.6237832903862, + "learning_rate": 7.912007484265591e-07, + "loss": 0.5374, + "step": 3993 + }, + { + "epoch": 4.467561521252796, + "grad_norm": 0.6008462309837341, + "learning_rate": 7.90111981186128e-07, + "loss": 0.5321, + "step": 3994 + }, + { + "epoch": 4.468680089485459, + "grad_norm": 0.6409690380096436, + "learning_rate": 7.890238229714269e-07, + "loss": 0.5464, + "step": 3995 + }, + { + "epoch": 4.469798657718121, + "grad_norm": 0.6167017817497253, + "learning_rate": 7.87936274170035e-07, + "loss": 0.5411, + "step": 3996 + }, + { + "epoch": 4.4709172259507834, + "grad_norm": 0.6248334646224976, + "learning_rate": 7.86849335169311e-07, + "loss": 0.515, + "step": 3997 + }, + { + "epoch": 4.472035794183445, + "grad_norm": 0.633633017539978, + "learning_rate": 7.857630063564026e-07, + "loss": 0.5406, + "step": 3998 + }, + { + "epoch": 4.473154362416107, + "grad_norm": 0.6115291118621826, + "learning_rate": 7.846772881182346e-07, + "loss": 0.5573, + "step": 3999 + }, + { + "epoch": 4.47427293064877, + "grad_norm": 0.6110193729400635, + "learning_rate": 7.83592180841519e-07, + "loss": 0.5266, + "step": 4000 + }, + { + "epoch": 4.475391498881431, + "grad_norm": 0.6215356588363647, + "learning_rate": 7.825076849127458e-07, + "loss": 0.516, + "step": 4001 + }, + { + "epoch": 4.476510067114094, + "grad_norm": 0.603797972202301, + "learning_rate": 7.814238007181913e-07, + "loss": 0.5112, + "step": 4002 + }, + { + "epoch": 4.477628635346756, + "grad_norm": 0.6180343627929688, + "learning_rate": 7.803405286439112e-07, + "loss": 0.5488, + "step": 4003 + }, + { + "epoch": 4.4787472035794185, + "grad_norm": 0.6226353645324707, + "learning_rate": 7.792578690757438e-07, + "loss": 0.5283, + "step": 4004 + }, + { + "epoch": 4.47986577181208, + "grad_norm": 0.6122317314147949, + "learning_rate": 7.781758223993086e-07, + "loss": 0.5103, + "step": 4005 + }, + { + "epoch": 4.480984340044743, + "grad_norm": 0.6442136764526367, + "learning_rate": 7.770943890000093e-07, + "loss": 0.5181, + "step": 4006 + }, + { + "epoch": 4.482102908277405, + "grad_norm": 0.6178842186927795, + "learning_rate": 7.760135692630289e-07, + "loss": 0.5228, + "step": 4007 + }, + { + "epoch": 4.483221476510067, + "grad_norm": 0.6348392963409424, + "learning_rate": 7.749333635733311e-07, + "loss": 0.5371, + "step": 4008 + }, + { + "epoch": 4.484340044742729, + "grad_norm": 0.6217544078826904, + "learning_rate": 7.738537723156641e-07, + "loss": 0.5312, + "step": 4009 + }, + { + "epoch": 4.485458612975392, + "grad_norm": 0.6248354911804199, + "learning_rate": 7.727747958745536e-07, + "loss": 0.5334, + "step": 4010 + }, + { + "epoch": 4.4865771812080535, + "grad_norm": 0.623227059841156, + "learning_rate": 7.716964346343098e-07, + "loss": 0.5599, + "step": 4011 + }, + { + "epoch": 4.487695749440716, + "grad_norm": 0.6066583395004272, + "learning_rate": 7.70618688979021e-07, + "loss": 0.504, + "step": 4012 + }, + { + "epoch": 4.488814317673378, + "grad_norm": 0.6039069294929504, + "learning_rate": 7.695415592925573e-07, + "loss": 0.501, + "step": 4013 + }, + { + "epoch": 4.489932885906041, + "grad_norm": 0.6048812866210938, + "learning_rate": 7.68465045958569e-07, + "loss": 0.5269, + "step": 4014 + }, + { + "epoch": 4.491051454138702, + "grad_norm": 0.6016896367073059, + "learning_rate": 7.673891493604882e-07, + "loss": 0.4842, + "step": 4015 + }, + { + "epoch": 4.492170022371365, + "grad_norm": 0.633258581161499, + "learning_rate": 7.663138698815254e-07, + "loss": 0.5477, + "step": 4016 + }, + { + "epoch": 4.493288590604027, + "grad_norm": 0.6193276047706604, + "learning_rate": 7.652392079046734e-07, + "loss": 0.5279, + "step": 4017 + }, + { + "epoch": 4.494407158836689, + "grad_norm": 0.6203513145446777, + "learning_rate": 7.641651638127032e-07, + "loss": 0.5377, + "step": 4018 + }, + { + "epoch": 4.495525727069351, + "grad_norm": 0.617175281047821, + "learning_rate": 7.630917379881658e-07, + "loss": 0.5344, + "step": 4019 + }, + { + "epoch": 4.496644295302014, + "grad_norm": 0.617176353931427, + "learning_rate": 7.620189308133943e-07, + "loss": 0.5391, + "step": 4020 + }, + { + "epoch": 4.497762863534676, + "grad_norm": 0.6160593032836914, + "learning_rate": 7.609467426704989e-07, + "loss": 0.5258, + "step": 4021 + }, + { + "epoch": 4.498881431767337, + "grad_norm": 0.6160233616828918, + "learning_rate": 7.598751739413701e-07, + "loss": 0.506, + "step": 4022 + }, + { + "epoch": 4.5, + "grad_norm": 0.6293731331825256, + "learning_rate": 7.588042250076771e-07, + "loss": 0.5462, + "step": 4023 + }, + { + "epoch": 4.501118568232663, + "grad_norm": 0.6198338866233826, + "learning_rate": 7.577338962508709e-07, + "loss": 0.5104, + "step": 4024 + }, + { + "epoch": 4.502237136465324, + "grad_norm": 0.6300543546676636, + "learning_rate": 7.56664188052178e-07, + "loss": 0.532, + "step": 4025 + }, + { + "epoch": 4.503355704697986, + "grad_norm": 0.6208925247192383, + "learning_rate": 7.555951007926074e-07, + "loss": 0.5207, + "step": 4026 + }, + { + "epoch": 4.504474272930649, + "grad_norm": 0.619932234287262, + "learning_rate": 7.545266348529445e-07, + "loss": 0.5384, + "step": 4027 + }, + { + "epoch": 4.505592841163311, + "grad_norm": 0.6355857253074646, + "learning_rate": 7.534587906137541e-07, + "loss": 0.5134, + "step": 4028 + }, + { + "epoch": 4.506711409395973, + "grad_norm": 0.629966139793396, + "learning_rate": 7.523915684553795e-07, + "loss": 0.5454, + "step": 4029 + }, + { + "epoch": 4.507829977628635, + "grad_norm": 0.6221733093261719, + "learning_rate": 7.513249687579419e-07, + "loss": 0.5209, + "step": 4030 + }, + { + "epoch": 4.508948545861298, + "grad_norm": 0.6132561564445496, + "learning_rate": 7.502589919013428e-07, + "loss": 0.5248, + "step": 4031 + }, + { + "epoch": 4.510067114093959, + "grad_norm": 0.6098362803459167, + "learning_rate": 7.49193638265259e-07, + "loss": 0.5101, + "step": 4032 + }, + { + "epoch": 4.511185682326622, + "grad_norm": 0.61825031042099, + "learning_rate": 7.481289082291485e-07, + "loss": 0.5266, + "step": 4033 + }, + { + "epoch": 4.512304250559284, + "grad_norm": 0.5990051031112671, + "learning_rate": 7.470648021722434e-07, + "loss": 0.4999, + "step": 4034 + }, + { + "epoch": 4.5134228187919465, + "grad_norm": 0.6216158270835876, + "learning_rate": 7.460013204735581e-07, + "loss": 0.5236, + "step": 4035 + }, + { + "epoch": 4.514541387024608, + "grad_norm": 0.6126163005828857, + "learning_rate": 7.449384635118806e-07, + "loss": 0.509, + "step": 4036 + }, + { + "epoch": 4.515659955257271, + "grad_norm": 0.6344913244247437, + "learning_rate": 7.438762316657782e-07, + "loss": 0.5592, + "step": 4037 + }, + { + "epoch": 4.516778523489933, + "grad_norm": 0.6145715713500977, + "learning_rate": 7.428146253135954e-07, + "loss": 0.5388, + "step": 4038 + }, + { + "epoch": 4.517897091722595, + "grad_norm": 0.5911350846290588, + "learning_rate": 7.417536448334528e-07, + "loss": 0.4784, + "step": 4039 + }, + { + "epoch": 4.519015659955257, + "grad_norm": 0.6221259832382202, + "learning_rate": 7.406932906032505e-07, + "loss": 0.5201, + "step": 4040 + }, + { + "epoch": 4.52013422818792, + "grad_norm": 0.6196680068969727, + "learning_rate": 7.396335630006629e-07, + "loss": 0.5356, + "step": 4041 + }, + { + "epoch": 4.5212527964205815, + "grad_norm": 0.6233739256858826, + "learning_rate": 7.385744624031441e-07, + "loss": 0.5598, + "step": 4042 + }, + { + "epoch": 4.522371364653244, + "grad_norm": 0.6183041334152222, + "learning_rate": 7.37515989187921e-07, + "loss": 0.5348, + "step": 4043 + }, + { + "epoch": 4.523489932885906, + "grad_norm": 0.6056849360466003, + "learning_rate": 7.364581437320015e-07, + "loss": 0.5173, + "step": 4044 + }, + { + "epoch": 4.524608501118569, + "grad_norm": 0.6273506283760071, + "learning_rate": 7.354009264121664e-07, + "loss": 0.5429, + "step": 4045 + }, + { + "epoch": 4.52572706935123, + "grad_norm": 0.6245075464248657, + "learning_rate": 7.343443376049741e-07, + "loss": 0.5444, + "step": 4046 + }, + { + "epoch": 4.526845637583893, + "grad_norm": 0.6066340804100037, + "learning_rate": 7.332883776867586e-07, + "loss": 0.5182, + "step": 4047 + }, + { + "epoch": 4.527964205816555, + "grad_norm": 0.6391448378562927, + "learning_rate": 7.322330470336314e-07, + "loss": 0.5147, + "step": 4048 + }, + { + "epoch": 4.5290827740492166, + "grad_norm": 0.6028881072998047, + "learning_rate": 7.311783460214783e-07, + "loss": 0.5218, + "step": 4049 + }, + { + "epoch": 4.530201342281879, + "grad_norm": 0.6323413252830505, + "learning_rate": 7.301242750259607e-07, + "loss": 0.5365, + "step": 4050 + }, + { + "epoch": 4.531319910514542, + "grad_norm": 0.6071269512176514, + "learning_rate": 7.290708344225175e-07, + "loss": 0.5148, + "step": 4051 + }, + { + "epoch": 4.532438478747204, + "grad_norm": 0.622318685054779, + "learning_rate": 7.280180245863616e-07, + "loss": 0.5123, + "step": 4052 + }, + { + "epoch": 4.533557046979865, + "grad_norm": 0.6270676851272583, + "learning_rate": 7.269658458924808e-07, + "loss": 0.5518, + "step": 4053 + }, + { + "epoch": 4.534675615212528, + "grad_norm": 0.6380423903465271, + "learning_rate": 7.259142987156384e-07, + "loss": 0.5435, + "step": 4054 + }, + { + "epoch": 4.53579418344519, + "grad_norm": 0.6066109538078308, + "learning_rate": 7.248633834303745e-07, + "loss": 0.5279, + "step": 4055 + }, + { + "epoch": 4.5369127516778525, + "grad_norm": 0.5954588651657104, + "learning_rate": 7.238131004110013e-07, + "loss": 0.5186, + "step": 4056 + }, + { + "epoch": 4.538031319910514, + "grad_norm": 0.6072389483451843, + "learning_rate": 7.227634500316089e-07, + "loss": 0.5191, + "step": 4057 + }, + { + "epoch": 4.539149888143177, + "grad_norm": 0.6137959361076355, + "learning_rate": 7.217144326660586e-07, + "loss": 0.523, + "step": 4058 + }, + { + "epoch": 4.540268456375839, + "grad_norm": 0.6308537721633911, + "learning_rate": 7.206660486879899e-07, + "loss": 0.5548, + "step": 4059 + }, + { + "epoch": 4.541387024608501, + "grad_norm": 0.6137513518333435, + "learning_rate": 7.196182984708139e-07, + "loss": 0.5265, + "step": 4060 + }, + { + "epoch": 4.542505592841163, + "grad_norm": 0.632115364074707, + "learning_rate": 7.18571182387717e-07, + "loss": 0.5018, + "step": 4061 + }, + { + "epoch": 4.543624161073826, + "grad_norm": 0.6195847988128662, + "learning_rate": 7.175247008116598e-07, + "loss": 0.5185, + "step": 4062 + }, + { + "epoch": 4.5447427293064875, + "grad_norm": 0.6138813495635986, + "learning_rate": 7.164788541153758e-07, + "loss": 0.5172, + "step": 4063 + }, + { + "epoch": 4.54586129753915, + "grad_norm": 0.6051745414733887, + "learning_rate": 7.154336426713751e-07, + "loss": 0.513, + "step": 4064 + }, + { + "epoch": 4.546979865771812, + "grad_norm": 0.6324571967124939, + "learning_rate": 7.143890668519379e-07, + "loss": 0.537, + "step": 4065 + }, + { + "epoch": 4.5480984340044746, + "grad_norm": 0.5961597561836243, + "learning_rate": 7.133451270291217e-07, + "loss": 0.5081, + "step": 4066 + }, + { + "epoch": 4.549217002237136, + "grad_norm": 0.6019732356071472, + "learning_rate": 7.123018235747539e-07, + "loss": 0.5232, + "step": 4067 + }, + { + "epoch": 4.550335570469799, + "grad_norm": 0.6343137621879578, + "learning_rate": 7.112591568604388e-07, + "loss": 0.545, + "step": 4068 + }, + { + "epoch": 4.551454138702461, + "grad_norm": 0.6335507035255432, + "learning_rate": 7.102171272575511e-07, + "loss": 0.5347, + "step": 4069 + }, + { + "epoch": 4.552572706935123, + "grad_norm": 0.6270492672920227, + "learning_rate": 7.0917573513724e-07, + "loss": 0.5414, + "step": 4070 + }, + { + "epoch": 4.553691275167785, + "grad_norm": 0.6069012880325317, + "learning_rate": 7.081349808704266e-07, + "loss": 0.5211, + "step": 4071 + }, + { + "epoch": 4.554809843400448, + "grad_norm": 0.609781801700592, + "learning_rate": 7.070948648278047e-07, + "loss": 0.5463, + "step": 4072 + }, + { + "epoch": 4.55592841163311, + "grad_norm": 0.6316078901290894, + "learning_rate": 7.060553873798437e-07, + "loss": 0.5208, + "step": 4073 + }, + { + "epoch": 4.557046979865772, + "grad_norm": 0.6268311142921448, + "learning_rate": 7.050165488967811e-07, + "loss": 0.5303, + "step": 4074 + }, + { + "epoch": 4.558165548098434, + "grad_norm": 0.6293088793754578, + "learning_rate": 7.039783497486311e-07, + "loss": 0.5365, + "step": 4075 + }, + { + "epoch": 4.559284116331096, + "grad_norm": 0.614711582660675, + "learning_rate": 7.029407903051771e-07, + "loss": 0.5249, + "step": 4076 + }, + { + "epoch": 4.560402684563758, + "grad_norm": 0.6183292269706726, + "learning_rate": 7.019038709359757e-07, + "loss": 0.5162, + "step": 4077 + }, + { + "epoch": 4.561521252796421, + "grad_norm": 0.6308655738830566, + "learning_rate": 7.008675920103547e-07, + "loss": 0.517, + "step": 4078 + }, + { + "epoch": 4.562639821029083, + "grad_norm": 0.6013302803039551, + "learning_rate": 6.998319538974163e-07, + "loss": 0.5038, + "step": 4079 + }, + { + "epoch": 4.563758389261745, + "grad_norm": 0.6287400126457214, + "learning_rate": 6.987969569660321e-07, + "loss": 0.5027, + "step": 4080 + }, + { + "epoch": 4.564876957494407, + "grad_norm": 0.6116160154342651, + "learning_rate": 6.977626015848449e-07, + "loss": 0.5047, + "step": 4081 + }, + { + "epoch": 4.565995525727069, + "grad_norm": 0.6113354563713074, + "learning_rate": 6.96728888122272e-07, + "loss": 0.522, + "step": 4082 + }, + { + "epoch": 4.567114093959732, + "grad_norm": 0.6086088418960571, + "learning_rate": 6.95695816946498e-07, + "loss": 0.526, + "step": 4083 + }, + { + "epoch": 4.568232662192393, + "grad_norm": 0.6158650517463684, + "learning_rate": 6.94663388425483e-07, + "loss": 0.538, + "step": 4084 + }, + { + "epoch": 4.569351230425056, + "grad_norm": 0.6200857162475586, + "learning_rate": 6.936316029269552e-07, + "loss": 0.5155, + "step": 4085 + }, + { + "epoch": 4.570469798657718, + "grad_norm": 0.6068480014801025, + "learning_rate": 6.926004608184142e-07, + "loss": 0.4996, + "step": 4086 + }, + { + "epoch": 4.5715883668903805, + "grad_norm": 0.6287431120872498, + "learning_rate": 6.915699624671307e-07, + "loss": 0.5431, + "step": 4087 + }, + { + "epoch": 4.572706935123042, + "grad_norm": 0.6107509136199951, + "learning_rate": 6.905401082401475e-07, + "loss": 0.5215, + "step": 4088 + }, + { + "epoch": 4.573825503355705, + "grad_norm": 0.6162368655204773, + "learning_rate": 6.895108985042751e-07, + "loss": 0.5405, + "step": 4089 + }, + { + "epoch": 4.574944071588367, + "grad_norm": 0.5989571809768677, + "learning_rate": 6.884823336260982e-07, + "loss": 0.5084, + "step": 4090 + }, + { + "epoch": 4.576062639821029, + "grad_norm": 0.6174416542053223, + "learning_rate": 6.874544139719683e-07, + "loss": 0.5093, + "step": 4091 + }, + { + "epoch": 4.577181208053691, + "grad_norm": 0.6274213194847107, + "learning_rate": 6.86427139908008e-07, + "loss": 0.5537, + "step": 4092 + }, + { + "epoch": 4.578299776286354, + "grad_norm": 0.6254974007606506, + "learning_rate": 6.85400511800112e-07, + "loss": 0.5224, + "step": 4093 + }, + { + "epoch": 4.5794183445190155, + "grad_norm": 0.6082426309585571, + "learning_rate": 6.843745300139426e-07, + "loss": 0.5186, + "step": 4094 + }, + { + "epoch": 4.580536912751678, + "grad_norm": 0.6134772896766663, + "learning_rate": 6.833491949149329e-07, + "loss": 0.5104, + "step": 4095 + }, + { + "epoch": 4.58165548098434, + "grad_norm": 0.6299877166748047, + "learning_rate": 6.823245068682847e-07, + "loss": 0.5271, + "step": 4096 + }, + { + "epoch": 4.582774049217003, + "grad_norm": 0.612312376499176, + "learning_rate": 6.81300466238971e-07, + "loss": 0.5329, + "step": 4097 + }, + { + "epoch": 4.583892617449664, + "grad_norm": 0.5974720120429993, + "learning_rate": 6.802770733917321e-07, + "loss": 0.5188, + "step": 4098 + }, + { + "epoch": 4.585011185682326, + "grad_norm": 0.6146959066390991, + "learning_rate": 6.792543286910808e-07, + "loss": 0.5369, + "step": 4099 + }, + { + "epoch": 4.586129753914989, + "grad_norm": 0.6181334853172302, + "learning_rate": 6.782322325012947e-07, + "loss": 0.5278, + "step": 4100 + }, + { + "epoch": 4.587248322147651, + "grad_norm": 0.6105345487594604, + "learning_rate": 6.772107851864254e-07, + "loss": 0.5146, + "step": 4101 + }, + { + "epoch": 4.588366890380313, + "grad_norm": 0.6299400329589844, + "learning_rate": 6.761899871102884e-07, + "loss": 0.5268, + "step": 4102 + }, + { + "epoch": 4.589485458612975, + "grad_norm": 0.6388238668441772, + "learning_rate": 6.751698386364703e-07, + "loss": 0.4978, + "step": 4103 + }, + { + "epoch": 4.590604026845638, + "grad_norm": 0.6019954085350037, + "learning_rate": 6.741503401283273e-07, + "loss": 0.5278, + "step": 4104 + }, + { + "epoch": 4.5917225950783, + "grad_norm": 0.6142205595970154, + "learning_rate": 6.731314919489823e-07, + "loss": 0.5167, + "step": 4105 + }, + { + "epoch": 4.592841163310962, + "grad_norm": 0.6175616383552551, + "learning_rate": 6.721132944613282e-07, + "loss": 0.5297, + "step": 4106 + }, + { + "epoch": 4.593959731543624, + "grad_norm": 0.6380606293678284, + "learning_rate": 6.710957480280245e-07, + "loss": 0.5258, + "step": 4107 + }, + { + "epoch": 4.5950782997762865, + "grad_norm": 0.6351152658462524, + "learning_rate": 6.700788530115002e-07, + "loss": 0.5372, + "step": 4108 + }, + { + "epoch": 4.596196868008948, + "grad_norm": 0.6230444312095642, + "learning_rate": 6.690626097739517e-07, + "loss": 0.5552, + "step": 4109 + }, + { + "epoch": 4.597315436241611, + "grad_norm": 0.6365874409675598, + "learning_rate": 6.680470186773427e-07, + "loss": 0.5499, + "step": 4110 + }, + { + "epoch": 4.598434004474273, + "grad_norm": 0.6084960699081421, + "learning_rate": 6.670320800834048e-07, + "loss": 0.5164, + "step": 4111 + }, + { + "epoch": 4.599552572706935, + "grad_norm": 0.6003339290618896, + "learning_rate": 6.660177943536386e-07, + "loss": 0.5138, + "step": 4112 + }, + { + "epoch": 4.600671140939597, + "grad_norm": 0.6006595492362976, + "learning_rate": 6.650041618493109e-07, + "loss": 0.4897, + "step": 4113 + }, + { + "epoch": 4.60178970917226, + "grad_norm": 0.6177203059196472, + "learning_rate": 6.639911829314546e-07, + "loss": 0.513, + "step": 4114 + }, + { + "epoch": 4.6029082774049215, + "grad_norm": 0.6120663285255432, + "learning_rate": 6.629788579608734e-07, + "loss": 0.5285, + "step": 4115 + }, + { + "epoch": 4.604026845637584, + "grad_norm": 0.6239177584648132, + "learning_rate": 6.61967187298134e-07, + "loss": 0.5329, + "step": 4116 + }, + { + "epoch": 4.605145413870246, + "grad_norm": 0.6162551641464233, + "learning_rate": 6.609561713035734e-07, + "loss": 0.5372, + "step": 4117 + }, + { + "epoch": 4.6062639821029085, + "grad_norm": 0.6273134350776672, + "learning_rate": 6.599458103372936e-07, + "loss": 0.5593, + "step": 4118 + }, + { + "epoch": 4.60738255033557, + "grad_norm": 0.611113965511322, + "learning_rate": 6.589361047591633e-07, + "loss": 0.5502, + "step": 4119 + }, + { + "epoch": 4.608501118568233, + "grad_norm": 0.6044580936431885, + "learning_rate": 6.579270549288174e-07, + "loss": 0.5203, + "step": 4120 + }, + { + "epoch": 4.609619686800895, + "grad_norm": 0.5963696837425232, + "learning_rate": 6.5691866120566e-07, + "loss": 0.5111, + "step": 4121 + }, + { + "epoch": 4.610738255033557, + "grad_norm": 0.6235088109970093, + "learning_rate": 6.559109239488576e-07, + "loss": 0.5502, + "step": 4122 + }, + { + "epoch": 4.611856823266219, + "grad_norm": 0.6367150545120239, + "learning_rate": 6.549038435173461e-07, + "loss": 0.5409, + "step": 4123 + }, + { + "epoch": 4.612975391498882, + "grad_norm": 0.6196271181106567, + "learning_rate": 6.538974202698259e-07, + "loss": 0.5294, + "step": 4124 + }, + { + "epoch": 4.614093959731544, + "grad_norm": 0.6228039860725403, + "learning_rate": 6.528916545647629e-07, + "loss": 0.5199, + "step": 4125 + }, + { + "epoch": 4.615212527964205, + "grad_norm": 0.620073676109314, + "learning_rate": 6.518865467603899e-07, + "loss": 0.5433, + "step": 4126 + }, + { + "epoch": 4.616331096196868, + "grad_norm": 0.6261345148086548, + "learning_rate": 6.50882097214704e-07, + "loss": 0.5127, + "step": 4127 + }, + { + "epoch": 4.617449664429531, + "grad_norm": 0.6221050024032593, + "learning_rate": 6.498783062854702e-07, + "loss": 0.5516, + "step": 4128 + }, + { + "epoch": 4.618568232662192, + "grad_norm": 0.6273404955863953, + "learning_rate": 6.488751743302163e-07, + "loss": 0.5323, + "step": 4129 + }, + { + "epoch": 4.619686800894854, + "grad_norm": 0.6044708490371704, + "learning_rate": 6.478727017062375e-07, + "loss": 0.5251, + "step": 4130 + }, + { + "epoch": 4.620805369127517, + "grad_norm": 0.6189625263214111, + "learning_rate": 6.468708887705921e-07, + "loss": 0.518, + "step": 4131 + }, + { + "epoch": 4.621923937360179, + "grad_norm": 0.6228320002555847, + "learning_rate": 6.458697358801061e-07, + "loss": 0.5213, + "step": 4132 + }, + { + "epoch": 4.623042505592841, + "grad_norm": 0.5935919284820557, + "learning_rate": 6.448692433913681e-07, + "loss": 0.518, + "step": 4133 + }, + { + "epoch": 4.624161073825503, + "grad_norm": 0.6069315075874329, + "learning_rate": 6.43869411660732e-07, + "loss": 0.5005, + "step": 4134 + }, + { + "epoch": 4.625279642058166, + "grad_norm": 0.5987986326217651, + "learning_rate": 6.428702410443166e-07, + "loss": 0.5213, + "step": 4135 + }, + { + "epoch": 4.626398210290827, + "grad_norm": 0.6405456066131592, + "learning_rate": 6.418717318980044e-07, + "loss": 0.5474, + "step": 4136 + }, + { + "epoch": 4.62751677852349, + "grad_norm": 0.6168224811553955, + "learning_rate": 6.408738845774451e-07, + "loss": 0.5318, + "step": 4137 + }, + { + "epoch": 4.628635346756152, + "grad_norm": 0.6058682203292847, + "learning_rate": 6.39876699438049e-07, + "loss": 0.4937, + "step": 4138 + }, + { + "epoch": 4.6297539149888145, + "grad_norm": 0.6042566299438477, + "learning_rate": 6.388801768349933e-07, + "loss": 0.5019, + "step": 4139 + }, + { + "epoch": 4.630872483221476, + "grad_norm": 0.621567964553833, + "learning_rate": 6.378843171232172e-07, + "loss": 0.5394, + "step": 4140 + }, + { + "epoch": 4.631991051454139, + "grad_norm": 0.6122708320617676, + "learning_rate": 6.368891206574257e-07, + "loss": 0.51, + "step": 4141 + }, + { + "epoch": 4.633109619686801, + "grad_norm": 0.6268691420555115, + "learning_rate": 6.358945877920861e-07, + "loss": 0.5251, + "step": 4142 + }, + { + "epoch": 4.634228187919463, + "grad_norm": 0.6161065101623535, + "learning_rate": 6.349007188814302e-07, + "loss": 0.5184, + "step": 4143 + }, + { + "epoch": 4.635346756152125, + "grad_norm": 0.605271577835083, + "learning_rate": 6.339075142794524e-07, + "loss": 0.5228, + "step": 4144 + }, + { + "epoch": 4.636465324384788, + "grad_norm": 0.6306655406951904, + "learning_rate": 6.329149743399107e-07, + "loss": 0.523, + "step": 4145 + }, + { + "epoch": 4.6375838926174495, + "grad_norm": 0.6109450459480286, + "learning_rate": 6.319230994163278e-07, + "loss": 0.519, + "step": 4146 + }, + { + "epoch": 4.638702460850112, + "grad_norm": 0.61943119764328, + "learning_rate": 6.309318898619873e-07, + "loss": 0.5219, + "step": 4147 + }, + { + "epoch": 4.639821029082774, + "grad_norm": 0.6090417504310608, + "learning_rate": 6.299413460299386e-07, + "loss": 0.5171, + "step": 4148 + }, + { + "epoch": 4.640939597315437, + "grad_norm": 0.6266696453094482, + "learning_rate": 6.289514682729902e-07, + "loss": 0.5319, + "step": 4149 + }, + { + "epoch": 4.642058165548098, + "grad_norm": 0.630018949508667, + "learning_rate": 6.279622569437185e-07, + "loss": 0.5297, + "step": 4150 + }, + { + "epoch": 4.643176733780761, + "grad_norm": 0.6124632954597473, + "learning_rate": 6.269737123944559e-07, + "loss": 0.5199, + "step": 4151 + }, + { + "epoch": 4.644295302013423, + "grad_norm": 0.6222842335700989, + "learning_rate": 6.259858349773034e-07, + "loss": 0.5402, + "step": 4152 + }, + { + "epoch": 4.6454138702460845, + "grad_norm": 0.6133089065551758, + "learning_rate": 6.249986250441206e-07, + "loss": 0.5541, + "step": 4153 + }, + { + "epoch": 4.646532438478747, + "grad_norm": 0.6119590997695923, + "learning_rate": 6.240120829465319e-07, + "loss": 0.5131, + "step": 4154 + }, + { + "epoch": 4.64765100671141, + "grad_norm": 0.618614912033081, + "learning_rate": 6.230262090359221e-07, + "loss": 0.5109, + "step": 4155 + }, + { + "epoch": 4.648769574944072, + "grad_norm": 0.6242492198944092, + "learning_rate": 6.220410036634375e-07, + "loss": 0.541, + "step": 4156 + }, + { + "epoch": 4.649888143176733, + "grad_norm": 0.6418766975402832, + "learning_rate": 6.21056467179989e-07, + "loss": 0.5323, + "step": 4157 + }, + { + "epoch": 4.651006711409396, + "grad_norm": 0.6265420913696289, + "learning_rate": 6.200725999362467e-07, + "loss": 0.5278, + "step": 4158 + }, + { + "epoch": 4.652125279642058, + "grad_norm": 0.6186356544494629, + "learning_rate": 6.190894022826433e-07, + "loss": 0.5137, + "step": 4159 + }, + { + "epoch": 4.6532438478747205, + "grad_norm": 0.6414504647254944, + "learning_rate": 6.181068745693716e-07, + "loss": 0.5524, + "step": 4160 + }, + { + "epoch": 4.654362416107382, + "grad_norm": 0.6189000010490417, + "learning_rate": 6.171250171463894e-07, + "loss": 0.5408, + "step": 4161 + }, + { + "epoch": 4.655480984340045, + "grad_norm": 0.6097800135612488, + "learning_rate": 6.161438303634115e-07, + "loss": 0.5065, + "step": 4162 + }, + { + "epoch": 4.656599552572707, + "grad_norm": 0.5997540950775146, + "learning_rate": 6.151633145699174e-07, + "loss": 0.5286, + "step": 4163 + }, + { + "epoch": 4.657718120805369, + "grad_norm": 0.6324319839477539, + "learning_rate": 6.141834701151439e-07, + "loss": 0.5408, + "step": 4164 + }, + { + "epoch": 4.658836689038031, + "grad_norm": 0.6081840991973877, + "learning_rate": 6.132042973480931e-07, + "loss": 0.5293, + "step": 4165 + }, + { + "epoch": 4.659955257270694, + "grad_norm": 0.6301783323287964, + "learning_rate": 6.122257966175241e-07, + "loss": 0.5401, + "step": 4166 + }, + { + "epoch": 4.6610738255033555, + "grad_norm": 0.622042715549469, + "learning_rate": 6.112479682719583e-07, + "loss": 0.5276, + "step": 4167 + }, + { + "epoch": 4.662192393736018, + "grad_norm": 0.6172642111778259, + "learning_rate": 6.102708126596773e-07, + "loss": 0.5276, + "step": 4168 + }, + { + "epoch": 4.66331096196868, + "grad_norm": 0.6213328838348389, + "learning_rate": 6.092943301287224e-07, + "loss": 0.5127, + "step": 4169 + }, + { + "epoch": 4.6644295302013425, + "grad_norm": 0.6147501468658447, + "learning_rate": 6.083185210268977e-07, + "loss": 0.5395, + "step": 4170 + }, + { + "epoch": 4.665548098434004, + "grad_norm": 0.6227311491966248, + "learning_rate": 6.073433857017635e-07, + "loss": 0.525, + "step": 4171 + }, + { + "epoch": 4.666666666666667, + "grad_norm": 0.6157082915306091, + "learning_rate": 6.063689245006443e-07, + "loss": 0.5466, + "step": 4172 + }, + { + "epoch": 4.667785234899329, + "grad_norm": 0.6288533210754395, + "learning_rate": 6.053951377706207e-07, + "loss": 0.5198, + "step": 4173 + }, + { + "epoch": 4.668903803131991, + "grad_norm": 0.5910659432411194, + "learning_rate": 6.044220258585364e-07, + "loss": 0.5355, + "step": 4174 + }, + { + "epoch": 4.670022371364653, + "grad_norm": 0.6204217076301575, + "learning_rate": 6.034495891109924e-07, + "loss": 0.5058, + "step": 4175 + }, + { + "epoch": 4.671140939597316, + "grad_norm": 0.6177459359169006, + "learning_rate": 6.024778278743499e-07, + "loss": 0.5405, + "step": 4176 + }, + { + "epoch": 4.672259507829978, + "grad_norm": 0.6095530986785889, + "learning_rate": 6.015067424947296e-07, + "loss": 0.5213, + "step": 4177 + }, + { + "epoch": 4.67337807606264, + "grad_norm": 0.6181003451347351, + "learning_rate": 6.005363333180106e-07, + "loss": 0.535, + "step": 4178 + }, + { + "epoch": 4.674496644295302, + "grad_norm": 0.6089491248130798, + "learning_rate": 5.995666006898337e-07, + "loss": 0.5129, + "step": 4179 + }, + { + "epoch": 4.675615212527964, + "grad_norm": 0.6342470049858093, + "learning_rate": 5.985975449555955e-07, + "loss": 0.5397, + "step": 4180 + }, + { + "epoch": 4.676733780760626, + "grad_norm": 0.6063351631164551, + "learning_rate": 5.976291664604548e-07, + "loss": 0.5112, + "step": 4181 + }, + { + "epoch": 4.677852348993289, + "grad_norm": 0.6308937072753906, + "learning_rate": 5.966614655493258e-07, + "loss": 0.5269, + "step": 4182 + }, + { + "epoch": 4.678970917225951, + "grad_norm": 0.6245014667510986, + "learning_rate": 5.956944425668837e-07, + "loss": 0.5226, + "step": 4183 + }, + { + "epoch": 4.680089485458613, + "grad_norm": 0.6024780869483948, + "learning_rate": 5.947280978575609e-07, + "loss": 0.5236, + "step": 4184 + }, + { + "epoch": 4.681208053691275, + "grad_norm": 0.6306081414222717, + "learning_rate": 5.937624317655497e-07, + "loss": 0.5345, + "step": 4185 + }, + { + "epoch": 4.682326621923937, + "grad_norm": 0.6226223111152649, + "learning_rate": 5.927974446347992e-07, + "loss": 0.5456, + "step": 4186 + }, + { + "epoch": 4.6834451901566, + "grad_norm": 0.6082135438919067, + "learning_rate": 5.918331368090172e-07, + "loss": 0.5227, + "step": 4187 + }, + { + "epoch": 4.684563758389261, + "grad_norm": 0.6295832991600037, + "learning_rate": 5.908695086316701e-07, + "loss": 0.5187, + "step": 4188 + }, + { + "epoch": 4.685682326621924, + "grad_norm": 0.601658821105957, + "learning_rate": 5.899065604459814e-07, + "loss": 0.5061, + "step": 4189 + }, + { + "epoch": 4.686800894854586, + "grad_norm": 0.6016608476638794, + "learning_rate": 5.889442925949332e-07, + "loss": 0.4981, + "step": 4190 + }, + { + "epoch": 4.6879194630872485, + "grad_norm": 0.6155381202697754, + "learning_rate": 5.879827054212645e-07, + "loss": 0.5377, + "step": 4191 + }, + { + "epoch": 4.68903803131991, + "grad_norm": 0.6111737489700317, + "learning_rate": 5.870217992674723e-07, + "loss": 0.5159, + "step": 4192 + }, + { + "epoch": 4.690156599552573, + "grad_norm": 0.6232223510742188, + "learning_rate": 5.860615744758102e-07, + "loss": 0.5275, + "step": 4193 + }, + { + "epoch": 4.691275167785235, + "grad_norm": 0.6044846177101135, + "learning_rate": 5.851020313882913e-07, + "loss": 0.5068, + "step": 4194 + }, + { + "epoch": 4.692393736017897, + "grad_norm": 0.6424250602722168, + "learning_rate": 5.841431703466827e-07, + "loss": 0.5749, + "step": 4195 + }, + { + "epoch": 4.693512304250559, + "grad_norm": 0.6269358992576599, + "learning_rate": 5.83184991692512e-07, + "loss": 0.5133, + "step": 4196 + }, + { + "epoch": 4.694630872483222, + "grad_norm": 0.6282919645309448, + "learning_rate": 5.822274957670612e-07, + "loss": 0.5387, + "step": 4197 + }, + { + "epoch": 4.6957494407158835, + "grad_norm": 0.6088079810142517, + "learning_rate": 5.812706829113693e-07, + "loss": 0.5277, + "step": 4198 + }, + { + "epoch": 4.696868008948546, + "grad_norm": 0.6263190507888794, + "learning_rate": 5.803145534662349e-07, + "loss": 0.5224, + "step": 4199 + }, + { + "epoch": 4.697986577181208, + "grad_norm": 0.6024121046066284, + "learning_rate": 5.79359107772208e-07, + "loss": 0.4909, + "step": 4200 + }, + { + "epoch": 4.699105145413871, + "grad_norm": 0.6233183741569519, + "learning_rate": 5.784043461696004e-07, + "loss": 0.5376, + "step": 4201 + }, + { + "epoch": 4.700223713646532, + "grad_norm": 0.6314034461975098, + "learning_rate": 5.77450268998476e-07, + "loss": 0.5522, + "step": 4202 + }, + { + "epoch": 4.701342281879195, + "grad_norm": 0.6141626834869385, + "learning_rate": 5.764968765986586e-07, + "loss": 0.5139, + "step": 4203 + }, + { + "epoch": 4.702460850111857, + "grad_norm": 0.6016871333122253, + "learning_rate": 5.755441693097247e-07, + "loss": 0.5101, + "step": 4204 + }, + { + "epoch": 4.703579418344519, + "grad_norm": 0.6220463514328003, + "learning_rate": 5.745921474710098e-07, + "loss": 0.4984, + "step": 4205 + }, + { + "epoch": 4.704697986577181, + "grad_norm": 0.623416543006897, + "learning_rate": 5.736408114216033e-07, + "loss": 0.5329, + "step": 4206 + }, + { + "epoch": 4.705816554809843, + "grad_norm": 0.6152774095535278, + "learning_rate": 5.726901615003505e-07, + "loss": 0.5475, + "step": 4207 + }, + { + "epoch": 4.706935123042506, + "grad_norm": 0.6148276329040527, + "learning_rate": 5.717401980458528e-07, + "loss": 0.5048, + "step": 4208 + }, + { + "epoch": 4.708053691275168, + "grad_norm": 0.6332700252532959, + "learning_rate": 5.707909213964666e-07, + "loss": 0.5486, + "step": 4209 + }, + { + "epoch": 4.70917225950783, + "grad_norm": 0.6266604065895081, + "learning_rate": 5.698423318903052e-07, + "loss": 0.5234, + "step": 4210 + }, + { + "epoch": 4.710290827740492, + "grad_norm": 0.6005502343177795, + "learning_rate": 5.688944298652344e-07, + "loss": 0.5329, + "step": 4211 + }, + { + "epoch": 4.7114093959731544, + "grad_norm": 0.6332852244377136, + "learning_rate": 5.679472156588783e-07, + "loss": 0.5137, + "step": 4212 + }, + { + "epoch": 4.712527964205816, + "grad_norm": 0.6186884045600891, + "learning_rate": 5.670006896086131e-07, + "loss": 0.5218, + "step": 4213 + }, + { + "epoch": 4.713646532438479, + "grad_norm": 0.6097549200057983, + "learning_rate": 5.660548520515724e-07, + "loss": 0.5067, + "step": 4214 + }, + { + "epoch": 4.714765100671141, + "grad_norm": 0.6113163232803345, + "learning_rate": 5.65109703324643e-07, + "loss": 0.5523, + "step": 4215 + }, + { + "epoch": 4.715883668903803, + "grad_norm": 0.6053656339645386, + "learning_rate": 5.641652437644668e-07, + "loss": 0.4918, + "step": 4216 + }, + { + "epoch": 4.717002237136465, + "grad_norm": 0.6113032102584839, + "learning_rate": 5.632214737074393e-07, + "loss": 0.5279, + "step": 4217 + }, + { + "epoch": 4.718120805369128, + "grad_norm": 0.5959969758987427, + "learning_rate": 5.62278393489713e-07, + "loss": 0.5469, + "step": 4218 + }, + { + "epoch": 4.7192393736017895, + "grad_norm": 0.6240187287330627, + "learning_rate": 5.613360034471918e-07, + "loss": 0.5287, + "step": 4219 + }, + { + "epoch": 4.720357941834452, + "grad_norm": 0.6279364824295044, + "learning_rate": 5.603943039155347e-07, + "loss": 0.5364, + "step": 4220 + }, + { + "epoch": 4.721476510067114, + "grad_norm": 0.6038797497749329, + "learning_rate": 5.594532952301562e-07, + "loss": 0.5447, + "step": 4221 + }, + { + "epoch": 4.7225950782997765, + "grad_norm": 0.6107326149940491, + "learning_rate": 5.585129777262224e-07, + "loss": 0.5238, + "step": 4222 + }, + { + "epoch": 4.723713646532438, + "grad_norm": 0.6297809481620789, + "learning_rate": 5.575733517386556e-07, + "loss": 0.5356, + "step": 4223 + }, + { + "epoch": 4.724832214765101, + "grad_norm": 0.6040112376213074, + "learning_rate": 5.566344176021299e-07, + "loss": 0.504, + "step": 4224 + }, + { + "epoch": 4.725950782997763, + "grad_norm": 0.5866071581840515, + "learning_rate": 5.556961756510737e-07, + "loss": 0.5108, + "step": 4225 + }, + { + "epoch": 4.727069351230425, + "grad_norm": 0.6039175987243652, + "learning_rate": 5.547586262196683e-07, + "loss": 0.5058, + "step": 4226 + }, + { + "epoch": 4.728187919463087, + "grad_norm": 0.620013952255249, + "learning_rate": 5.538217696418499e-07, + "loss": 0.5171, + "step": 4227 + }, + { + "epoch": 4.72930648769575, + "grad_norm": 0.6077677011489868, + "learning_rate": 5.528856062513058e-07, + "loss": 0.5007, + "step": 4228 + }, + { + "epoch": 4.730425055928412, + "grad_norm": 0.6117517352104187, + "learning_rate": 5.519501363814786e-07, + "loss": 0.5291, + "step": 4229 + }, + { + "epoch": 4.731543624161074, + "grad_norm": 0.618986189365387, + "learning_rate": 5.510153603655622e-07, + "loss": 0.5284, + "step": 4230 + }, + { + "epoch": 4.732662192393736, + "grad_norm": 0.6191661357879639, + "learning_rate": 5.50081278536504e-07, + "loss": 0.5139, + "step": 4231 + }, + { + "epoch": 4.733780760626399, + "grad_norm": 0.6176586747169495, + "learning_rate": 5.49147891227004e-07, + "loss": 0.5556, + "step": 4232 + }, + { + "epoch": 4.73489932885906, + "grad_norm": 0.6484273076057434, + "learning_rate": 5.482151987695142e-07, + "loss": 0.5453, + "step": 4233 + }, + { + "epoch": 4.736017897091722, + "grad_norm": 0.6048842072486877, + "learning_rate": 5.472832014962412e-07, + "loss": 0.5089, + "step": 4234 + }, + { + "epoch": 4.737136465324385, + "grad_norm": 0.6267152428627014, + "learning_rate": 5.463518997391415e-07, + "loss": 0.5368, + "step": 4235 + }, + { + "epoch": 4.7382550335570475, + "grad_norm": 0.6154950857162476, + "learning_rate": 5.454212938299256e-07, + "loss": 0.5461, + "step": 4236 + }, + { + "epoch": 4.739373601789709, + "grad_norm": 0.6153565049171448, + "learning_rate": 5.444913841000548e-07, + "loss": 0.5503, + "step": 4237 + }, + { + "epoch": 4.740492170022371, + "grad_norm": 0.6144719123840332, + "learning_rate": 5.435621708807445e-07, + "loss": 0.5173, + "step": 4238 + }, + { + "epoch": 4.741610738255034, + "grad_norm": 0.6148480176925659, + "learning_rate": 5.426336545029598e-07, + "loss": 0.5549, + "step": 4239 + }, + { + "epoch": 4.742729306487695, + "grad_norm": 0.6281608939170837, + "learning_rate": 5.417058352974183e-07, + "loss": 0.5387, + "step": 4240 + }, + { + "epoch": 4.743847874720358, + "grad_norm": 0.5989152193069458, + "learning_rate": 5.4077871359459e-07, + "loss": 0.4972, + "step": 4241 + }, + { + "epoch": 4.74496644295302, + "grad_norm": 0.6037313938140869, + "learning_rate": 5.398522897246944e-07, + "loss": 0.4888, + "step": 4242 + }, + { + "epoch": 4.7460850111856825, + "grad_norm": 0.6202802062034607, + "learning_rate": 5.389265640177063e-07, + "loss": 0.5386, + "step": 4243 + }, + { + "epoch": 4.747203579418344, + "grad_norm": 0.6116086840629578, + "learning_rate": 5.380015368033476e-07, + "loss": 0.5377, + "step": 4244 + }, + { + "epoch": 4.748322147651007, + "grad_norm": 0.6276658177375793, + "learning_rate": 5.370772084110945e-07, + "loss": 0.5278, + "step": 4245 + }, + { + "epoch": 4.749440715883669, + "grad_norm": 0.6100133061408997, + "learning_rate": 5.361535791701724e-07, + "loss": 0.5324, + "step": 4246 + }, + { + "epoch": 4.750559284116331, + "grad_norm": 0.615986168384552, + "learning_rate": 5.352306494095592e-07, + "loss": 0.53, + "step": 4247 + }, + { + "epoch": 4.751677852348993, + "grad_norm": 0.6291327476501465, + "learning_rate": 5.343084194579823e-07, + "loss": 0.5138, + "step": 4248 + }, + { + "epoch": 4.752796420581656, + "grad_norm": 0.6179410219192505, + "learning_rate": 5.333868896439201e-07, + "loss": 0.5154, + "step": 4249 + }, + { + "epoch": 4.7539149888143175, + "grad_norm": 0.6211801767349243, + "learning_rate": 5.324660602956025e-07, + "loss": 0.5257, + "step": 4250 + }, + { + "epoch": 4.75503355704698, + "grad_norm": 0.6245579123497009, + "learning_rate": 5.315459317410085e-07, + "loss": 0.4941, + "step": 4251 + }, + { + "epoch": 4.756152125279642, + "grad_norm": 0.6134278178215027, + "learning_rate": 5.306265043078693e-07, + "loss": 0.512, + "step": 4252 + }, + { + "epoch": 4.757270693512305, + "grad_norm": 0.6103272438049316, + "learning_rate": 5.297077783236642e-07, + "loss": 0.5293, + "step": 4253 + }, + { + "epoch": 4.758389261744966, + "grad_norm": 0.6163358688354492, + "learning_rate": 5.287897541156253e-07, + "loss": 0.5435, + "step": 4254 + }, + { + "epoch": 4.759507829977629, + "grad_norm": 0.6116615533828735, + "learning_rate": 5.278724320107326e-07, + "loss": 0.5303, + "step": 4255 + }, + { + "epoch": 4.760626398210291, + "grad_norm": 0.6170740723609924, + "learning_rate": 5.269558123357165e-07, + "loss": 0.5243, + "step": 4256 + }, + { + "epoch": 4.7617449664429525, + "grad_norm": 0.610209047794342, + "learning_rate": 5.260398954170573e-07, + "loss": 0.5154, + "step": 4257 + }, + { + "epoch": 4.762863534675615, + "grad_norm": 0.6133108735084534, + "learning_rate": 5.251246815809857e-07, + "loss": 0.5233, + "step": 4258 + }, + { + "epoch": 4.763982102908278, + "grad_norm": 0.6136714220046997, + "learning_rate": 5.242101711534808e-07, + "loss": 0.5187, + "step": 4259 + }, + { + "epoch": 4.76510067114094, + "grad_norm": 0.626308262348175, + "learning_rate": 5.232963644602726e-07, + "loss": 0.5307, + "step": 4260 + }, + { + "epoch": 4.766219239373601, + "grad_norm": 0.6247338652610779, + "learning_rate": 5.223832618268391e-07, + "loss": 0.5646, + "step": 4261 + }, + { + "epoch": 4.767337807606264, + "grad_norm": 0.624542772769928, + "learning_rate": 5.214708635784071e-07, + "loss": 0.5024, + "step": 4262 + }, + { + "epoch": 4.768456375838926, + "grad_norm": 0.598546028137207, + "learning_rate": 5.205591700399551e-07, + "loss": 0.5163, + "step": 4263 + }, + { + "epoch": 4.769574944071588, + "grad_norm": 0.6249225735664368, + "learning_rate": 5.196481815362084e-07, + "loss": 0.5323, + "step": 4264 + }, + { + "epoch": 4.77069351230425, + "grad_norm": 0.6184273362159729, + "learning_rate": 5.187378983916413e-07, + "loss": 0.5107, + "step": 4265 + }, + { + "epoch": 4.771812080536913, + "grad_norm": 0.6360250115394592, + "learning_rate": 5.178283209304769e-07, + "loss": 0.5243, + "step": 4266 + }, + { + "epoch": 4.772930648769575, + "grad_norm": 0.607309877872467, + "learning_rate": 5.169194494766886e-07, + "loss": 0.4918, + "step": 4267 + }, + { + "epoch": 4.774049217002237, + "grad_norm": 0.6215641498565674, + "learning_rate": 5.160112843539955e-07, + "loss": 0.5311, + "step": 4268 + }, + { + "epoch": 4.775167785234899, + "grad_norm": 0.6287278532981873, + "learning_rate": 5.151038258858684e-07, + "loss": 0.5415, + "step": 4269 + }, + { + "epoch": 4.776286353467562, + "grad_norm": 0.6232270002365112, + "learning_rate": 5.141970743955233e-07, + "loss": 0.5162, + "step": 4270 + }, + { + "epoch": 4.7774049217002235, + "grad_norm": 0.595032811164856, + "learning_rate": 5.132910302059271e-07, + "loss": 0.5214, + "step": 4271 + }, + { + "epoch": 4.778523489932886, + "grad_norm": 0.6345818638801575, + "learning_rate": 5.123856936397925e-07, + "loss": 0.5612, + "step": 4272 + }, + { + "epoch": 4.779642058165548, + "grad_norm": 0.6268307566642761, + "learning_rate": 5.114810650195817e-07, + "loss": 0.5141, + "step": 4273 + }, + { + "epoch": 4.7807606263982105, + "grad_norm": 0.6207347512245178, + "learning_rate": 5.105771446675034e-07, + "loss": 0.515, + "step": 4274 + }, + { + "epoch": 4.781879194630872, + "grad_norm": 0.6145913600921631, + "learning_rate": 5.09673932905515e-07, + "loss": 0.5076, + "step": 4275 + }, + { + "epoch": 4.782997762863535, + "grad_norm": 0.6285806894302368, + "learning_rate": 5.087714300553221e-07, + "loss": 0.5255, + "step": 4276 + }, + { + "epoch": 4.784116331096197, + "grad_norm": 0.6313791275024414, + "learning_rate": 5.078696364383753e-07, + "loss": 0.5499, + "step": 4277 + }, + { + "epoch": 4.785234899328859, + "grad_norm": 0.6183452010154724, + "learning_rate": 5.069685523758766e-07, + "loss": 0.5301, + "step": 4278 + }, + { + "epoch": 4.786353467561521, + "grad_norm": 0.6329482793807983, + "learning_rate": 5.060681781887717e-07, + "loss": 0.5497, + "step": 4279 + }, + { + "epoch": 4.787472035794184, + "grad_norm": 0.6155105233192444, + "learning_rate": 5.051685141977547e-07, + "loss": 0.5087, + "step": 4280 + }, + { + "epoch": 4.7885906040268456, + "grad_norm": 0.625220000743866, + "learning_rate": 5.042695607232664e-07, + "loss": 0.5412, + "step": 4281 + }, + { + "epoch": 4.789709172259508, + "grad_norm": 0.6219828128814697, + "learning_rate": 5.03371318085496e-07, + "loss": 0.4959, + "step": 4282 + }, + { + "epoch": 4.79082774049217, + "grad_norm": 0.6242988705635071, + "learning_rate": 5.02473786604378e-07, + "loss": 0.4926, + "step": 4283 + }, + { + "epoch": 4.791946308724832, + "grad_norm": 0.6251333951950073, + "learning_rate": 5.015769665995929e-07, + "loss": 0.5368, + "step": 4284 + }, + { + "epoch": 4.793064876957494, + "grad_norm": 0.6266576051712036, + "learning_rate": 5.006808583905706e-07, + "loss": 0.5224, + "step": 4285 + }, + { + "epoch": 4.794183445190157, + "grad_norm": 0.6432021260261536, + "learning_rate": 4.997854622964846e-07, + "loss": 0.5514, + "step": 4286 + }, + { + "epoch": 4.795302013422819, + "grad_norm": 0.6249504089355469, + "learning_rate": 4.988907786362567e-07, + "loss": 0.519, + "step": 4287 + }, + { + "epoch": 4.796420581655481, + "grad_norm": 0.6238746047019958, + "learning_rate": 4.979968077285538e-07, + "loss": 0.5383, + "step": 4288 + }, + { + "epoch": 4.797539149888143, + "grad_norm": 0.6194822192192078, + "learning_rate": 4.971035498917895e-07, + "loss": 0.5317, + "step": 4289 + }, + { + "epoch": 4.798657718120805, + "grad_norm": 0.6111868619918823, + "learning_rate": 4.962110054441225e-07, + "loss": 0.5197, + "step": 4290 + }, + { + "epoch": 4.799776286353468, + "grad_norm": 0.6123765110969543, + "learning_rate": 4.953191747034594e-07, + "loss": 0.5356, + "step": 4291 + }, + { + "epoch": 4.800894854586129, + "grad_norm": 0.6086040139198303, + "learning_rate": 4.944280579874497e-07, + "loss": 0.512, + "step": 4292 + }, + { + "epoch": 4.802013422818792, + "grad_norm": 0.6479697823524475, + "learning_rate": 4.935376556134921e-07, + "loss": 0.5288, + "step": 4293 + }, + { + "epoch": 4.803131991051454, + "grad_norm": 0.5919728875160217, + "learning_rate": 4.926479678987281e-07, + "loss": 0.516, + "step": 4294 + }, + { + "epoch": 4.8042505592841165, + "grad_norm": 0.6351590156555176, + "learning_rate": 4.917589951600446e-07, + "loss": 0.5576, + "step": 4295 + }, + { + "epoch": 4.805369127516778, + "grad_norm": 0.620989978313446, + "learning_rate": 4.908707377140765e-07, + "loss": 0.5335, + "step": 4296 + }, + { + "epoch": 4.806487695749441, + "grad_norm": 0.6417598724365234, + "learning_rate": 4.899831958772014e-07, + "loss": 0.5394, + "step": 4297 + }, + { + "epoch": 4.807606263982103, + "grad_norm": 0.6155484318733215, + "learning_rate": 4.890963699655427e-07, + "loss": 0.5253, + "step": 4298 + }, + { + "epoch": 4.808724832214765, + "grad_norm": 0.610963761806488, + "learning_rate": 4.882102602949682e-07, + "loss": 0.5261, + "step": 4299 + }, + { + "epoch": 4.809843400447427, + "grad_norm": 0.5874950289726257, + "learning_rate": 4.873248671810929e-07, + "loss": 0.5179, + "step": 4300 + }, + { + "epoch": 4.81096196868009, + "grad_norm": 0.603138267993927, + "learning_rate": 4.864401909392736e-07, + "loss": 0.5033, + "step": 4301 + }, + { + "epoch": 4.8120805369127515, + "grad_norm": 0.6461830735206604, + "learning_rate": 4.855562318846143e-07, + "loss": 0.5556, + "step": 4302 + }, + { + "epoch": 4.813199105145414, + "grad_norm": 0.6246387362480164, + "learning_rate": 4.846729903319619e-07, + "loss": 0.4991, + "step": 4303 + }, + { + "epoch": 4.814317673378076, + "grad_norm": 0.6340742111206055, + "learning_rate": 4.837904665959081e-07, + "loss": 0.5407, + "step": 4304 + }, + { + "epoch": 4.815436241610739, + "grad_norm": 0.6025733351707458, + "learning_rate": 4.829086609907893e-07, + "loss": 0.5179, + "step": 4305 + }, + { + "epoch": 4.8165548098434, + "grad_norm": 0.6241434216499329, + "learning_rate": 4.820275738306854e-07, + "loss": 0.4934, + "step": 4306 + }, + { + "epoch": 4.817673378076063, + "grad_norm": 0.6417579054832458, + "learning_rate": 4.811472054294214e-07, + "loss": 0.5298, + "step": 4307 + }, + { + "epoch": 4.818791946308725, + "grad_norm": 0.6289979815483093, + "learning_rate": 4.802675561005654e-07, + "loss": 0.5408, + "step": 4308 + }, + { + "epoch": 4.819910514541387, + "grad_norm": 0.6294391751289368, + "learning_rate": 4.793886261574304e-07, + "loss": 0.5257, + "step": 4309 + }, + { + "epoch": 4.821029082774049, + "grad_norm": 0.6174818277359009, + "learning_rate": 4.785104159130718e-07, + "loss": 0.5215, + "step": 4310 + }, + { + "epoch": 4.822147651006711, + "grad_norm": 0.6052206158638, + "learning_rate": 4.776329256802903e-07, + "loss": 0.4855, + "step": 4311 + }, + { + "epoch": 4.823266219239374, + "grad_norm": 0.6192960739135742, + "learning_rate": 4.767561557716288e-07, + "loss": 0.523, + "step": 4312 + }, + { + "epoch": 4.824384787472036, + "grad_norm": 0.6602144241333008, + "learning_rate": 4.75880106499374e-07, + "loss": 0.5381, + "step": 4313 + }, + { + "epoch": 4.825503355704698, + "grad_norm": 0.5983721613883972, + "learning_rate": 4.7500477817555597e-07, + "loss": 0.5252, + "step": 4314 + }, + { + "epoch": 4.82662192393736, + "grad_norm": 0.6190552711486816, + "learning_rate": 4.7413017111194727e-07, + "loss": 0.5392, + "step": 4315 + }, + { + "epoch": 4.827740492170022, + "grad_norm": 0.6299715638160706, + "learning_rate": 4.732562856200662e-07, + "loss": 0.5297, + "step": 4316 + }, + { + "epoch": 4.828859060402684, + "grad_norm": 0.6263290047645569, + "learning_rate": 4.723831220111702e-07, + "loss": 0.5411, + "step": 4317 + }, + { + "epoch": 4.829977628635347, + "grad_norm": 0.6194301843643188, + "learning_rate": 4.715106805962633e-07, + "loss": 0.5074, + "step": 4318 + }, + { + "epoch": 4.831096196868009, + "grad_norm": 0.628377377986908, + "learning_rate": 4.7063896168608904e-07, + "loss": 0.5187, + "step": 4319 + }, + { + "epoch": 4.832214765100671, + "grad_norm": 0.637402355670929, + "learning_rate": 4.6976796559113633e-07, + "loss": 0.5264, + "step": 4320 + }, + { + "epoch": 4.833333333333333, + "grad_norm": 0.62098628282547, + "learning_rate": 4.6889769262163543e-07, + "loss": 0.5379, + "step": 4321 + }, + { + "epoch": 4.834451901565996, + "grad_norm": 0.6260651350021362, + "learning_rate": 4.680281430875583e-07, + "loss": 0.5266, + "step": 4322 + }, + { + "epoch": 4.8355704697986575, + "grad_norm": 0.6246919631958008, + "learning_rate": 4.671593172986197e-07, + "loss": 0.5241, + "step": 4323 + }, + { + "epoch": 4.83668903803132, + "grad_norm": 0.6138426065444946, + "learning_rate": 4.662912155642782e-07, + "loss": 0.5161, + "step": 4324 + }, + { + "epoch": 4.837807606263982, + "grad_norm": 0.6254590749740601, + "learning_rate": 4.654238381937323e-07, + "loss": 0.5266, + "step": 4325 + }, + { + "epoch": 4.8389261744966445, + "grad_norm": 0.6181991696357727, + "learning_rate": 4.64557185495923e-07, + "loss": 0.5294, + "step": 4326 + }, + { + "epoch": 4.840044742729306, + "grad_norm": 0.6205984950065613, + "learning_rate": 4.636912577795347e-07, + "loss": 0.5234, + "step": 4327 + }, + { + "epoch": 4.841163310961969, + "grad_norm": 0.6175354719161987, + "learning_rate": 4.628260553529917e-07, + "loss": 0.5206, + "step": 4328 + }, + { + "epoch": 4.842281879194631, + "grad_norm": 0.6091630458831787, + "learning_rate": 4.619615785244608e-07, + "loss": 0.5419, + "step": 4329 + }, + { + "epoch": 4.843400447427293, + "grad_norm": 0.6083254814147949, + "learning_rate": 4.610978276018496e-07, + "loss": 0.5226, + "step": 4330 + }, + { + "epoch": 4.844519015659955, + "grad_norm": 0.6020364761352539, + "learning_rate": 4.6023480289280914e-07, + "loss": 0.5288, + "step": 4331 + }, + { + "epoch": 4.845637583892618, + "grad_norm": 0.6260385513305664, + "learning_rate": 4.593725047047293e-07, + "loss": 0.5303, + "step": 4332 + }, + { + "epoch": 4.8467561521252795, + "grad_norm": 0.6089619994163513, + "learning_rate": 4.5851093334474325e-07, + "loss": 0.4993, + "step": 4333 + }, + { + "epoch": 4.847874720357942, + "grad_norm": 0.6102758646011353, + "learning_rate": 4.576500891197233e-07, + "loss": 0.5202, + "step": 4334 + }, + { + "epoch": 4.848993288590604, + "grad_norm": 0.5902683138847351, + "learning_rate": 4.5678997233628506e-07, + "loss": 0.4824, + "step": 4335 + }, + { + "epoch": 4.850111856823267, + "grad_norm": 0.6379708051681519, + "learning_rate": 4.5593058330078323e-07, + "loss": 0.5375, + "step": 4336 + }, + { + "epoch": 4.851230425055928, + "grad_norm": 0.6257898211479187, + "learning_rate": 4.5507192231931395e-07, + "loss": 0.5099, + "step": 4337 + }, + { + "epoch": 4.85234899328859, + "grad_norm": 0.6260154247283936, + "learning_rate": 4.542139896977138e-07, + "loss": 0.5406, + "step": 4338 + }, + { + "epoch": 4.853467561521253, + "grad_norm": 0.6325336694717407, + "learning_rate": 4.5335678574155994e-07, + "loss": 0.5161, + "step": 4339 + }, + { + "epoch": 4.8545861297539155, + "grad_norm": 0.6125253438949585, + "learning_rate": 4.5250031075617095e-07, + "loss": 0.4991, + "step": 4340 + }, + { + "epoch": 4.855704697986577, + "grad_norm": 0.6220778822898865, + "learning_rate": 4.5164456504660394e-07, + "loss": 0.4977, + "step": 4341 + }, + { + "epoch": 4.856823266219239, + "grad_norm": 0.606346607208252, + "learning_rate": 4.5078954891765864e-07, + "loss": 0.5171, + "step": 4342 + }, + { + "epoch": 4.857941834451902, + "grad_norm": 0.6247082352638245, + "learning_rate": 4.4993526267387234e-07, + "loss": 0.5235, + "step": 4343 + }, + { + "epoch": 4.859060402684563, + "grad_norm": 0.6178449988365173, + "learning_rate": 4.4908170661952465e-07, + "loss": 0.5277, + "step": 4344 + }, + { + "epoch": 4.860178970917226, + "grad_norm": 0.6131439805030823, + "learning_rate": 4.4822888105863353e-07, + "loss": 0.5185, + "step": 4345 + }, + { + "epoch": 4.861297539149888, + "grad_norm": 0.6086623668670654, + "learning_rate": 4.473767862949577e-07, + "loss": 0.5385, + "step": 4346 + }, + { + "epoch": 4.8624161073825505, + "grad_norm": 0.6126616597175598, + "learning_rate": 4.4652542263199455e-07, + "loss": 0.5149, + "step": 4347 + }, + { + "epoch": 4.863534675615212, + "grad_norm": 0.5909984707832336, + "learning_rate": 4.456747903729816e-07, + "loss": 0.4821, + "step": 4348 + }, + { + "epoch": 4.864653243847875, + "grad_norm": 0.6049395203590393, + "learning_rate": 4.4482488982089704e-07, + "loss": 0.516, + "step": 4349 + }, + { + "epoch": 4.865771812080537, + "grad_norm": 0.6227195858955383, + "learning_rate": 4.4397572127845597e-07, + "loss": 0.5281, + "step": 4350 + }, + { + "epoch": 4.866890380313199, + "grad_norm": 0.6395372748374939, + "learning_rate": 4.431272850481155e-07, + "loss": 0.5498, + "step": 4351 + }, + { + "epoch": 4.868008948545861, + "grad_norm": 0.6205397844314575, + "learning_rate": 4.422795814320693e-07, + "loss": 0.5321, + "step": 4352 + }, + { + "epoch": 4.869127516778524, + "grad_norm": 0.6074703931808472, + "learning_rate": 4.4143261073225336e-07, + "loss": 0.4975, + "step": 4353 + }, + { + "epoch": 4.8702460850111855, + "grad_norm": 0.6096267700195312, + "learning_rate": 4.405863732503379e-07, + "loss": 0.5209, + "step": 4354 + }, + { + "epoch": 4.871364653243848, + "grad_norm": 0.6234140992164612, + "learning_rate": 4.397408692877367e-07, + "loss": 0.5103, + "step": 4355 + }, + { + "epoch": 4.87248322147651, + "grad_norm": 0.6261554956436157, + "learning_rate": 4.388960991455998e-07, + "loss": 0.521, + "step": 4356 + }, + { + "epoch": 4.873601789709173, + "grad_norm": 0.6448354721069336, + "learning_rate": 4.380520631248153e-07, + "loss": 0.5625, + "step": 4357 + }, + { + "epoch": 4.874720357941834, + "grad_norm": 0.6191956996917725, + "learning_rate": 4.372087615260126e-07, + "loss": 0.508, + "step": 4358 + }, + { + "epoch": 4.875838926174497, + "grad_norm": 0.6106873154640198, + "learning_rate": 4.363661946495565e-07, + "loss": 0.5579, + "step": 4359 + }, + { + "epoch": 4.876957494407159, + "grad_norm": 0.6299424767494202, + "learning_rate": 4.3552436279555216e-07, + "loss": 0.5372, + "step": 4360 + }, + { + "epoch": 4.878076062639821, + "grad_norm": 0.6564977169036865, + "learning_rate": 4.34683266263842e-07, + "loss": 0.5406, + "step": 4361 + }, + { + "epoch": 4.879194630872483, + "grad_norm": 0.6174746751785278, + "learning_rate": 4.3384290535400665e-07, + "loss": 0.5307, + "step": 4362 + }, + { + "epoch": 4.880313199105146, + "grad_norm": 0.6160749197006226, + "learning_rate": 4.3300328036536413e-07, + "loss": 0.5208, + "step": 4363 + }, + { + "epoch": 4.881431767337808, + "grad_norm": 0.6038165092468262, + "learning_rate": 4.32164391596972e-07, + "loss": 0.5243, + "step": 4364 + }, + { + "epoch": 4.882550335570469, + "grad_norm": 0.6318257451057434, + "learning_rate": 4.3132623934762365e-07, + "loss": 0.5156, + "step": 4365 + }, + { + "epoch": 4.883668903803132, + "grad_norm": 0.6299393773078918, + "learning_rate": 4.304888239158522e-07, + "loss": 0.5602, + "step": 4366 + }, + { + "epoch": 4.884787472035795, + "grad_norm": 0.6139900088310242, + "learning_rate": 4.296521455999267e-07, + "loss": 0.5247, + "step": 4367 + }, + { + "epoch": 4.885906040268456, + "grad_norm": 0.6244694590568542, + "learning_rate": 4.2881620469785334e-07, + "loss": 0.5367, + "step": 4368 + }, + { + "epoch": 4.887024608501118, + "grad_norm": 0.6173675656318665, + "learning_rate": 4.2798100150737764e-07, + "loss": 0.5441, + "step": 4369 + }, + { + "epoch": 4.888143176733781, + "grad_norm": 0.6056670546531677, + "learning_rate": 4.271465363259811e-07, + "loss": 0.5005, + "step": 4370 + }, + { + "epoch": 4.889261744966443, + "grad_norm": 0.6176080703735352, + "learning_rate": 4.2631280945088197e-07, + "loss": 0.5351, + "step": 4371 + }, + { + "epoch": 4.890380313199105, + "grad_norm": 0.6030075550079346, + "learning_rate": 4.254798211790356e-07, + "loss": 0.5549, + "step": 4372 + }, + { + "epoch": 4.891498881431767, + "grad_norm": 0.6153278350830078, + "learning_rate": 4.2464757180713606e-07, + "loss": 0.5385, + "step": 4373 + }, + { + "epoch": 4.89261744966443, + "grad_norm": 0.6094895005226135, + "learning_rate": 4.2381606163161113e-07, + "loss": 0.5245, + "step": 4374 + }, + { + "epoch": 4.8937360178970915, + "grad_norm": 0.6144399046897888, + "learning_rate": 4.2298529094862874e-07, + "loss": 0.5273, + "step": 4375 + }, + { + "epoch": 4.894854586129754, + "grad_norm": 0.6139836311340332, + "learning_rate": 4.221552600540904e-07, + "loss": 0.511, + "step": 4376 + }, + { + "epoch": 4.895973154362416, + "grad_norm": 0.621971070766449, + "learning_rate": 4.2132596924363666e-07, + "loss": 0.5172, + "step": 4377 + }, + { + "epoch": 4.8970917225950785, + "grad_norm": 0.6410837769508362, + "learning_rate": 4.2049741881264345e-07, + "loss": 0.5299, + "step": 4378 + }, + { + "epoch": 4.89821029082774, + "grad_norm": 0.6196994185447693, + "learning_rate": 4.196696090562205e-07, + "loss": 0.5302, + "step": 4379 + }, + { + "epoch": 4.899328859060403, + "grad_norm": 0.6105346083641052, + "learning_rate": 4.188425402692181e-07, + "loss": 0.5204, + "step": 4380 + }, + { + "epoch": 4.900447427293065, + "grad_norm": 0.6109244227409363, + "learning_rate": 4.180162127462195e-07, + "loss": 0.5289, + "step": 4381 + }, + { + "epoch": 4.901565995525727, + "grad_norm": 0.6313038468360901, + "learning_rate": 4.171906267815462e-07, + "loss": 0.5183, + "step": 4382 + }, + { + "epoch": 4.902684563758389, + "grad_norm": 0.625917375087738, + "learning_rate": 4.163657826692527e-07, + "loss": 0.4856, + "step": 4383 + }, + { + "epoch": 4.903803131991052, + "grad_norm": 0.6236258149147034, + "learning_rate": 4.155416807031326e-07, + "loss": 0.5505, + "step": 4384 + }, + { + "epoch": 4.9049217002237135, + "grad_norm": 0.6240785717964172, + "learning_rate": 4.1471832117671277e-07, + "loss": 0.5346, + "step": 4385 + }, + { + "epoch": 4.906040268456376, + "grad_norm": 0.6263416409492493, + "learning_rate": 4.1389570438325615e-07, + "loss": 0.5445, + "step": 4386 + }, + { + "epoch": 4.907158836689038, + "grad_norm": 0.6088545322418213, + "learning_rate": 4.1307383061576114e-07, + "loss": 0.5048, + "step": 4387 + }, + { + "epoch": 4.9082774049217, + "grad_norm": 0.6047202348709106, + "learning_rate": 4.1225270016696237e-07, + "loss": 0.4976, + "step": 4388 + }, + { + "epoch": 4.909395973154362, + "grad_norm": 0.6274957656860352, + "learning_rate": 4.114323133293288e-07, + "loss": 0.5434, + "step": 4389 + }, + { + "epoch": 4.910514541387025, + "grad_norm": 0.616178572177887, + "learning_rate": 4.1061267039506415e-07, + "loss": 0.5316, + "step": 4390 + }, + { + "epoch": 4.911633109619687, + "grad_norm": 0.6123727560043335, + "learning_rate": 4.0979377165610874e-07, + "loss": 0.5311, + "step": 4391 + }, + { + "epoch": 4.912751677852349, + "grad_norm": 0.6375654339790344, + "learning_rate": 4.089756174041359e-07, + "loss": 0.5077, + "step": 4392 + }, + { + "epoch": 4.913870246085011, + "grad_norm": 0.6250323057174683, + "learning_rate": 4.08158207930556e-07, + "loss": 0.5346, + "step": 4393 + }, + { + "epoch": 4.914988814317674, + "grad_norm": 0.6357123851776123, + "learning_rate": 4.073415435265121e-07, + "loss": 0.5403, + "step": 4394 + }, + { + "epoch": 4.916107382550336, + "grad_norm": 0.6266555190086365, + "learning_rate": 4.0652562448288304e-07, + "loss": 0.5204, + "step": 4395 + }, + { + "epoch": 4.917225950782997, + "grad_norm": 0.6327143907546997, + "learning_rate": 4.057104510902807e-07, + "loss": 0.5218, + "step": 4396 + }, + { + "epoch": 4.91834451901566, + "grad_norm": 0.6290907859802246, + "learning_rate": 4.0489602363905423e-07, + "loss": 0.5216, + "step": 4397 + }, + { + "epoch": 4.919463087248322, + "grad_norm": 0.6028468012809753, + "learning_rate": 4.0408234241928356e-07, + "loss": 0.5231, + "step": 4398 + }, + { + "epoch": 4.9205816554809845, + "grad_norm": 0.6239443421363831, + "learning_rate": 4.0326940772078634e-07, + "loss": 0.5438, + "step": 4399 + }, + { + "epoch": 4.921700223713646, + "grad_norm": 0.6322512626647949, + "learning_rate": 4.02457219833112e-07, + "loss": 0.5356, + "step": 4400 + }, + { + "epoch": 4.922818791946309, + "grad_norm": 0.6176933646202087, + "learning_rate": 4.0164577904554357e-07, + "loss": 0.51, + "step": 4401 + }, + { + "epoch": 4.923937360178971, + "grad_norm": 0.6284427046775818, + "learning_rate": 4.0083508564710135e-07, + "loss": 0.5204, + "step": 4402 + }, + { + "epoch": 4.925055928411633, + "grad_norm": 0.6082155704498291, + "learning_rate": 4.0002513992653425e-07, + "loss": 0.5381, + "step": 4403 + }, + { + "epoch": 4.926174496644295, + "grad_norm": 0.646155834197998, + "learning_rate": 3.9921594217232964e-07, + "loss": 0.5605, + "step": 4404 + }, + { + "epoch": 4.927293064876958, + "grad_norm": 0.6046296954154968, + "learning_rate": 3.9840749267270524e-07, + "loss": 0.5232, + "step": 4405 + }, + { + "epoch": 4.9284116331096195, + "grad_norm": 0.6020948886871338, + "learning_rate": 3.9759979171561493e-07, + "loss": 0.5272, + "step": 4406 + }, + { + "epoch": 4.929530201342282, + "grad_norm": 0.6343019604682922, + "learning_rate": 3.9679283958874324e-07, + "loss": 0.5261, + "step": 4407 + }, + { + "epoch": 4.930648769574944, + "grad_norm": 0.6095458269119263, + "learning_rate": 3.9598663657951063e-07, + "loss": 0.5107, + "step": 4408 + }, + { + "epoch": 4.931767337807607, + "grad_norm": 0.6146063208580017, + "learning_rate": 3.9518118297506904e-07, + "loss": 0.5173, + "step": 4409 + }, + { + "epoch": 4.932885906040268, + "grad_norm": 0.6243351101875305, + "learning_rate": 3.943764790623036e-07, + "loss": 0.5354, + "step": 4410 + }, + { + "epoch": 4.934004474272931, + "grad_norm": 0.6285550594329834, + "learning_rate": 3.93572525127833e-07, + "loss": 0.4993, + "step": 4411 + }, + { + "epoch": 4.935123042505593, + "grad_norm": 0.6457116007804871, + "learning_rate": 3.927693214580075e-07, + "loss": 0.5397, + "step": 4412 + }, + { + "epoch": 4.936241610738255, + "grad_norm": 0.5887306332588196, + "learning_rate": 3.919668683389127e-07, + "loss": 0.512, + "step": 4413 + }, + { + "epoch": 4.937360178970917, + "grad_norm": 0.6155664920806885, + "learning_rate": 3.9116516605636423e-07, + "loss": 0.5095, + "step": 4414 + }, + { + "epoch": 4.938478747203579, + "grad_norm": 0.6260802745819092, + "learning_rate": 3.9036421489591246e-07, + "loss": 0.538, + "step": 4415 + }, + { + "epoch": 4.939597315436242, + "grad_norm": 0.6137040257453918, + "learning_rate": 3.895640151428376e-07, + "loss": 0.5073, + "step": 4416 + }, + { + "epoch": 4.940715883668904, + "grad_norm": 0.608590304851532, + "learning_rate": 3.8876456708215546e-07, + "loss": 0.5356, + "step": 4417 + }, + { + "epoch": 4.941834451901566, + "grad_norm": 0.6056689620018005, + "learning_rate": 3.879658709986117e-07, + "loss": 0.5196, + "step": 4418 + }, + { + "epoch": 4.942953020134228, + "grad_norm": 0.605205774307251, + "learning_rate": 3.871679271766848e-07, + "loss": 0.4984, + "step": 4419 + }, + { + "epoch": 4.94407158836689, + "grad_norm": 0.6198406219482422, + "learning_rate": 3.863707359005853e-07, + "loss": 0.5558, + "step": 4420 + }, + { + "epoch": 4.945190156599552, + "grad_norm": 0.6238109469413757, + "learning_rate": 3.8557429745425535e-07, + "loss": 0.5264, + "step": 4421 + }, + { + "epoch": 4.946308724832215, + "grad_norm": 0.6276695728302002, + "learning_rate": 3.847786121213704e-07, + "loss": 0.523, + "step": 4422 + }, + { + "epoch": 4.947427293064877, + "grad_norm": 0.6219817996025085, + "learning_rate": 3.8398368018533556e-07, + "loss": 0.5135, + "step": 4423 + }, + { + "epoch": 4.948545861297539, + "grad_norm": 0.6174488663673401, + "learning_rate": 3.831895019292897e-07, + "loss": 0.5231, + "step": 4424 + }, + { + "epoch": 4.949664429530201, + "grad_norm": 0.6268370747566223, + "learning_rate": 3.823960776361013e-07, + "loss": 0.5499, + "step": 4425 + }, + { + "epoch": 4.950782997762864, + "grad_norm": 0.610793948173523, + "learning_rate": 3.816034075883723e-07, + "loss": 0.5474, + "step": 4426 + }, + { + "epoch": 4.9519015659955254, + "grad_norm": 0.617171585559845, + "learning_rate": 3.8081149206843417e-07, + "loss": 0.5174, + "step": 4427 + }, + { + "epoch": 4.953020134228188, + "grad_norm": 0.6154060959815979, + "learning_rate": 3.8002033135835033e-07, + "loss": 0.5155, + "step": 4428 + }, + { + "epoch": 4.95413870246085, + "grad_norm": 0.6336487531661987, + "learning_rate": 3.7922992573991513e-07, + "loss": 0.5108, + "step": 4429 + }, + { + "epoch": 4.9552572706935125, + "grad_norm": 0.6179271936416626, + "learning_rate": 3.784402754946553e-07, + "loss": 0.5237, + "step": 4430 + }, + { + "epoch": 4.956375838926174, + "grad_norm": 0.6497101783752441, + "learning_rate": 3.7765138090382653e-07, + "loss": 0.5266, + "step": 4431 + }, + { + "epoch": 4.957494407158837, + "grad_norm": 0.6304492950439453, + "learning_rate": 3.7686324224841614e-07, + "loss": 0.5418, + "step": 4432 + }, + { + "epoch": 4.958612975391499, + "grad_norm": 0.6313093304634094, + "learning_rate": 3.7607585980914323e-07, + "loss": 0.5254, + "step": 4433 + }, + { + "epoch": 4.959731543624161, + "grad_norm": 0.6071204543113708, + "learning_rate": 3.7528923386645625e-07, + "loss": 0.5079, + "step": 4434 + }, + { + "epoch": 4.960850111856823, + "grad_norm": 0.6156542897224426, + "learning_rate": 3.7450336470053447e-07, + "loss": 0.5177, + "step": 4435 + }, + { + "epoch": 4.961968680089486, + "grad_norm": 0.613685131072998, + "learning_rate": 3.7371825259128744e-07, + "loss": 0.5195, + "step": 4436 + }, + { + "epoch": 4.9630872483221475, + "grad_norm": 0.6097269058227539, + "learning_rate": 3.729338978183561e-07, + "loss": 0.5102, + "step": 4437 + }, + { + "epoch": 4.96420581655481, + "grad_norm": 0.6036381125450134, + "learning_rate": 3.721503006611102e-07, + "loss": 0.5318, + "step": 4438 + }, + { + "epoch": 4.965324384787472, + "grad_norm": 0.6201692223548889, + "learning_rate": 3.713674613986515e-07, + "loss": 0.5174, + "step": 4439 + }, + { + "epoch": 4.966442953020135, + "grad_norm": 0.6241570711135864, + "learning_rate": 3.7058538030980946e-07, + "loss": 0.5548, + "step": 4440 + }, + { + "epoch": 4.967561521252796, + "grad_norm": 0.6248655915260315, + "learning_rate": 3.698040576731457e-07, + "loss": 0.547, + "step": 4441 + }, + { + "epoch": 4.968680089485458, + "grad_norm": 0.6197899580001831, + "learning_rate": 3.6902349376695013e-07, + "loss": 0.5123, + "step": 4442 + }, + { + "epoch": 4.969798657718121, + "grad_norm": 0.6128517389297485, + "learning_rate": 3.682436888692434e-07, + "loss": 0.509, + "step": 4443 + }, + { + "epoch": 4.9709172259507834, + "grad_norm": 0.6188886165618896, + "learning_rate": 3.67464643257775e-07, + "loss": 0.5256, + "step": 4444 + }, + { + "epoch": 4.972035794183445, + "grad_norm": 0.6145041584968567, + "learning_rate": 3.6668635721002407e-07, + "loss": 0.5023, + "step": 4445 + }, + { + "epoch": 4.973154362416107, + "grad_norm": 0.6280524134635925, + "learning_rate": 3.6590883100320053e-07, + "loss": 0.5321, + "step": 4446 + }, + { + "epoch": 4.97427293064877, + "grad_norm": 0.6354238986968994, + "learning_rate": 3.6513206491424174e-07, + "loss": 0.5594, + "step": 4447 + }, + { + "epoch": 4.975391498881431, + "grad_norm": 0.6305732727050781, + "learning_rate": 3.6435605921981644e-07, + "loss": 0.5534, + "step": 4448 + }, + { + "epoch": 4.976510067114094, + "grad_norm": 0.632779061794281, + "learning_rate": 3.635808141963196e-07, + "loss": 0.4884, + "step": 4449 + }, + { + "epoch": 4.977628635346756, + "grad_norm": 0.6297087669372559, + "learning_rate": 3.628063301198789e-07, + "loss": 0.526, + "step": 4450 + }, + { + "epoch": 4.9787472035794185, + "grad_norm": 0.6244329810142517, + "learning_rate": 3.62032607266348e-07, + "loss": 0.5325, + "step": 4451 + }, + { + "epoch": 4.97986577181208, + "grad_norm": 0.6276065707206726, + "learning_rate": 3.6125964591131076e-07, + "loss": 0.5194, + "step": 4452 + }, + { + "epoch": 4.980984340044743, + "grad_norm": 0.632727324962616, + "learning_rate": 3.6048744633007976e-07, + "loss": 0.5187, + "step": 4453 + }, + { + "epoch": 4.982102908277405, + "grad_norm": 0.6135322451591492, + "learning_rate": 3.5971600879769504e-07, + "loss": 0.4965, + "step": 4454 + }, + { + "epoch": 4.983221476510067, + "grad_norm": 0.6446414589881897, + "learning_rate": 3.589453335889276e-07, + "loss": 0.5256, + "step": 4455 + }, + { + "epoch": 4.984340044742729, + "grad_norm": 0.6313472390174866, + "learning_rate": 3.5817542097827425e-07, + "loss": 0.5359, + "step": 4456 + }, + { + "epoch": 4.985458612975392, + "grad_norm": 0.6245015263557434, + "learning_rate": 3.5740627123996284e-07, + "loss": 0.5204, + "step": 4457 + }, + { + "epoch": 4.9865771812080535, + "grad_norm": 0.6169870495796204, + "learning_rate": 3.5663788464794764e-07, + "loss": 0.5282, + "step": 4458 + }, + { + "epoch": 4.987695749440716, + "grad_norm": 0.6137927770614624, + "learning_rate": 3.558702614759113e-07, + "loss": 0.5229, + "step": 4459 + }, + { + "epoch": 4.988814317673378, + "grad_norm": 0.6367197036743164, + "learning_rate": 3.551034019972646e-07, + "loss": 0.5285, + "step": 4460 + }, + { + "epoch": 4.989932885906041, + "grad_norm": 0.5930278897285461, + "learning_rate": 3.543373064851474e-07, + "loss": 0.5058, + "step": 4461 + }, + { + "epoch": 4.991051454138702, + "grad_norm": 0.6158058643341064, + "learning_rate": 3.535719752124264e-07, + "loss": 0.5251, + "step": 4462 + }, + { + "epoch": 4.992170022371365, + "grad_norm": 0.6222450137138367, + "learning_rate": 3.5280740845169526e-07, + "loss": 0.5409, + "step": 4463 + }, + { + "epoch": 4.993288590604027, + "grad_norm": 0.6268084645271301, + "learning_rate": 3.5204360647527805e-07, + "loss": 0.52, + "step": 4464 + }, + { + "epoch": 4.994407158836689, + "grad_norm": 0.6264050602912903, + "learning_rate": 3.512805695552235e-07, + "loss": 0.5268, + "step": 4465 + }, + { + "epoch": 4.995525727069351, + "grad_norm": 0.634637713432312, + "learning_rate": 3.505182979633098e-07, + "loss": 0.5592, + "step": 4466 + }, + { + "epoch": 4.996644295302014, + "grad_norm": 0.6207966804504395, + "learning_rate": 3.4975679197104205e-07, + "loss": 0.5244, + "step": 4467 + }, + { + "epoch": 4.997762863534676, + "grad_norm": 0.6360892057418823, + "learning_rate": 3.489960518496521e-07, + "loss": 0.5283, + "step": 4468 + }, + { + "epoch": 4.998881431767337, + "grad_norm": 0.6112513542175293, + "learning_rate": 3.4823607787009856e-07, + "loss": 0.515, + "step": 4469 + }, + { + "epoch": 5.0, + "grad_norm": 0.6247971057891846, + "learning_rate": 3.474768703030695e-07, + "loss": 0.5153, + "step": 4470 + }, + { + "epoch": 5.001118568232662, + "grad_norm": 0.6122350096702576, + "learning_rate": 3.4671842941897764e-07, + "loss": 0.5126, + "step": 4471 + }, + { + "epoch": 5.002237136465324, + "grad_norm": 0.6095572710037231, + "learning_rate": 3.4596075548796376e-07, + "loss": 0.4951, + "step": 4472 + }, + { + "epoch": 5.003355704697986, + "grad_norm": 0.6131179928779602, + "learning_rate": 3.4520384877989535e-07, + "loss": 0.5218, + "step": 4473 + }, + { + "epoch": 5.004474272930649, + "grad_norm": 0.6105647087097168, + "learning_rate": 3.4444770956436555e-07, + "loss": 0.4826, + "step": 4474 + }, + { + "epoch": 5.005592841163311, + "grad_norm": 0.6166825890541077, + "learning_rate": 3.4369233811069655e-07, + "loss": 0.5341, + "step": 4475 + }, + { + "epoch": 5.006711409395973, + "grad_norm": 0.6195070147514343, + "learning_rate": 3.429377346879348e-07, + "loss": 0.5383, + "step": 4476 + }, + { + "epoch": 5.007829977628635, + "grad_norm": 0.6139827966690063, + "learning_rate": 3.421838995648535e-07, + "loss": 0.507, + "step": 4477 + }, + { + "epoch": 5.008948545861298, + "grad_norm": 0.622748851776123, + "learning_rate": 3.4143083300995307e-07, + "loss": 0.5191, + "step": 4478 + }, + { + "epoch": 5.010067114093959, + "grad_norm": 0.623548686504364, + "learning_rate": 3.4067853529146016e-07, + "loss": 0.5453, + "step": 4479 + }, + { + "epoch": 5.011185682326622, + "grad_norm": 0.6198171377182007, + "learning_rate": 3.3992700667732666e-07, + "loss": 0.516, + "step": 4480 + }, + { + "epoch": 5.012304250559284, + "grad_norm": 0.6058144569396973, + "learning_rate": 3.391762474352317e-07, + "loss": 0.5174, + "step": 4481 + }, + { + "epoch": 5.0134228187919465, + "grad_norm": 0.6145999431610107, + "learning_rate": 3.3842625783257946e-07, + "loss": 0.5142, + "step": 4482 + }, + { + "epoch": 5.014541387024608, + "grad_norm": 0.6206093430519104, + "learning_rate": 3.3767703813649993e-07, + "loss": 0.5219, + "step": 4483 + }, + { + "epoch": 5.015659955257271, + "grad_norm": 0.6295281648635864, + "learning_rate": 3.3692858861384986e-07, + "loss": 0.529, + "step": 4484 + }, + { + "epoch": 5.016778523489933, + "grad_norm": 0.6065890789031982, + "learning_rate": 3.3618090953120994e-07, + "loss": 0.508, + "step": 4485 + }, + { + "epoch": 5.017897091722595, + "grad_norm": 0.5998103022575378, + "learning_rate": 3.35434001154889e-07, + "loss": 0.5045, + "step": 4486 + }, + { + "epoch": 5.019015659955257, + "grad_norm": 0.6572564244270325, + "learning_rate": 3.346878637509185e-07, + "loss": 0.5472, + "step": 4487 + }, + { + "epoch": 5.02013422818792, + "grad_norm": 0.6046032905578613, + "learning_rate": 3.3394249758505803e-07, + "loss": 0.5131, + "step": 4488 + }, + { + "epoch": 5.0212527964205815, + "grad_norm": 0.6111887693405151, + "learning_rate": 3.3319790292279e-07, + "loss": 0.5211, + "step": 4489 + }, + { + "epoch": 5.022371364653244, + "grad_norm": 0.6105478405952454, + "learning_rate": 3.324540800293241e-07, + "loss": 0.5355, + "step": 4490 + }, + { + "epoch": 5.023489932885906, + "grad_norm": 0.6316006183624268, + "learning_rate": 3.317110291695938e-07, + "loss": 0.5287, + "step": 4491 + }, + { + "epoch": 5.024608501118569, + "grad_norm": 0.6499285101890564, + "learning_rate": 3.3096875060825846e-07, + "loss": 0.5658, + "step": 4492 + }, + { + "epoch": 5.02572706935123, + "grad_norm": 0.6193447709083557, + "learning_rate": 3.302272446097005e-07, + "loss": 0.5026, + "step": 4493 + }, + { + "epoch": 5.026845637583893, + "grad_norm": 0.6185278296470642, + "learning_rate": 3.294865114380305e-07, + "loss": 0.5174, + "step": 4494 + }, + { + "epoch": 5.027964205816555, + "grad_norm": 0.6185445785522461, + "learning_rate": 3.287465513570806e-07, + "loss": 0.4988, + "step": 4495 + }, + { + "epoch": 5.029082774049217, + "grad_norm": 0.6410359144210815, + "learning_rate": 3.2800736463040883e-07, + "loss": 0.5402, + "step": 4496 + }, + { + "epoch": 5.030201342281879, + "grad_norm": 0.6149916648864746, + "learning_rate": 3.2726895152129843e-07, + "loss": 0.5373, + "step": 4497 + }, + { + "epoch": 5.031319910514541, + "grad_norm": 0.6139128804206848, + "learning_rate": 3.2653131229275576e-07, + "loss": 0.5241, + "step": 4498 + }, + { + "epoch": 5.032438478747204, + "grad_norm": 0.6243690848350525, + "learning_rate": 3.2579444720751306e-07, + "loss": 0.5126, + "step": 4499 + }, + { + "epoch": 5.033557046979865, + "grad_norm": 0.6277595162391663, + "learning_rate": 3.2505835652802577e-07, + "loss": 0.5418, + "step": 4500 + }, + { + "epoch": 5.034675615212528, + "grad_norm": 0.647991955280304, + "learning_rate": 3.2432304051647326e-07, + "loss": 0.5396, + "step": 4501 + }, + { + "epoch": 5.03579418344519, + "grad_norm": 0.609725296497345, + "learning_rate": 3.2358849943475905e-07, + "loss": 0.5148, + "step": 4502 + }, + { + "epoch": 5.0369127516778525, + "grad_norm": 0.6265501976013184, + "learning_rate": 3.2285473354451203e-07, + "loss": 0.5179, + "step": 4503 + }, + { + "epoch": 5.038031319910514, + "grad_norm": 0.6217706203460693, + "learning_rate": 3.2212174310708333e-07, + "loss": 0.5302, + "step": 4504 + }, + { + "epoch": 5.039149888143177, + "grad_norm": 0.6065677404403687, + "learning_rate": 3.213895283835494e-07, + "loss": 0.5107, + "step": 4505 + }, + { + "epoch": 5.040268456375839, + "grad_norm": 0.6399171352386475, + "learning_rate": 3.206580896347089e-07, + "loss": 0.5272, + "step": 4506 + }, + { + "epoch": 5.041387024608501, + "grad_norm": 0.625352680683136, + "learning_rate": 3.1992742712108475e-07, + "loss": 0.5396, + "step": 4507 + }, + { + "epoch": 5.042505592841163, + "grad_norm": 0.6287113428115845, + "learning_rate": 3.191975411029236e-07, + "loss": 0.5173, + "step": 4508 + }, + { + "epoch": 5.043624161073826, + "grad_norm": 0.6213611364364624, + "learning_rate": 3.1846843184019426e-07, + "loss": 0.5111, + "step": 4509 + }, + { + "epoch": 5.0447427293064875, + "grad_norm": 0.6409175395965576, + "learning_rate": 3.1774009959259195e-07, + "loss": 0.5099, + "step": 4510 + }, + { + "epoch": 5.04586129753915, + "grad_norm": 0.6313573122024536, + "learning_rate": 3.1701254461953104e-07, + "loss": 0.5182, + "step": 4511 + }, + { + "epoch": 5.046979865771812, + "grad_norm": 0.6188579797744751, + "learning_rate": 3.162857671801528e-07, + "loss": 0.5231, + "step": 4512 + }, + { + "epoch": 5.0480984340044746, + "grad_norm": 0.6257225275039673, + "learning_rate": 3.1555976753331884e-07, + "loss": 0.5243, + "step": 4513 + }, + { + "epoch": 5.049217002237136, + "grad_norm": 0.6147660613059998, + "learning_rate": 3.1483454593761565e-07, + "loss": 0.5375, + "step": 4514 + }, + { + "epoch": 5.050335570469799, + "grad_norm": 0.6361204385757446, + "learning_rate": 3.14110102651351e-07, + "loss": 0.5458, + "step": 4515 + }, + { + "epoch": 5.051454138702461, + "grad_norm": 0.618554949760437, + "learning_rate": 3.1338643793255656e-07, + "loss": 0.5014, + "step": 4516 + }, + { + "epoch": 5.052572706935123, + "grad_norm": 0.6243267059326172, + "learning_rate": 3.126635520389862e-07, + "loss": 0.5245, + "step": 4517 + }, + { + "epoch": 5.053691275167785, + "grad_norm": 0.6090473532676697, + "learning_rate": 3.119414452281158e-07, + "loss": 0.5172, + "step": 4518 + }, + { + "epoch": 5.054809843400448, + "grad_norm": 0.6069055795669556, + "learning_rate": 3.112201177571458e-07, + "loss": 0.4859, + "step": 4519 + }, + { + "epoch": 5.05592841163311, + "grad_norm": 0.6218957304954529, + "learning_rate": 3.104995698829963e-07, + "loss": 0.498, + "step": 4520 + }, + { + "epoch": 5.057046979865772, + "grad_norm": 0.6108510494232178, + "learning_rate": 3.0977980186231245e-07, + "loss": 0.5134, + "step": 4521 + }, + { + "epoch": 5.058165548098434, + "grad_norm": 0.6003857851028442, + "learning_rate": 3.090608139514589e-07, + "loss": 0.4845, + "step": 4522 + }, + { + "epoch": 5.059284116331096, + "grad_norm": 0.6157301664352417, + "learning_rate": 3.0834260640652536e-07, + "loss": 0.5207, + "step": 4523 + }, + { + "epoch": 5.060402684563758, + "grad_norm": 0.630525529384613, + "learning_rate": 3.076251794833213e-07, + "loss": 0.5317, + "step": 4524 + }, + { + "epoch": 5.06152125279642, + "grad_norm": 0.6104357838630676, + "learning_rate": 3.069085334373789e-07, + "loss": 0.5041, + "step": 4525 + }, + { + "epoch": 5.062639821029083, + "grad_norm": 0.6348655223846436, + "learning_rate": 3.061926685239522e-07, + "loss": 0.5515, + "step": 4526 + }, + { + "epoch": 5.063758389261745, + "grad_norm": 0.6133022904396057, + "learning_rate": 3.054775849980168e-07, + "loss": 0.5391, + "step": 4527 + }, + { + "epoch": 5.064876957494407, + "grad_norm": 0.6313871145248413, + "learning_rate": 3.047632831142711e-07, + "loss": 0.5336, + "step": 4528 + }, + { + "epoch": 5.065995525727069, + "grad_norm": 0.6301317811012268, + "learning_rate": 3.0404976312713304e-07, + "loss": 0.5187, + "step": 4529 + }, + { + "epoch": 5.067114093959732, + "grad_norm": 0.6294951438903809, + "learning_rate": 3.0333702529074445e-07, + "loss": 0.5269, + "step": 4530 + }, + { + "epoch": 5.068232662192393, + "grad_norm": 0.6168622374534607, + "learning_rate": 3.0262506985896665e-07, + "loss": 0.5048, + "step": 4531 + }, + { + "epoch": 5.069351230425056, + "grad_norm": 0.6129342317581177, + "learning_rate": 3.0191389708538355e-07, + "loss": 0.5153, + "step": 4532 + }, + { + "epoch": 5.070469798657718, + "grad_norm": 0.6209331750869751, + "learning_rate": 3.0120350722329857e-07, + "loss": 0.5273, + "step": 4533 + }, + { + "epoch": 5.0715883668903805, + "grad_norm": 0.60940021276474, + "learning_rate": 3.0049390052573903e-07, + "loss": 0.5012, + "step": 4534 + }, + { + "epoch": 5.072706935123042, + "grad_norm": 0.6250702142715454, + "learning_rate": 2.997850772454505e-07, + "loss": 0.5396, + "step": 4535 + }, + { + "epoch": 5.073825503355705, + "grad_norm": 0.6123648881912231, + "learning_rate": 2.990770376349017e-07, + "loss": 0.5022, + "step": 4536 + }, + { + "epoch": 5.074944071588367, + "grad_norm": 0.6388433575630188, + "learning_rate": 2.9836978194628106e-07, + "loss": 0.5352, + "step": 4537 + }, + { + "epoch": 5.076062639821029, + "grad_norm": 0.6280725002288818, + "learning_rate": 2.9766331043149713e-07, + "loss": 0.5306, + "step": 4538 + }, + { + "epoch": 5.077181208053691, + "grad_norm": 0.6390487551689148, + "learning_rate": 2.9695762334218113e-07, + "loss": 0.5252, + "step": 4539 + }, + { + "epoch": 5.078299776286354, + "grad_norm": 0.6450525522232056, + "learning_rate": 2.962527209296834e-07, + "loss": 0.525, + "step": 4540 + }, + { + "epoch": 5.0794183445190155, + "grad_norm": 0.627917468547821, + "learning_rate": 2.9554860344507525e-07, + "loss": 0.5275, + "step": 4541 + }, + { + "epoch": 5.080536912751678, + "grad_norm": 0.622687578201294, + "learning_rate": 2.948452711391475e-07, + "loss": 0.508, + "step": 4542 + }, + { + "epoch": 5.08165548098434, + "grad_norm": 0.6202448010444641, + "learning_rate": 2.941427242624137e-07, + "loss": 0.5122, + "step": 4543 + }, + { + "epoch": 5.082774049217003, + "grad_norm": 0.6130514144897461, + "learning_rate": 2.9344096306510445e-07, + "loss": 0.5334, + "step": 4544 + }, + { + "epoch": 5.083892617449664, + "grad_norm": 0.6143741607666016, + "learning_rate": 2.927399877971737e-07, + "loss": 0.5115, + "step": 4545 + }, + { + "epoch": 5.085011185682327, + "grad_norm": 0.6075924634933472, + "learning_rate": 2.920397987082926e-07, + "loss": 0.5275, + "step": 4546 + }, + { + "epoch": 5.086129753914989, + "grad_norm": 0.6433794498443604, + "learning_rate": 2.9134039604785474e-07, + "loss": 0.5298, + "step": 4547 + }, + { + "epoch": 5.087248322147651, + "grad_norm": 0.6396564245223999, + "learning_rate": 2.906417800649719e-07, + "loss": 0.5295, + "step": 4548 + }, + { + "epoch": 5.088366890380313, + "grad_norm": 0.6410099864006042, + "learning_rate": 2.8994395100847615e-07, + "loss": 0.5513, + "step": 4549 + }, + { + "epoch": 5.089485458612975, + "grad_norm": 0.6172551512718201, + "learning_rate": 2.892469091269198e-07, + "loss": 0.5263, + "step": 4550 + }, + { + "epoch": 5.090604026845638, + "grad_norm": 0.6078923344612122, + "learning_rate": 2.885506546685732e-07, + "loss": 0.5107, + "step": 4551 + }, + { + "epoch": 5.091722595078299, + "grad_norm": 0.6244307160377502, + "learning_rate": 2.878551878814287e-07, + "loss": 0.5065, + "step": 4552 + }, + { + "epoch": 5.092841163310962, + "grad_norm": 0.6273861527442932, + "learning_rate": 2.8716050901319596e-07, + "loss": 0.5287, + "step": 4553 + }, + { + "epoch": 5.093959731543624, + "grad_norm": 0.6452868580818176, + "learning_rate": 2.864666183113055e-07, + "loss": 0.5435, + "step": 4554 + }, + { + "epoch": 5.0950782997762865, + "grad_norm": 0.6231293678283691, + "learning_rate": 2.8577351602290556e-07, + "loss": 0.5181, + "step": 4555 + }, + { + "epoch": 5.096196868008948, + "grad_norm": 0.623128354549408, + "learning_rate": 2.850812023948657e-07, + "loss": 0.5259, + "step": 4556 + }, + { + "epoch": 5.097315436241611, + "grad_norm": 0.6198793649673462, + "learning_rate": 2.8438967767377147e-07, + "loss": 0.5139, + "step": 4557 + }, + { + "epoch": 5.098434004474273, + "grad_norm": 0.626021146774292, + "learning_rate": 2.8369894210593106e-07, + "loss": 0.5113, + "step": 4558 + }, + { + "epoch": 5.099552572706935, + "grad_norm": 0.608262300491333, + "learning_rate": 2.8300899593736885e-07, + "loss": 0.5195, + "step": 4559 + }, + { + "epoch": 5.100671140939597, + "grad_norm": 0.6315938234329224, + "learning_rate": 2.823198394138288e-07, + "loss": 0.5384, + "step": 4560 + }, + { + "epoch": 5.10178970917226, + "grad_norm": 0.6111140847206116, + "learning_rate": 2.8163147278077454e-07, + "loss": 0.5296, + "step": 4561 + }, + { + "epoch": 5.1029082774049215, + "grad_norm": 0.6279772520065308, + "learning_rate": 2.809438962833866e-07, + "loss": 0.5465, + "step": 4562 + }, + { + "epoch": 5.104026845637584, + "grad_norm": 0.628913402557373, + "learning_rate": 2.802571101665663e-07, + "loss": 0.5399, + "step": 4563 + }, + { + "epoch": 5.105145413870246, + "grad_norm": 0.6379182934761047, + "learning_rate": 2.7957111467493165e-07, + "loss": 0.5108, + "step": 4564 + }, + { + "epoch": 5.1062639821029085, + "grad_norm": 0.6148325204849243, + "learning_rate": 2.788859100528196e-07, + "loss": 0.5175, + "step": 4565 + }, + { + "epoch": 5.10738255033557, + "grad_norm": 0.6016440987586975, + "learning_rate": 2.78201496544285e-07, + "loss": 0.5153, + "step": 4566 + }, + { + "epoch": 5.108501118568233, + "grad_norm": 0.6261692643165588, + "learning_rate": 2.7751787439310257e-07, + "loss": 0.5306, + "step": 4567 + }, + { + "epoch": 5.109619686800895, + "grad_norm": 0.6274019479751587, + "learning_rate": 2.768350438427628e-07, + "loss": 0.509, + "step": 4568 + }, + { + "epoch": 5.110738255033557, + "grad_norm": 0.6449814438819885, + "learning_rate": 2.761530051364761e-07, + "loss": 0.5428, + "step": 4569 + }, + { + "epoch": 5.111856823266219, + "grad_norm": 0.6093098521232605, + "learning_rate": 2.7547175851717014e-07, + "loss": 0.5259, + "step": 4570 + }, + { + "epoch": 5.112975391498882, + "grad_norm": 0.6260678172111511, + "learning_rate": 2.7479130422749e-07, + "loss": 0.5123, + "step": 4571 + }, + { + "epoch": 5.114093959731544, + "grad_norm": 0.6170490384101868, + "learning_rate": 2.7411164250979947e-07, + "loss": 0.5148, + "step": 4572 + }, + { + "epoch": 5.115212527964206, + "grad_norm": 0.6244754195213318, + "learning_rate": 2.7343277360617953e-07, + "loss": 0.506, + "step": 4573 + }, + { + "epoch": 5.116331096196868, + "grad_norm": 0.6210300326347351, + "learning_rate": 2.727546977584286e-07, + "loss": 0.5432, + "step": 4574 + }, + { + "epoch": 5.117449664429531, + "grad_norm": 0.6308062076568604, + "learning_rate": 2.7207741520806266e-07, + "loss": 0.5345, + "step": 4575 + }, + { + "epoch": 5.118568232662192, + "grad_norm": 0.616058886051178, + "learning_rate": 2.71400926196316e-07, + "loss": 0.5058, + "step": 4576 + }, + { + "epoch": 5.119686800894854, + "grad_norm": 0.6124359965324402, + "learning_rate": 2.7072523096413896e-07, + "loss": 0.5034, + "step": 4577 + }, + { + "epoch": 5.120805369127517, + "grad_norm": 0.6011144518852234, + "learning_rate": 2.7005032975220056e-07, + "loss": 0.5402, + "step": 4578 + }, + { + "epoch": 5.121923937360179, + "grad_norm": 0.6484677791595459, + "learning_rate": 2.6937622280088585e-07, + "loss": 0.524, + "step": 4579 + }, + { + "epoch": 5.123042505592841, + "grad_norm": 0.634046196937561, + "learning_rate": 2.6870291035029724e-07, + "loss": 0.5391, + "step": 4580 + }, + { + "epoch": 5.124161073825503, + "grad_norm": 0.6243428587913513, + "learning_rate": 2.680303926402544e-07, + "loss": 0.5265, + "step": 4581 + }, + { + "epoch": 5.125279642058166, + "grad_norm": 0.6447023749351501, + "learning_rate": 2.673586699102937e-07, + "loss": 0.5422, + "step": 4582 + }, + { + "epoch": 5.126398210290827, + "grad_norm": 0.6163579225540161, + "learning_rate": 2.6668774239966887e-07, + "loss": 0.5056, + "step": 4583 + }, + { + "epoch": 5.12751677852349, + "grad_norm": 0.6006442904472351, + "learning_rate": 2.660176103473491e-07, + "loss": 0.5076, + "step": 4584 + }, + { + "epoch": 5.128635346756152, + "grad_norm": 0.635659396648407, + "learning_rate": 2.6534827399202266e-07, + "loss": 0.5225, + "step": 4585 + }, + { + "epoch": 5.1297539149888145, + "grad_norm": 0.6169342994689941, + "learning_rate": 2.646797335720913e-07, + "loss": 0.5129, + "step": 4586 + }, + { + "epoch": 5.130872483221476, + "grad_norm": 0.5994575619697571, + "learning_rate": 2.6401198932567624e-07, + "loss": 0.4989, + "step": 4587 + }, + { + "epoch": 5.131991051454139, + "grad_norm": 0.6238452196121216, + "learning_rate": 2.633450414906133e-07, + "loss": 0.5221, + "step": 4588 + }, + { + "epoch": 5.133109619686801, + "grad_norm": 0.6163538098335266, + "learning_rate": 2.62678890304455e-07, + "loss": 0.5199, + "step": 4589 + }, + { + "epoch": 5.134228187919463, + "grad_norm": 0.628524661064148, + "learning_rate": 2.620135360044704e-07, + "loss": 0.5354, + "step": 4590 + }, + { + "epoch": 5.135346756152125, + "grad_norm": 0.6159660816192627, + "learning_rate": 2.6134897882764383e-07, + "loss": 0.5244, + "step": 4591 + }, + { + "epoch": 5.136465324384788, + "grad_norm": 0.6295143365859985, + "learning_rate": 2.6068521901067734e-07, + "loss": 0.5328, + "step": 4592 + }, + { + "epoch": 5.1375838926174495, + "grad_norm": 0.6538597345352173, + "learning_rate": 2.6002225678998713e-07, + "loss": 0.5327, + "step": 4593 + }, + { + "epoch": 5.138702460850112, + "grad_norm": 0.6527684330940247, + "learning_rate": 2.593600924017073e-07, + "loss": 0.5297, + "step": 4594 + }, + { + "epoch": 5.139821029082774, + "grad_norm": 0.6243908405303955, + "learning_rate": 2.586987260816859e-07, + "loss": 0.5153, + "step": 4595 + }, + { + "epoch": 5.140939597315437, + "grad_norm": 0.5935629606246948, + "learning_rate": 2.580381580654884e-07, + "loss": 0.5128, + "step": 4596 + }, + { + "epoch": 5.142058165548098, + "grad_norm": 0.622734785079956, + "learning_rate": 2.573783885883943e-07, + "loss": 0.5378, + "step": 4597 + }, + { + "epoch": 5.143176733780761, + "grad_norm": 0.6235083937644958, + "learning_rate": 2.567194178853996e-07, + "loss": 0.4891, + "step": 4598 + }, + { + "epoch": 5.144295302013423, + "grad_norm": 0.5983145833015442, + "learning_rate": 2.560612461912154e-07, + "loss": 0.5216, + "step": 4599 + }, + { + "epoch": 5.145413870246085, + "grad_norm": 0.6291846036911011, + "learning_rate": 2.554038737402692e-07, + "loss": 0.5299, + "step": 4600 + }, + { + "epoch": 5.146532438478747, + "grad_norm": 0.6259318590164185, + "learning_rate": 2.5474730076670286e-07, + "loss": 0.5391, + "step": 4601 + }, + { + "epoch": 5.14765100671141, + "grad_norm": 0.6033223271369934, + "learning_rate": 2.540915275043726e-07, + "loss": 0.4962, + "step": 4602 + }, + { + "epoch": 5.148769574944072, + "grad_norm": 0.6088421940803528, + "learning_rate": 2.5343655418685273e-07, + "loss": 0.4885, + "step": 4603 + }, + { + "epoch": 5.149888143176733, + "grad_norm": 0.6339602470397949, + "learning_rate": 2.527823810474289e-07, + "loss": 0.5312, + "step": 4604 + }, + { + "epoch": 5.151006711409396, + "grad_norm": 0.6375308632850647, + "learning_rate": 2.521290083191061e-07, + "loss": 0.5343, + "step": 4605 + }, + { + "epoch": 5.152125279642058, + "grad_norm": 0.6346525549888611, + "learning_rate": 2.514764362345989e-07, + "loss": 0.5335, + "step": 4606 + }, + { + "epoch": 5.1532438478747205, + "grad_norm": 0.6093592643737793, + "learning_rate": 2.508246650263418e-07, + "loss": 0.5008, + "step": 4607 + }, + { + "epoch": 5.154362416107382, + "grad_norm": 0.6261592507362366, + "learning_rate": 2.501736949264805e-07, + "loss": 0.5424, + "step": 4608 + }, + { + "epoch": 5.155480984340045, + "grad_norm": 0.6172440052032471, + "learning_rate": 2.4952352616687767e-07, + "loss": 0.5086, + "step": 4609 + }, + { + "epoch": 5.156599552572707, + "grad_norm": 0.6098453402519226, + "learning_rate": 2.488741589791088e-07, + "loss": 0.5218, + "step": 4610 + }, + { + "epoch": 5.157718120805369, + "grad_norm": 0.6186922788619995, + "learning_rate": 2.482255935944655e-07, + "loss": 0.5286, + "step": 4611 + }, + { + "epoch": 5.158836689038031, + "grad_norm": 0.6190816760063171, + "learning_rate": 2.4757783024395244e-07, + "loss": 0.5232, + "step": 4612 + }, + { + "epoch": 5.159955257270694, + "grad_norm": 0.620119571685791, + "learning_rate": 2.4693086915828915e-07, + "loss": 0.5234, + "step": 4613 + }, + { + "epoch": 5.1610738255033555, + "grad_norm": 0.6487640738487244, + "learning_rate": 2.462847105679095e-07, + "loss": 0.5261, + "step": 4614 + }, + { + "epoch": 5.162192393736018, + "grad_norm": 0.6284217834472656, + "learning_rate": 2.456393547029609e-07, + "loss": 0.5149, + "step": 4615 + }, + { + "epoch": 5.16331096196868, + "grad_norm": 0.6162627935409546, + "learning_rate": 2.4499480179330625e-07, + "loss": 0.5007, + "step": 4616 + }, + { + "epoch": 5.1644295302013425, + "grad_norm": 0.6179028749465942, + "learning_rate": 2.443510520685208e-07, + "loss": 0.5383, + "step": 4617 + }, + { + "epoch": 5.165548098434004, + "grad_norm": 0.6375982761383057, + "learning_rate": 2.4370810575789523e-07, + "loss": 0.5587, + "step": 4618 + }, + { + "epoch": 5.166666666666667, + "grad_norm": 0.6336742639541626, + "learning_rate": 2.4306596309043277e-07, + "loss": 0.5364, + "step": 4619 + }, + { + "epoch": 5.167785234899329, + "grad_norm": 0.6209999918937683, + "learning_rate": 2.4242462429485165e-07, + "loss": 0.5477, + "step": 4620 + }, + { + "epoch": 5.168903803131991, + "grad_norm": 0.6189097762107849, + "learning_rate": 2.417840895995824e-07, + "loss": 0.5055, + "step": 4621 + }, + { + "epoch": 5.170022371364653, + "grad_norm": 0.6433843374252319, + "learning_rate": 2.411443592327703e-07, + "loss": 0.5556, + "step": 4622 + }, + { + "epoch": 5.171140939597316, + "grad_norm": 0.6204437017440796, + "learning_rate": 2.405054334222734e-07, + "loss": 0.5199, + "step": 4623 + }, + { + "epoch": 5.172259507829978, + "grad_norm": 0.6061958074569702, + "learning_rate": 2.3986731239566326e-07, + "loss": 0.5145, + "step": 4624 + }, + { + "epoch": 5.17337807606264, + "grad_norm": 0.6350598335266113, + "learning_rate": 2.3922999638022583e-07, + "loss": 0.5245, + "step": 4625 + }, + { + "epoch": 5.174496644295302, + "grad_norm": 0.6327183246612549, + "learning_rate": 2.385934856029584e-07, + "loss": 0.5234, + "step": 4626 + }, + { + "epoch": 5.175615212527964, + "grad_norm": 0.6296136379241943, + "learning_rate": 2.3795778029057358e-07, + "loss": 0.5302, + "step": 4627 + }, + { + "epoch": 5.176733780760626, + "grad_norm": 0.6204007267951965, + "learning_rate": 2.3732288066949521e-07, + "loss": 0.5192, + "step": 4628 + }, + { + "epoch": 5.177852348993288, + "grad_norm": 0.6288585066795349, + "learning_rate": 2.3668878696586211e-07, + "loss": 0.5192, + "step": 4629 + }, + { + "epoch": 5.178970917225951, + "grad_norm": 0.6230462789535522, + "learning_rate": 2.3605549940552448e-07, + "loss": 0.5197, + "step": 4630 + }, + { + "epoch": 5.180089485458613, + "grad_norm": 0.6221714615821838, + "learning_rate": 2.3542301821404545e-07, + "loss": 0.5195, + "step": 4631 + }, + { + "epoch": 5.181208053691275, + "grad_norm": 0.6130632758140564, + "learning_rate": 2.347913436167018e-07, + "loss": 0.5019, + "step": 4632 + }, + { + "epoch": 5.182326621923937, + "grad_norm": 0.6087003350257874, + "learning_rate": 2.341604758384819e-07, + "loss": 0.4937, + "step": 4633 + }, + { + "epoch": 5.1834451901566, + "grad_norm": 0.6284063458442688, + "learning_rate": 2.3353041510408846e-07, + "loss": 0.5243, + "step": 4634 + }, + { + "epoch": 5.184563758389261, + "grad_norm": 0.6349790096282959, + "learning_rate": 2.3290116163793453e-07, + "loss": 0.5278, + "step": 4635 + }, + { + "epoch": 5.185682326621924, + "grad_norm": 0.6270334124565125, + "learning_rate": 2.3227271566414827e-07, + "loss": 0.5184, + "step": 4636 + }, + { + "epoch": 5.186800894854586, + "grad_norm": 0.6511318683624268, + "learning_rate": 2.316450774065679e-07, + "loss": 0.5415, + "step": 4637 + }, + { + "epoch": 5.1879194630872485, + "grad_norm": 0.6234907507896423, + "learning_rate": 2.3101824708874481e-07, + "loss": 0.5096, + "step": 4638 + }, + { + "epoch": 5.18903803131991, + "grad_norm": 0.6099608540534973, + "learning_rate": 2.3039222493394214e-07, + "loss": 0.5039, + "step": 4639 + }, + { + "epoch": 5.190156599552573, + "grad_norm": 0.6286628246307373, + "learning_rate": 2.2976701116513678e-07, + "loss": 0.5065, + "step": 4640 + }, + { + "epoch": 5.191275167785235, + "grad_norm": 0.6258089542388916, + "learning_rate": 2.2914260600501558e-07, + "loss": 0.5274, + "step": 4641 + }, + { + "epoch": 5.192393736017897, + "grad_norm": 0.6157236099243164, + "learning_rate": 2.285190096759793e-07, + "loss": 0.5206, + "step": 4642 + }, + { + "epoch": 5.193512304250559, + "grad_norm": 0.6404297351837158, + "learning_rate": 2.2789622240013937e-07, + "loss": 0.5556, + "step": 4643 + }, + { + "epoch": 5.194630872483222, + "grad_norm": 0.6148262023925781, + "learning_rate": 2.272742443993184e-07, + "loss": 0.5064, + "step": 4644 + }, + { + "epoch": 5.1957494407158835, + "grad_norm": 0.6217085719108582, + "learning_rate": 2.2665307589505304e-07, + "loss": 0.5059, + "step": 4645 + }, + { + "epoch": 5.196868008948546, + "grad_norm": 0.6438245177268982, + "learning_rate": 2.2603271710858992e-07, + "loss": 0.5066, + "step": 4646 + }, + { + "epoch": 5.197986577181208, + "grad_norm": 0.6285523176193237, + "learning_rate": 2.2541316826088732e-07, + "loss": 0.5395, + "step": 4647 + }, + { + "epoch": 5.199105145413871, + "grad_norm": 0.6250251531600952, + "learning_rate": 2.247944295726151e-07, + "loss": 0.5176, + "step": 4648 + }, + { + "epoch": 5.200223713646532, + "grad_norm": 0.6506896018981934, + "learning_rate": 2.241765012641556e-07, + "loss": 0.5354, + "step": 4649 + }, + { + "epoch": 5.201342281879195, + "grad_norm": 0.6318608522415161, + "learning_rate": 2.2355938355560103e-07, + "loss": 0.5114, + "step": 4650 + }, + { + "epoch": 5.202460850111857, + "grad_norm": 0.6211562156677246, + "learning_rate": 2.229430766667562e-07, + "loss": 0.5257, + "step": 4651 + }, + { + "epoch": 5.203579418344519, + "grad_norm": 0.6141037940979004, + "learning_rate": 2.22327580817136e-07, + "loss": 0.5138, + "step": 4652 + }, + { + "epoch": 5.204697986577181, + "grad_norm": 0.6190756559371948, + "learning_rate": 2.2171289622596765e-07, + "loss": 0.5424, + "step": 4653 + }, + { + "epoch": 5.205816554809843, + "grad_norm": 0.6211480498313904, + "learning_rate": 2.2109902311218845e-07, + "loss": 0.54, + "step": 4654 + }, + { + "epoch": 5.206935123042506, + "grad_norm": 0.6317529678344727, + "learning_rate": 2.2048596169444602e-07, + "loss": 0.5512, + "step": 4655 + }, + { + "epoch": 5.208053691275167, + "grad_norm": 0.609115481376648, + "learning_rate": 2.19873712191101e-07, + "loss": 0.5185, + "step": 4656 + }, + { + "epoch": 5.20917225950783, + "grad_norm": 0.6591163277626038, + "learning_rate": 2.1926227482022278e-07, + "loss": 0.5334, + "step": 4657 + }, + { + "epoch": 5.210290827740492, + "grad_norm": 0.6132237315177917, + "learning_rate": 2.1865164979959303e-07, + "loss": 0.5155, + "step": 4658 + }, + { + "epoch": 5.2114093959731544, + "grad_norm": 0.6162639856338501, + "learning_rate": 2.1804183734670277e-07, + "loss": 0.5393, + "step": 4659 + }, + { + "epoch": 5.212527964205816, + "grad_norm": 0.6103164553642273, + "learning_rate": 2.174328376787546e-07, + "loss": 0.5633, + "step": 4660 + }, + { + "epoch": 5.213646532438479, + "grad_norm": 0.6202261447906494, + "learning_rate": 2.168246510126615e-07, + "loss": 0.5246, + "step": 4661 + }, + { + "epoch": 5.214765100671141, + "grad_norm": 0.6211634278297424, + "learning_rate": 2.1621727756504645e-07, + "loss": 0.5675, + "step": 4662 + }, + { + "epoch": 5.215883668903803, + "grad_norm": 0.6296864748001099, + "learning_rate": 2.1561071755224205e-07, + "loss": 0.5253, + "step": 4663 + }, + { + "epoch": 5.217002237136465, + "grad_norm": 0.6110845804214478, + "learning_rate": 2.1500497119029324e-07, + "loss": 0.5203, + "step": 4664 + }, + { + "epoch": 5.218120805369128, + "grad_norm": 0.6271952986717224, + "learning_rate": 2.1440003869495379e-07, + "loss": 0.505, + "step": 4665 + }, + { + "epoch": 5.2192393736017895, + "grad_norm": 0.6295967698097229, + "learning_rate": 2.1379592028168694e-07, + "loss": 0.5306, + "step": 4666 + }, + { + "epoch": 5.220357941834452, + "grad_norm": 0.6073415875434875, + "learning_rate": 2.1319261616566776e-07, + "loss": 0.49, + "step": 4667 + }, + { + "epoch": 5.221476510067114, + "grad_norm": 0.6228402256965637, + "learning_rate": 2.1259012656177968e-07, + "loss": 0.5444, + "step": 4668 + }, + { + "epoch": 5.2225950782997765, + "grad_norm": 0.6021326780319214, + "learning_rate": 2.119884516846177e-07, + "loss": 0.4744, + "step": 4669 + }, + { + "epoch": 5.223713646532438, + "grad_norm": 0.6223728060722351, + "learning_rate": 2.113875917484845e-07, + "loss": 0.5332, + "step": 4670 + }, + { + "epoch": 5.224832214765101, + "grad_norm": 0.6258463859558105, + "learning_rate": 2.1078754696739423e-07, + "loss": 0.5248, + "step": 4671 + }, + { + "epoch": 5.225950782997763, + "grad_norm": 0.6426273584365845, + "learning_rate": 2.1018831755506925e-07, + "loss": 0.5311, + "step": 4672 + }, + { + "epoch": 5.227069351230425, + "grad_norm": 0.6181263327598572, + "learning_rate": 2.0958990372494353e-07, + "loss": 0.5261, + "step": 4673 + }, + { + "epoch": 5.228187919463087, + "grad_norm": 0.6250265836715698, + "learning_rate": 2.0899230569015828e-07, + "loss": 0.5326, + "step": 4674 + }, + { + "epoch": 5.22930648769575, + "grad_norm": 0.6190370321273804, + "learning_rate": 2.0839552366356625e-07, + "loss": 0.5214, + "step": 4675 + }, + { + "epoch": 5.230425055928412, + "grad_norm": 0.6263481974601746, + "learning_rate": 2.0779955785772772e-07, + "loss": 0.5257, + "step": 4676 + }, + { + "epoch": 5.231543624161074, + "grad_norm": 0.6063491106033325, + "learning_rate": 2.0720440848491287e-07, + "loss": 0.5221, + "step": 4677 + }, + { + "epoch": 5.232662192393736, + "grad_norm": 0.6234795451164246, + "learning_rate": 2.0661007575710185e-07, + "loss": 0.5298, + "step": 4678 + }, + { + "epoch": 5.233780760626399, + "grad_norm": 0.6292409300804138, + "learning_rate": 2.0601655988598312e-07, + "loss": 0.5201, + "step": 4679 + }, + { + "epoch": 5.23489932885906, + "grad_norm": 0.6103098392486572, + "learning_rate": 2.0542386108295454e-07, + "loss": 0.4977, + "step": 4680 + }, + { + "epoch": 5.236017897091722, + "grad_norm": 0.6124383807182312, + "learning_rate": 2.048319795591222e-07, + "loss": 0.517, + "step": 4681 + }, + { + "epoch": 5.237136465324385, + "grad_norm": 0.6371023654937744, + "learning_rate": 2.042409155253025e-07, + "loss": 0.5285, + "step": 4682 + }, + { + "epoch": 5.238255033557047, + "grad_norm": 0.6090527772903442, + "learning_rate": 2.0365066919201897e-07, + "loss": 0.5252, + "step": 4683 + }, + { + "epoch": 5.239373601789709, + "grad_norm": 0.6092851161956787, + "learning_rate": 2.0306124076950595e-07, + "loss": 0.5143, + "step": 4684 + }, + { + "epoch": 5.240492170022371, + "grad_norm": 0.6093874573707581, + "learning_rate": 2.0247263046770464e-07, + "loss": 0.5438, + "step": 4685 + }, + { + "epoch": 5.241610738255034, + "grad_norm": 0.6190868616104126, + "learning_rate": 2.0188483849626573e-07, + "loss": 0.5319, + "step": 4686 + }, + { + "epoch": 5.242729306487695, + "grad_norm": 0.626626193523407, + "learning_rate": 2.012978650645478e-07, + "loss": 0.5468, + "step": 4687 + }, + { + "epoch": 5.243847874720358, + "grad_norm": 0.6198920011520386, + "learning_rate": 2.0071171038161808e-07, + "loss": 0.5164, + "step": 4688 + }, + { + "epoch": 5.24496644295302, + "grad_norm": 0.5967995524406433, + "learning_rate": 2.0012637465625345e-07, + "loss": 0.5129, + "step": 4689 + }, + { + "epoch": 5.2460850111856825, + "grad_norm": 0.6271083354949951, + "learning_rate": 1.9954185809693717e-07, + "loss": 0.5388, + "step": 4690 + }, + { + "epoch": 5.247203579418344, + "grad_norm": 0.6470271944999695, + "learning_rate": 1.9895816091186183e-07, + "loss": 0.5386, + "step": 4691 + }, + { + "epoch": 5.248322147651007, + "grad_norm": 0.6154986023902893, + "learning_rate": 1.9837528330892781e-07, + "loss": 0.4991, + "step": 4692 + }, + { + "epoch": 5.249440715883669, + "grad_norm": 0.6213737726211548, + "learning_rate": 1.9779322549574459e-07, + "loss": 0.5355, + "step": 4693 + }, + { + "epoch": 5.250559284116331, + "grad_norm": 0.6404212713241577, + "learning_rate": 1.9721198767962775e-07, + "loss": 0.512, + "step": 4694 + }, + { + "epoch": 5.251677852348993, + "grad_norm": 0.6308888792991638, + "learning_rate": 1.966315700676022e-07, + "loss": 0.5218, + "step": 4695 + }, + { + "epoch": 5.252796420581656, + "grad_norm": 0.6243783235549927, + "learning_rate": 1.9605197286640037e-07, + "loss": 0.5288, + "step": 4696 + }, + { + "epoch": 5.2539149888143175, + "grad_norm": 0.6394689679145813, + "learning_rate": 1.9547319628246187e-07, + "loss": 0.4939, + "step": 4697 + }, + { + "epoch": 5.25503355704698, + "grad_norm": 0.6283667683601379, + "learning_rate": 1.9489524052193564e-07, + "loss": 0.5372, + "step": 4698 + }, + { + "epoch": 5.256152125279642, + "grad_norm": 0.6206676363945007, + "learning_rate": 1.9431810579067624e-07, + "loss": 0.5157, + "step": 4699 + }, + { + "epoch": 5.257270693512305, + "grad_norm": 0.6401064991950989, + "learning_rate": 1.9374179229424779e-07, + "loss": 0.548, + "step": 4700 + }, + { + "epoch": 5.258389261744966, + "grad_norm": 0.6641870141029358, + "learning_rate": 1.9316630023791972e-07, + "loss": 0.536, + "step": 4701 + }, + { + "epoch": 5.259507829977629, + "grad_norm": 0.6164604425430298, + "learning_rate": 1.9259162982667113e-07, + "loss": 0.5166, + "step": 4702 + }, + { + "epoch": 5.260626398210291, + "grad_norm": 0.6029345989227295, + "learning_rate": 1.9201778126518716e-07, + "loss": 0.497, + "step": 4703 + }, + { + "epoch": 5.261744966442953, + "grad_norm": 0.6212674975395203, + "learning_rate": 1.9144475475786016e-07, + "loss": 0.5212, + "step": 4704 + }, + { + "epoch": 5.262863534675615, + "grad_norm": 0.6274245977401733, + "learning_rate": 1.908725505087894e-07, + "loss": 0.5531, + "step": 4705 + }, + { + "epoch": 5.263982102908278, + "grad_norm": 0.6184214949607849, + "learning_rate": 1.9030116872178317e-07, + "loss": 0.5117, + "step": 4706 + }, + { + "epoch": 5.26510067114094, + "grad_norm": 0.6335828304290771, + "learning_rate": 1.8973060960035483e-07, + "loss": 0.5502, + "step": 4707 + }, + { + "epoch": 5.266219239373601, + "grad_norm": 0.6258768439292908, + "learning_rate": 1.8916087334772515e-07, + "loss": 0.5015, + "step": 4708 + }, + { + "epoch": 5.267337807606264, + "grad_norm": 0.62689608335495, + "learning_rate": 1.8859196016682262e-07, + "loss": 0.5027, + "step": 4709 + }, + { + "epoch": 5.268456375838926, + "grad_norm": 0.6329622268676758, + "learning_rate": 1.8802387026028157e-07, + "loss": 0.5447, + "step": 4710 + }, + { + "epoch": 5.269574944071588, + "grad_norm": 0.6214032173156738, + "learning_rate": 1.8745660383044402e-07, + "loss": 0.5324, + "step": 4711 + }, + { + "epoch": 5.27069351230425, + "grad_norm": 0.5974287986755371, + "learning_rate": 1.868901610793572e-07, + "loss": 0.4841, + "step": 4712 + }, + { + "epoch": 5.271812080536913, + "grad_norm": 0.6335077285766602, + "learning_rate": 1.86324542208777e-07, + "loss": 0.5387, + "step": 4713 + }, + { + "epoch": 5.272930648769575, + "grad_norm": 0.6301265954971313, + "learning_rate": 1.8575974742016418e-07, + "loss": 0.5193, + "step": 4714 + }, + { + "epoch": 5.274049217002237, + "grad_norm": 0.6316810250282288, + "learning_rate": 1.8519577691468726e-07, + "loss": 0.5205, + "step": 4715 + }, + { + "epoch": 5.275167785234899, + "grad_norm": 0.6292200088500977, + "learning_rate": 1.8463263089322003e-07, + "loss": 0.5202, + "step": 4716 + }, + { + "epoch": 5.276286353467562, + "grad_norm": 0.6736591458320618, + "learning_rate": 1.8407030955634343e-07, + "loss": 0.5316, + "step": 4717 + }, + { + "epoch": 5.2774049217002235, + "grad_norm": 0.6298484802246094, + "learning_rate": 1.8350881310434444e-07, + "loss": 0.522, + "step": 4718 + }, + { + "epoch": 5.278523489932886, + "grad_norm": 0.6381715536117554, + "learning_rate": 1.829481417372156e-07, + "loss": 0.5267, + "step": 4719 + }, + { + "epoch": 5.279642058165548, + "grad_norm": 0.6257025599479675, + "learning_rate": 1.823882956546566e-07, + "loss": 0.508, + "step": 4720 + }, + { + "epoch": 5.2807606263982105, + "grad_norm": 0.6164595484733582, + "learning_rate": 1.8182927505607212e-07, + "loss": 0.5091, + "step": 4721 + }, + { + "epoch": 5.281879194630872, + "grad_norm": 0.6251274943351746, + "learning_rate": 1.8127108014057426e-07, + "loss": 0.5267, + "step": 4722 + }, + { + "epoch": 5.282997762863535, + "grad_norm": 0.6321995854377747, + "learning_rate": 1.8071371110697928e-07, + "loss": 0.5533, + "step": 4723 + }, + { + "epoch": 5.284116331096197, + "grad_norm": 0.635021984577179, + "learning_rate": 1.8015716815381085e-07, + "loss": 0.5389, + "step": 4724 + }, + { + "epoch": 5.285234899328859, + "grad_norm": 0.6470574736595154, + "learning_rate": 1.7960145147929737e-07, + "loss": 0.5306, + "step": 4725 + }, + { + "epoch": 5.286353467561521, + "grad_norm": 0.6163790822029114, + "learning_rate": 1.7904656128137354e-07, + "loss": 0.5021, + "step": 4726 + }, + { + "epoch": 5.287472035794184, + "grad_norm": 0.6044324636459351, + "learning_rate": 1.7849249775767964e-07, + "loss": 0.5105, + "step": 4727 + }, + { + "epoch": 5.2885906040268456, + "grad_norm": 0.6140927076339722, + "learning_rate": 1.779392611055608e-07, + "loss": 0.524, + "step": 4728 + }, + { + "epoch": 5.289709172259508, + "grad_norm": 0.6188439130783081, + "learning_rate": 1.7738685152206835e-07, + "loss": 0.541, + "step": 4729 + }, + { + "epoch": 5.29082774049217, + "grad_norm": 0.6163668632507324, + "learning_rate": 1.768352692039585e-07, + "loss": 0.5051, + "step": 4730 + }, + { + "epoch": 5.291946308724833, + "grad_norm": 0.6291584372520447, + "learning_rate": 1.7628451434769377e-07, + "loss": 0.5243, + "step": 4731 + }, + { + "epoch": 5.293064876957494, + "grad_norm": 0.6155591607093811, + "learning_rate": 1.7573458714944064e-07, + "loss": 0.5363, + "step": 4732 + }, + { + "epoch": 5.294183445190157, + "grad_norm": 0.6250618696212769, + "learning_rate": 1.751854878050721e-07, + "loss": 0.5352, + "step": 4733 + }, + { + "epoch": 5.295302013422819, + "grad_norm": 0.616830587387085, + "learning_rate": 1.7463721651016563e-07, + "loss": 0.5156, + "step": 4734 + }, + { + "epoch": 5.296420581655481, + "grad_norm": 0.6188386082649231, + "learning_rate": 1.7408977346000333e-07, + "loss": 0.534, + "step": 4735 + }, + { + "epoch": 5.297539149888143, + "grad_norm": 0.6504315733909607, + "learning_rate": 1.7354315884957285e-07, + "loss": 0.5417, + "step": 4736 + }, + { + "epoch": 5.298657718120805, + "grad_norm": 0.6258623600006104, + "learning_rate": 1.7299737287356705e-07, + "loss": 0.5223, + "step": 4737 + }, + { + "epoch": 5.299776286353468, + "grad_norm": 0.6221994757652283, + "learning_rate": 1.7245241572638294e-07, + "loss": 0.5166, + "step": 4738 + }, + { + "epoch": 5.300894854586129, + "grad_norm": 0.6350775957107544, + "learning_rate": 1.7190828760212246e-07, + "loss": 0.538, + "step": 4739 + }, + { + "epoch": 5.302013422818792, + "grad_norm": 0.6318569779396057, + "learning_rate": 1.713649886945934e-07, + "loss": 0.5107, + "step": 4740 + }, + { + "epoch": 5.303131991051454, + "grad_norm": 0.6305361986160278, + "learning_rate": 1.7082251919730592e-07, + "loss": 0.493, + "step": 4741 + }, + { + "epoch": 5.3042505592841165, + "grad_norm": 0.6508610248565674, + "learning_rate": 1.7028087930347743e-07, + "loss": 0.5399, + "step": 4742 + }, + { + "epoch": 5.305369127516778, + "grad_norm": 0.6189182996749878, + "learning_rate": 1.6974006920602803e-07, + "loss": 0.5183, + "step": 4743 + }, + { + "epoch": 5.306487695749441, + "grad_norm": 0.6101722121238708, + "learning_rate": 1.6920008909758257e-07, + "loss": 0.5187, + "step": 4744 + }, + { + "epoch": 5.307606263982103, + "grad_norm": 0.6149749159812927, + "learning_rate": 1.6866093917047022e-07, + "loss": 0.4981, + "step": 4745 + }, + { + "epoch": 5.308724832214765, + "grad_norm": 0.6225912570953369, + "learning_rate": 1.6812261961672572e-07, + "loss": 0.5166, + "step": 4746 + }, + { + "epoch": 5.309843400447427, + "grad_norm": 0.6397315263748169, + "learning_rate": 1.6758513062808567e-07, + "loss": 0.5233, + "step": 4747 + }, + { + "epoch": 5.31096196868009, + "grad_norm": 0.6323255896568298, + "learning_rate": 1.6704847239599364e-07, + "loss": 0.5277, + "step": 4748 + }, + { + "epoch": 5.3120805369127515, + "grad_norm": 0.6285826563835144, + "learning_rate": 1.6651264511159504e-07, + "loss": 0.5138, + "step": 4749 + }, + { + "epoch": 5.313199105145414, + "grad_norm": 0.606264591217041, + "learning_rate": 1.6597764896573998e-07, + "loss": 0.5064, + "step": 4750 + }, + { + "epoch": 5.314317673378076, + "grad_norm": 0.6257588267326355, + "learning_rate": 1.6544348414898327e-07, + "loss": 0.5372, + "step": 4751 + }, + { + "epoch": 5.315436241610739, + "grad_norm": 0.6103993654251099, + "learning_rate": 1.6491015085158297e-07, + "loss": 0.5143, + "step": 4752 + }, + { + "epoch": 5.3165548098434, + "grad_norm": 0.625700831413269, + "learning_rate": 1.6437764926350074e-07, + "loss": 0.5086, + "step": 4753 + }, + { + "epoch": 5.317673378076063, + "grad_norm": 0.628886878490448, + "learning_rate": 1.6384597957440213e-07, + "loss": 0.5026, + "step": 4754 + }, + { + "epoch": 5.318791946308725, + "grad_norm": 0.628070056438446, + "learning_rate": 1.6331514197365726e-07, + "loss": 0.5405, + "step": 4755 + }, + { + "epoch": 5.319910514541387, + "grad_norm": 0.6196474432945251, + "learning_rate": 1.627851366503383e-07, + "loss": 0.5053, + "step": 4756 + }, + { + "epoch": 5.321029082774049, + "grad_norm": 0.6220239400863647, + "learning_rate": 1.6225596379322305e-07, + "loss": 0.5128, + "step": 4757 + }, + { + "epoch": 5.322147651006711, + "grad_norm": 0.6231882572174072, + "learning_rate": 1.6172762359079054e-07, + "loss": 0.4946, + "step": 4758 + }, + { + "epoch": 5.323266219239374, + "grad_norm": 0.6205283999443054, + "learning_rate": 1.6120011623122545e-07, + "loss": 0.5187, + "step": 4759 + }, + { + "epoch": 5.324384787472036, + "grad_norm": 0.6251282095909119, + "learning_rate": 1.6067344190241335e-07, + "loss": 0.551, + "step": 4760 + }, + { + "epoch": 5.325503355704698, + "grad_norm": 0.612058699131012, + "learning_rate": 1.60147600791945e-07, + "loss": 0.5294, + "step": 4761 + }, + { + "epoch": 5.32662192393736, + "grad_norm": 0.6167812943458557, + "learning_rate": 1.5962259308711398e-07, + "loss": 0.5269, + "step": 4762 + }, + { + "epoch": 5.327740492170022, + "grad_norm": 0.631706714630127, + "learning_rate": 1.5909841897491647e-07, + "loss": 0.5226, + "step": 4763 + }, + { + "epoch": 5.328859060402684, + "grad_norm": 0.6339399218559265, + "learning_rate": 1.5857507864205285e-07, + "loss": 0.5339, + "step": 4764 + }, + { + "epoch": 5.329977628635347, + "grad_norm": 0.624181866645813, + "learning_rate": 1.5805257227492487e-07, + "loss": 0.5376, + "step": 4765 + }, + { + "epoch": 5.331096196868009, + "grad_norm": 0.6312966346740723, + "learning_rate": 1.5753090005963924e-07, + "loss": 0.5301, + "step": 4766 + }, + { + "epoch": 5.332214765100671, + "grad_norm": 0.6200932860374451, + "learning_rate": 1.5701006218200394e-07, + "loss": 0.493, + "step": 4767 + }, + { + "epoch": 5.333333333333333, + "grad_norm": 0.6275931000709534, + "learning_rate": 1.5649005882753003e-07, + "loss": 0.5149, + "step": 4768 + }, + { + "epoch": 5.334451901565996, + "grad_norm": 0.6249051094055176, + "learning_rate": 1.559708901814319e-07, + "loss": 0.5189, + "step": 4769 + }, + { + "epoch": 5.3355704697986575, + "grad_norm": 0.6257677674293518, + "learning_rate": 1.554525564286266e-07, + "loss": 0.5386, + "step": 4770 + }, + { + "epoch": 5.33668903803132, + "grad_norm": 0.6254323124885559, + "learning_rate": 1.549350577537334e-07, + "loss": 0.5187, + "step": 4771 + }, + { + "epoch": 5.337807606263982, + "grad_norm": 0.6143620610237122, + "learning_rate": 1.5441839434107404e-07, + "loss": 0.5074, + "step": 4772 + }, + { + "epoch": 5.3389261744966445, + "grad_norm": 0.6078446507453918, + "learning_rate": 1.5390256637467376e-07, + "loss": 0.5015, + "step": 4773 + }, + { + "epoch": 5.340044742729306, + "grad_norm": 0.6333840489387512, + "learning_rate": 1.5338757403825843e-07, + "loss": 0.539, + "step": 4774 + }, + { + "epoch": 5.341163310961969, + "grad_norm": 0.6052622199058533, + "learning_rate": 1.5287341751525853e-07, + "loss": 0.5245, + "step": 4775 + }, + { + "epoch": 5.342281879194631, + "grad_norm": 0.6200132966041565, + "learning_rate": 1.5236009698880532e-07, + "loss": 0.5077, + "step": 4776 + }, + { + "epoch": 5.343400447427293, + "grad_norm": 0.6504286527633667, + "learning_rate": 1.5184761264173232e-07, + "loss": 0.5431, + "step": 4777 + }, + { + "epoch": 5.344519015659955, + "grad_norm": 0.6169535517692566, + "learning_rate": 1.5133596465657513e-07, + "loss": 0.5196, + "step": 4778 + }, + { + "epoch": 5.345637583892618, + "grad_norm": 0.598827600479126, + "learning_rate": 1.50825153215573e-07, + "loss": 0.4843, + "step": 4779 + }, + { + "epoch": 5.3467561521252795, + "grad_norm": 0.6270548105239868, + "learning_rate": 1.5031517850066512e-07, + "loss": 0.5378, + "step": 4780 + }, + { + "epoch": 5.347874720357942, + "grad_norm": 0.6319693326950073, + "learning_rate": 1.4980604069349454e-07, + "loss": 0.5293, + "step": 4781 + }, + { + "epoch": 5.348993288590604, + "grad_norm": 0.6020025610923767, + "learning_rate": 1.4929773997540476e-07, + "loss": 0.5067, + "step": 4782 + }, + { + "epoch": 5.350111856823267, + "grad_norm": 0.6203758120536804, + "learning_rate": 1.4879027652744182e-07, + "loss": 0.5321, + "step": 4783 + }, + { + "epoch": 5.351230425055928, + "grad_norm": 0.6158334016799927, + "learning_rate": 1.482836505303531e-07, + "loss": 0.5011, + "step": 4784 + }, + { + "epoch": 5.35234899328859, + "grad_norm": 0.6151765584945679, + "learning_rate": 1.4777786216458813e-07, + "loss": 0.5147, + "step": 4785 + }, + { + "epoch": 5.353467561521253, + "grad_norm": 0.6194100975990295, + "learning_rate": 1.4727291161029832e-07, + "loss": 0.5256, + "step": 4786 + }, + { + "epoch": 5.3545861297539155, + "grad_norm": 0.6250581741333008, + "learning_rate": 1.4676879904733542e-07, + "loss": 0.5096, + "step": 4787 + }, + { + "epoch": 5.355704697986577, + "grad_norm": 0.6337870955467224, + "learning_rate": 1.4626552465525496e-07, + "loss": 0.5159, + "step": 4788 + }, + { + "epoch": 5.356823266219239, + "grad_norm": 0.6248489022254944, + "learning_rate": 1.4576308861331129e-07, + "loss": 0.5128, + "step": 4789 + }, + { + "epoch": 5.357941834451902, + "grad_norm": 0.6253764033317566, + "learning_rate": 1.4526149110046266e-07, + "loss": 0.53, + "step": 4790 + }, + { + "epoch": 5.359060402684563, + "grad_norm": 0.6273208260536194, + "learning_rate": 1.4476073229536647e-07, + "loss": 0.5059, + "step": 4791 + }, + { + "epoch": 5.360178970917226, + "grad_norm": 0.6125726103782654, + "learning_rate": 1.442608123763828e-07, + "loss": 0.5013, + "step": 4792 + }, + { + "epoch": 5.361297539149888, + "grad_norm": 0.6287118196487427, + "learning_rate": 1.4376173152157196e-07, + "loss": 0.5008, + "step": 4793 + }, + { + "epoch": 5.3624161073825505, + "grad_norm": 0.6248438358306885, + "learning_rate": 1.4326348990869626e-07, + "loss": 0.5378, + "step": 4794 + }, + { + "epoch": 5.363534675615212, + "grad_norm": 0.6230807900428772, + "learning_rate": 1.4276608771521926e-07, + "loss": 0.506, + "step": 4795 + }, + { + "epoch": 5.364653243847875, + "grad_norm": 0.6161750555038452, + "learning_rate": 1.42269525118304e-07, + "loss": 0.53, + "step": 4796 + }, + { + "epoch": 5.365771812080537, + "grad_norm": 0.6246821880340576, + "learning_rate": 1.4177380229481647e-07, + "loss": 0.5102, + "step": 4797 + }, + { + "epoch": 5.366890380313199, + "grad_norm": 0.6192301511764526, + "learning_rate": 1.412789194213221e-07, + "loss": 0.5233, + "step": 4798 + }, + { + "epoch": 5.368008948545861, + "grad_norm": 0.5943834185600281, + "learning_rate": 1.407848766740877e-07, + "loss": 0.5161, + "step": 4799 + }, + { + "epoch": 5.369127516778524, + "grad_norm": 0.6317890286445618, + "learning_rate": 1.4029167422908107e-07, + "loss": 0.5307, + "step": 4800 + }, + { + "epoch": 5.3702460850111855, + "grad_norm": 0.6320303678512573, + "learning_rate": 1.3979931226197036e-07, + "loss": 0.5217, + "step": 4801 + }, + { + "epoch": 5.371364653243848, + "grad_norm": 0.6137423515319824, + "learning_rate": 1.3930779094812413e-07, + "loss": 0.5047, + "step": 4802 + }, + { + "epoch": 5.37248322147651, + "grad_norm": 0.628461480140686, + "learning_rate": 1.3881711046261154e-07, + "loss": 0.5051, + "step": 4803 + }, + { + "epoch": 5.373601789709173, + "grad_norm": 0.6301923394203186, + "learning_rate": 1.3832727098020333e-07, + "loss": 0.5361, + "step": 4804 + }, + { + "epoch": 5.374720357941834, + "grad_norm": 0.6131333112716675, + "learning_rate": 1.378382726753691e-07, + "loss": 0.5276, + "step": 4805 + }, + { + "epoch": 5.375838926174497, + "grad_norm": 0.6489636301994324, + "learning_rate": 1.3735011572228036e-07, + "loss": 0.5266, + "step": 4806 + }, + { + "epoch": 5.376957494407159, + "grad_norm": 0.6167519688606262, + "learning_rate": 1.3686280029480774e-07, + "loss": 0.5295, + "step": 4807 + }, + { + "epoch": 5.378076062639821, + "grad_norm": 0.6023221611976624, + "learning_rate": 1.3637632656652326e-07, + "loss": 0.5043, + "step": 4808 + }, + { + "epoch": 5.379194630872483, + "grad_norm": 0.6256252527236938, + "learning_rate": 1.358906947106975e-07, + "loss": 0.542, + "step": 4809 + }, + { + "epoch": 5.380313199105146, + "grad_norm": 0.6184626817703247, + "learning_rate": 1.3540590490030315e-07, + "loss": 0.5152, + "step": 4810 + }, + { + "epoch": 5.381431767337808, + "grad_norm": 0.6357247829437256, + "learning_rate": 1.3492195730801127e-07, + "loss": 0.514, + "step": 4811 + }, + { + "epoch": 5.382550335570469, + "grad_norm": 0.6323006749153137, + "learning_rate": 1.344388521061943e-07, + "loss": 0.5317, + "step": 4812 + }, + { + "epoch": 5.383668903803132, + "grad_norm": 0.6259229779243469, + "learning_rate": 1.3395658946692398e-07, + "loss": 0.548, + "step": 4813 + }, + { + "epoch": 5.384787472035794, + "grad_norm": 0.6062596440315247, + "learning_rate": 1.3347516956197126e-07, + "loss": 0.5056, + "step": 4814 + }, + { + "epoch": 5.385906040268456, + "grad_norm": 0.6150470972061157, + "learning_rate": 1.3299459256280872e-07, + "loss": 0.5003, + "step": 4815 + }, + { + "epoch": 5.387024608501118, + "grad_norm": 0.6508671641349792, + "learning_rate": 1.325148586406072e-07, + "loss": 0.5284, + "step": 4816 + }, + { + "epoch": 5.388143176733781, + "grad_norm": 0.6259085536003113, + "learning_rate": 1.3203596796623753e-07, + "loss": 0.5253, + "step": 4817 + }, + { + "epoch": 5.389261744966443, + "grad_norm": 0.627656877040863, + "learning_rate": 1.3155792071027018e-07, + "loss": 0.5084, + "step": 4818 + }, + { + "epoch": 5.390380313199105, + "grad_norm": 0.629426896572113, + "learning_rate": 1.3108071704297622e-07, + "loss": 0.543, + "step": 4819 + }, + { + "epoch": 5.391498881431767, + "grad_norm": 0.629905104637146, + "learning_rate": 1.3060435713432433e-07, + "loss": 0.5056, + "step": 4820 + }, + { + "epoch": 5.39261744966443, + "grad_norm": 0.6060286164283752, + "learning_rate": 1.30128841153985e-07, + "loss": 0.4902, + "step": 4821 + }, + { + "epoch": 5.3937360178970915, + "grad_norm": 0.6170443892478943, + "learning_rate": 1.2965416927132602e-07, + "loss": 0.5072, + "step": 4822 + }, + { + "epoch": 5.394854586129754, + "grad_norm": 0.6452808976173401, + "learning_rate": 1.2918034165541577e-07, + "loss": 0.5534, + "step": 4823 + }, + { + "epoch": 5.395973154362416, + "grad_norm": 0.6226423978805542, + "learning_rate": 1.2870735847502176e-07, + "loss": 0.4934, + "step": 4824 + }, + { + "epoch": 5.3970917225950785, + "grad_norm": 0.6175986528396606, + "learning_rate": 1.2823521989861033e-07, + "loss": 0.5123, + "step": 4825 + }, + { + "epoch": 5.39821029082774, + "grad_norm": 0.6339970827102661, + "learning_rate": 1.2776392609434695e-07, + "loss": 0.5362, + "step": 4826 + }, + { + "epoch": 5.399328859060403, + "grad_norm": 0.6425620913505554, + "learning_rate": 1.2729347723009656e-07, + "loss": 0.514, + "step": 4827 + }, + { + "epoch": 5.400447427293065, + "grad_norm": 0.6049675345420837, + "learning_rate": 1.268238734734231e-07, + "loss": 0.5319, + "step": 4828 + }, + { + "epoch": 5.401565995525727, + "grad_norm": 0.6309245228767395, + "learning_rate": 1.2635511499158953e-07, + "loss": 0.5224, + "step": 4829 + }, + { + "epoch": 5.402684563758389, + "grad_norm": 0.6084965467453003, + "learning_rate": 1.258872019515575e-07, + "loss": 0.5181, + "step": 4830 + }, + { + "epoch": 5.403803131991052, + "grad_norm": 0.6367757320404053, + "learning_rate": 1.2542013451998763e-07, + "loss": 0.5162, + "step": 4831 + }, + { + "epoch": 5.4049217002237135, + "grad_norm": 0.6259651780128479, + "learning_rate": 1.2495391286323988e-07, + "loss": 0.5339, + "step": 4832 + }, + { + "epoch": 5.406040268456376, + "grad_norm": 0.6212557554244995, + "learning_rate": 1.2448853714737226e-07, + "loss": 0.5205, + "step": 4833 + }, + { + "epoch": 5.407158836689038, + "grad_norm": 0.6382363438606262, + "learning_rate": 1.240240075381413e-07, + "loss": 0.5192, + "step": 4834 + }, + { + "epoch": 5.408277404921701, + "grad_norm": 0.6295331716537476, + "learning_rate": 1.2356032420100278e-07, + "loss": 0.5283, + "step": 4835 + }, + { + "epoch": 5.409395973154362, + "grad_norm": 0.6357342600822449, + "learning_rate": 1.2309748730111092e-07, + "loss": 0.505, + "step": 4836 + }, + { + "epoch": 5.410514541387025, + "grad_norm": 0.6169006824493408, + "learning_rate": 1.2263549700331835e-07, + "loss": 0.5015, + "step": 4837 + }, + { + "epoch": 5.411633109619687, + "grad_norm": 0.6178798675537109, + "learning_rate": 1.2217435347217587e-07, + "loss": 0.5051, + "step": 4838 + }, + { + "epoch": 5.412751677852349, + "grad_norm": 0.6167458891868591, + "learning_rate": 1.2171405687193383e-07, + "loss": 0.5188, + "step": 4839 + }, + { + "epoch": 5.413870246085011, + "grad_norm": 0.6213712096214294, + "learning_rate": 1.2125460736653939e-07, + "loss": 0.5557, + "step": 4840 + }, + { + "epoch": 5.414988814317673, + "grad_norm": 0.629782497882843, + "learning_rate": 1.2079600511963886e-07, + "loss": 0.5235, + "step": 4841 + }, + { + "epoch": 5.416107382550336, + "grad_norm": 0.6333674788475037, + "learning_rate": 1.203382502945763e-07, + "loss": 0.5376, + "step": 4842 + }, + { + "epoch": 5.417225950782997, + "grad_norm": 0.629169225692749, + "learning_rate": 1.1988134305439464e-07, + "loss": 0.5072, + "step": 4843 + }, + { + "epoch": 5.41834451901566, + "grad_norm": 0.6322267055511475, + "learning_rate": 1.194252835618348e-07, + "loss": 0.5126, + "step": 4844 + }, + { + "epoch": 5.419463087248322, + "grad_norm": 0.6189628839492798, + "learning_rate": 1.1897007197933435e-07, + "loss": 0.501, + "step": 4845 + }, + { + "epoch": 5.4205816554809845, + "grad_norm": 0.5891447067260742, + "learning_rate": 1.1851570846903138e-07, + "loss": 0.4873, + "step": 4846 + }, + { + "epoch": 5.421700223713646, + "grad_norm": 0.6191017627716064, + "learning_rate": 1.1806219319275918e-07, + "loss": 0.5134, + "step": 4847 + }, + { + "epoch": 5.422818791946309, + "grad_norm": 0.62992262840271, + "learning_rate": 1.1760952631205136e-07, + "loss": 0.522, + "step": 4848 + }, + { + "epoch": 5.423937360178971, + "grad_norm": 0.6142054796218872, + "learning_rate": 1.171577079881378e-07, + "loss": 0.5203, + "step": 4849 + }, + { + "epoch": 5.425055928411633, + "grad_norm": 0.6300192475318909, + "learning_rate": 1.1670673838194646e-07, + "loss": 0.5258, + "step": 4850 + }, + { + "epoch": 5.426174496644295, + "grad_norm": 0.6284183263778687, + "learning_rate": 1.1625661765410274e-07, + "loss": 0.5122, + "step": 4851 + }, + { + "epoch": 5.427293064876958, + "grad_norm": 0.6198375821113586, + "learning_rate": 1.1580734596493115e-07, + "loss": 0.4913, + "step": 4852 + }, + { + "epoch": 5.4284116331096195, + "grad_norm": 0.6317366361618042, + "learning_rate": 1.153589234744515e-07, + "loss": 0.5153, + "step": 4853 + }, + { + "epoch": 5.429530201342282, + "grad_norm": 0.6240347027778625, + "learning_rate": 1.1491135034238321e-07, + "loss": 0.5031, + "step": 4854 + }, + { + "epoch": 5.430648769574944, + "grad_norm": 0.619958221912384, + "learning_rate": 1.1446462672814213e-07, + "loss": 0.5094, + "step": 4855 + }, + { + "epoch": 5.431767337807607, + "grad_norm": 0.6189171671867371, + "learning_rate": 1.1401875279084096e-07, + "loss": 0.5121, + "step": 4856 + }, + { + "epoch": 5.432885906040268, + "grad_norm": 0.6443674564361572, + "learning_rate": 1.1357372868929212e-07, + "loss": 0.5314, + "step": 4857 + }, + { + "epoch": 5.434004474272931, + "grad_norm": 0.6218910813331604, + "learning_rate": 1.1312955458200186e-07, + "loss": 0.5454, + "step": 4858 + }, + { + "epoch": 5.435123042505593, + "grad_norm": 0.6504713892936707, + "learning_rate": 1.1268623062717699e-07, + "loss": 0.5628, + "step": 4859 + }, + { + "epoch": 5.436241610738255, + "grad_norm": 0.6527248024940491, + "learning_rate": 1.1224375698271894e-07, + "loss": 0.5314, + "step": 4860 + }, + { + "epoch": 5.437360178970917, + "grad_norm": 0.637406051158905, + "learning_rate": 1.1180213380622834e-07, + "loss": 0.5487, + "step": 4861 + }, + { + "epoch": 5.43847874720358, + "grad_norm": 0.6148375868797302, + "learning_rate": 1.1136136125500129e-07, + "loss": 0.5017, + "step": 4862 + }, + { + "epoch": 5.439597315436242, + "grad_norm": 0.6337900161743164, + "learning_rate": 1.1092143948603224e-07, + "loss": 0.5258, + "step": 4863 + }, + { + "epoch": 5.440715883668904, + "grad_norm": 0.6227062344551086, + "learning_rate": 1.1048236865601165e-07, + "loss": 0.5265, + "step": 4864 + }, + { + "epoch": 5.441834451901566, + "grad_norm": 0.6459817886352539, + "learning_rate": 1.100441489213272e-07, + "loss": 0.5443, + "step": 4865 + }, + { + "epoch": 5.442953020134228, + "grad_norm": 0.6190674901008606, + "learning_rate": 1.0960678043806322e-07, + "loss": 0.5109, + "step": 4866 + }, + { + "epoch": 5.44407158836689, + "grad_norm": 0.623599112033844, + "learning_rate": 1.0917026336200092e-07, + "loss": 0.5121, + "step": 4867 + }, + { + "epoch": 5.445190156599552, + "grad_norm": 0.6561744809150696, + "learning_rate": 1.0873459784861929e-07, + "loss": 0.5203, + "step": 4868 + }, + { + "epoch": 5.446308724832215, + "grad_norm": 0.6452724933624268, + "learning_rate": 1.0829978405309222e-07, + "loss": 0.5411, + "step": 4869 + }, + { + "epoch": 5.447427293064877, + "grad_norm": 0.6292245388031006, + "learning_rate": 1.0786582213029168e-07, + "loss": 0.5058, + "step": 4870 + }, + { + "epoch": 5.448545861297539, + "grad_norm": 0.6322401165962219, + "learning_rate": 1.0743271223478513e-07, + "loss": 0.4991, + "step": 4871 + }, + { + "epoch": 5.449664429530201, + "grad_norm": 0.6178863048553467, + "learning_rate": 1.0700045452083751e-07, + "loss": 0.5337, + "step": 4872 + }, + { + "epoch": 5.450782997762864, + "grad_norm": 0.6415062546730042, + "learning_rate": 1.0656904914241011e-07, + "loss": 0.5519, + "step": 4873 + }, + { + "epoch": 5.4519015659955254, + "grad_norm": 0.6205759644508362, + "learning_rate": 1.0613849625315974e-07, + "loss": 0.5269, + "step": 4874 + }, + { + "epoch": 5.453020134228188, + "grad_norm": 0.6325390338897705, + "learning_rate": 1.0570879600644012e-07, + "loss": 0.5141, + "step": 4875 + }, + { + "epoch": 5.45413870246085, + "grad_norm": 0.6344964504241943, + "learning_rate": 1.0527994855530216e-07, + "loss": 0.5045, + "step": 4876 + }, + { + "epoch": 5.4552572706935125, + "grad_norm": 0.6100403070449829, + "learning_rate": 1.0485195405249144e-07, + "loss": 0.4765, + "step": 4877 + }, + { + "epoch": 5.456375838926174, + "grad_norm": 0.6220605373382568, + "learning_rate": 1.0442481265045046e-07, + "loss": 0.519, + "step": 4878 + }, + { + "epoch": 5.457494407158837, + "grad_norm": 0.6279449462890625, + "learning_rate": 1.0399852450131837e-07, + "loss": 0.5356, + "step": 4879 + }, + { + "epoch": 5.458612975391499, + "grad_norm": 0.6247804164886475, + "learning_rate": 1.0357308975692953e-07, + "loss": 0.5283, + "step": 4880 + }, + { + "epoch": 5.459731543624161, + "grad_norm": 0.6427717208862305, + "learning_rate": 1.0314850856881526e-07, + "loss": 0.546, + "step": 4881 + }, + { + "epoch": 5.460850111856823, + "grad_norm": 0.620846688747406, + "learning_rate": 1.0272478108820178e-07, + "loss": 0.5184, + "step": 4882 + }, + { + "epoch": 5.461968680089486, + "grad_norm": 0.6237017512321472, + "learning_rate": 1.0230190746601227e-07, + "loss": 0.5383, + "step": 4883 + }, + { + "epoch": 5.4630872483221475, + "grad_norm": 0.6388023495674133, + "learning_rate": 1.0187988785286485e-07, + "loss": 0.5388, + "step": 4884 + }, + { + "epoch": 5.46420581655481, + "grad_norm": 0.6214722394943237, + "learning_rate": 1.0145872239907429e-07, + "loss": 0.5348, + "step": 4885 + }, + { + "epoch": 5.465324384787472, + "grad_norm": 0.6256738901138306, + "learning_rate": 1.0103841125465031e-07, + "loss": 0.5018, + "step": 4886 + }, + { + "epoch": 5.466442953020135, + "grad_norm": 0.6301501393318176, + "learning_rate": 1.0061895456929954e-07, + "loss": 0.5179, + "step": 4887 + }, + { + "epoch": 5.467561521252796, + "grad_norm": 0.6328453421592712, + "learning_rate": 1.0020035249242304e-07, + "loss": 0.5112, + "step": 4888 + }, + { + "epoch": 5.468680089485459, + "grad_norm": 0.6387876272201538, + "learning_rate": 9.978260517311822e-08, + "loss": 0.5374, + "step": 4889 + }, + { + "epoch": 5.469798657718121, + "grad_norm": 0.631699800491333, + "learning_rate": 9.936571276017743e-08, + "loss": 0.5259, + "step": 4890 + }, + { + "epoch": 5.4709172259507834, + "grad_norm": 0.6319783926010132, + "learning_rate": 9.894967540208882e-08, + "loss": 0.5237, + "step": 4891 + }, + { + "epoch": 5.472035794183445, + "grad_norm": 0.6148185729980469, + "learning_rate": 9.85344932470364e-08, + "loss": 0.5187, + "step": 4892 + }, + { + "epoch": 5.473154362416107, + "grad_norm": 0.6257889270782471, + "learning_rate": 9.812016644289879e-08, + "loss": 0.5033, + "step": 4893 + }, + { + "epoch": 5.47427293064877, + "grad_norm": 0.6020051836967468, + "learning_rate": 9.770669513725128e-08, + "loss": 0.4926, + "step": 4894 + }, + { + "epoch": 5.475391498881431, + "grad_norm": 0.644536018371582, + "learning_rate": 9.729407947736247e-08, + "loss": 0.5278, + "step": 4895 + }, + { + "epoch": 5.476510067114094, + "grad_norm": 0.6173610091209412, + "learning_rate": 9.688231961019811e-08, + "loss": 0.5231, + "step": 4896 + }, + { + "epoch": 5.477628635346756, + "grad_norm": 0.6153662204742432, + "learning_rate": 9.647141568241785e-08, + "loss": 0.5252, + "step": 4897 + }, + { + "epoch": 5.4787472035794185, + "grad_norm": 0.6190065145492554, + "learning_rate": 9.606136784037712e-08, + "loss": 0.5399, + "step": 4898 + }, + { + "epoch": 5.47986577181208, + "grad_norm": 0.6277621984481812, + "learning_rate": 9.565217623012602e-08, + "loss": 0.5235, + "step": 4899 + }, + { + "epoch": 5.480984340044743, + "grad_norm": 0.6300347447395325, + "learning_rate": 9.524384099740991e-08, + "loss": 0.5077, + "step": 4900 + }, + { + "epoch": 5.482102908277405, + "grad_norm": 0.6399431824684143, + "learning_rate": 9.48363622876694e-08, + "loss": 0.5206, + "step": 4901 + }, + { + "epoch": 5.483221476510067, + "grad_norm": 0.6209514141082764, + "learning_rate": 9.442974024603924e-08, + "loss": 0.5073, + "step": 4902 + }, + { + "epoch": 5.484340044742729, + "grad_norm": 0.6284603476524353, + "learning_rate": 9.402397501735022e-08, + "loss": 0.5334, + "step": 4903 + }, + { + "epoch": 5.485458612975392, + "grad_norm": 0.6186599135398865, + "learning_rate": 9.361906674612675e-08, + "loss": 0.5171, + "step": 4904 + }, + { + "epoch": 5.4865771812080535, + "grad_norm": 0.628922164440155, + "learning_rate": 9.321501557658874e-08, + "loss": 0.5402, + "step": 4905 + }, + { + "epoch": 5.487695749440716, + "grad_norm": 0.644160270690918, + "learning_rate": 9.28118216526508e-08, + "loss": 0.529, + "step": 4906 + }, + { + "epoch": 5.488814317673378, + "grad_norm": 0.6241750717163086, + "learning_rate": 9.240948511792192e-08, + "loss": 0.55, + "step": 4907 + }, + { + "epoch": 5.489932885906041, + "grad_norm": 0.6287521123886108, + "learning_rate": 9.200800611570554e-08, + "loss": 0.5162, + "step": 4908 + }, + { + "epoch": 5.491051454138702, + "grad_norm": 0.6262427568435669, + "learning_rate": 9.160738478899978e-08, + "loss": 0.5105, + "step": 4909 + }, + { + "epoch": 5.492170022371365, + "grad_norm": 0.6092537045478821, + "learning_rate": 9.120762128049798e-08, + "loss": 0.4967, + "step": 4910 + }, + { + "epoch": 5.493288590604027, + "grad_norm": 0.6302291750907898, + "learning_rate": 9.08087157325871e-08, + "loss": 0.5221, + "step": 4911 + }, + { + "epoch": 5.494407158836689, + "grad_norm": 0.6405455470085144, + "learning_rate": 9.041066828734928e-08, + "loss": 0.5313, + "step": 4912 + }, + { + "epoch": 5.495525727069351, + "grad_norm": 0.6096550822257996, + "learning_rate": 9.001347908656005e-08, + "loss": 0.5001, + "step": 4913 + }, + { + "epoch": 5.496644295302014, + "grad_norm": 0.6225055456161499, + "learning_rate": 8.961714827168982e-08, + "loss": 0.5013, + "step": 4914 + }, + { + "epoch": 5.497762863534676, + "grad_norm": 0.6398980021476746, + "learning_rate": 8.922167598390291e-08, + "loss": 0.5381, + "step": 4915 + }, + { + "epoch": 5.498881431767337, + "grad_norm": 0.6393673419952393, + "learning_rate": 8.882706236405886e-08, + "loss": 0.5316, + "step": 4916 + }, + { + "epoch": 5.5, + "grad_norm": 0.6091228723526001, + "learning_rate": 8.843330755270968e-08, + "loss": 0.5161, + "step": 4917 + }, + { + "epoch": 5.501118568232663, + "grad_norm": 0.6381200551986694, + "learning_rate": 8.804041169010347e-08, + "loss": 0.5088, + "step": 4918 + }, + { + "epoch": 5.502237136465324, + "grad_norm": 0.6318781971931458, + "learning_rate": 8.764837491618105e-08, + "loss": 0.5122, + "step": 4919 + }, + { + "epoch": 5.503355704697986, + "grad_norm": 0.647095799446106, + "learning_rate": 8.725719737057714e-08, + "loss": 0.538, + "step": 4920 + }, + { + "epoch": 5.504474272930649, + "grad_norm": 0.6368705034255981, + "learning_rate": 8.686687919262138e-08, + "loss": 0.5251, + "step": 4921 + }, + { + "epoch": 5.505592841163311, + "grad_norm": 0.6014148592948914, + "learning_rate": 8.647742052133672e-08, + "loss": 0.4946, + "step": 4922 + }, + { + "epoch": 5.506711409395973, + "grad_norm": 0.6204700469970703, + "learning_rate": 8.608882149543973e-08, + "loss": 0.5353, + "step": 4923 + }, + { + "epoch": 5.507829977628635, + "grad_norm": 0.6275492310523987, + "learning_rate": 8.570108225334106e-08, + "loss": 0.5302, + "step": 4924 + }, + { + "epoch": 5.508948545861298, + "grad_norm": 0.6333678364753723, + "learning_rate": 8.53142029331458e-08, + "loss": 0.5469, + "step": 4925 + }, + { + "epoch": 5.510067114093959, + "grad_norm": 0.6428055167198181, + "learning_rate": 8.492818367265149e-08, + "loss": 0.5209, + "step": 4926 + }, + { + "epoch": 5.511185682326622, + "grad_norm": 0.6197512149810791, + "learning_rate": 8.454302460935038e-08, + "loss": 0.4998, + "step": 4927 + }, + { + "epoch": 5.512304250559284, + "grad_norm": 0.6430240273475647, + "learning_rate": 8.415872588042773e-08, + "loss": 0.5101, + "step": 4928 + }, + { + "epoch": 5.5134228187919465, + "grad_norm": 0.616450309753418, + "learning_rate": 8.377528762276294e-08, + "loss": 0.5169, + "step": 4929 + }, + { + "epoch": 5.514541387024608, + "grad_norm": 0.6274447441101074, + "learning_rate": 8.339270997292814e-08, + "loss": 0.4936, + "step": 4930 + }, + { + "epoch": 5.515659955257271, + "grad_norm": 0.6294952034950256, + "learning_rate": 8.301099306718936e-08, + "loss": 0.5356, + "step": 4931 + }, + { + "epoch": 5.516778523489933, + "grad_norm": 0.6232316493988037, + "learning_rate": 8.263013704150614e-08, + "loss": 0.5233, + "step": 4932 + }, + { + "epoch": 5.517897091722595, + "grad_norm": 0.602920651435852, + "learning_rate": 8.225014203153109e-08, + "loss": 0.5079, + "step": 4933 + }, + { + "epoch": 5.519015659955257, + "grad_norm": 0.6307414174079895, + "learning_rate": 8.187100817261095e-08, + "loss": 0.5096, + "step": 4934 + }, + { + "epoch": 5.52013422818792, + "grad_norm": 0.6347441077232361, + "learning_rate": 8.149273559978438e-08, + "loss": 0.5319, + "step": 4935 + }, + { + "epoch": 5.5212527964205815, + "grad_norm": 0.6164381504058838, + "learning_rate": 8.111532444778469e-08, + "loss": 0.4997, + "step": 4936 + }, + { + "epoch": 5.522371364653244, + "grad_norm": 0.6164196133613586, + "learning_rate": 8.073877485103742e-08, + "loss": 0.5116, + "step": 4937 + }, + { + "epoch": 5.523489932885906, + "grad_norm": 0.6242993474006653, + "learning_rate": 8.03630869436614e-08, + "loss": 0.5332, + "step": 4938 + }, + { + "epoch": 5.524608501118569, + "grad_norm": 0.6289625763893127, + "learning_rate": 7.998826085946848e-08, + "loss": 0.5447, + "step": 4939 + }, + { + "epoch": 5.52572706935123, + "grad_norm": 0.6330069303512573, + "learning_rate": 7.961429673196436e-08, + "loss": 0.51, + "step": 4940 + }, + { + "epoch": 5.526845637583893, + "grad_norm": 0.6155861020088196, + "learning_rate": 7.924119469434666e-08, + "loss": 0.5383, + "step": 4941 + }, + { + "epoch": 5.527964205816555, + "grad_norm": 0.6385422945022583, + "learning_rate": 7.886895487950629e-08, + "loss": 0.5361, + "step": 4942 + }, + { + "epoch": 5.5290827740492166, + "grad_norm": 0.6438663601875305, + "learning_rate": 7.84975774200275e-08, + "loss": 0.5078, + "step": 4943 + }, + { + "epoch": 5.530201342281879, + "grad_norm": 0.6114825010299683, + "learning_rate": 7.812706244818669e-08, + "loss": 0.4965, + "step": 4944 + }, + { + "epoch": 5.531319910514542, + "grad_norm": 0.6209043264389038, + "learning_rate": 7.775741009595411e-08, + "loss": 0.5165, + "step": 4945 + }, + { + "epoch": 5.532438478747204, + "grad_norm": 0.6273100972175598, + "learning_rate": 7.738862049499141e-08, + "loss": 0.4982, + "step": 4946 + }, + { + "epoch": 5.533557046979865, + "grad_norm": 0.6157969236373901, + "learning_rate": 7.702069377665377e-08, + "loss": 0.5298, + "step": 4947 + }, + { + "epoch": 5.534675615212528, + "grad_norm": 0.6230738759040833, + "learning_rate": 7.665363007198834e-08, + "loss": 0.5037, + "step": 4948 + }, + { + "epoch": 5.53579418344519, + "grad_norm": 0.6066111922264099, + "learning_rate": 7.628742951173634e-08, + "loss": 0.4963, + "step": 4949 + }, + { + "epoch": 5.5369127516778525, + "grad_norm": 0.6125920414924622, + "learning_rate": 7.592209222633013e-08, + "loss": 0.4975, + "step": 4950 + }, + { + "epoch": 5.538031319910514, + "grad_norm": 0.6221954226493835, + "learning_rate": 7.555761834589503e-08, + "loss": 0.5158, + "step": 4951 + }, + { + "epoch": 5.539149888143177, + "grad_norm": 0.6140834093093872, + "learning_rate": 7.519400800024918e-08, + "loss": 0.5196, + "step": 4952 + }, + { + "epoch": 5.540268456375839, + "grad_norm": 0.6193041205406189, + "learning_rate": 7.483126131890228e-08, + "loss": 0.5224, + "step": 4953 + }, + { + "epoch": 5.541387024608501, + "grad_norm": 0.6255619525909424, + "learning_rate": 7.446937843105767e-08, + "loss": 0.5111, + "step": 4954 + }, + { + "epoch": 5.542505592841163, + "grad_norm": 0.6294854283332825, + "learning_rate": 7.410835946561001e-08, + "loss": 0.515, + "step": 4955 + }, + { + "epoch": 5.543624161073826, + "grad_norm": 0.6420679688453674, + "learning_rate": 7.374820455114673e-08, + "loss": 0.5247, + "step": 4956 + }, + { + "epoch": 5.5447427293064875, + "grad_norm": 0.6379706263542175, + "learning_rate": 7.33889138159466e-08, + "loss": 0.5117, + "step": 4957 + }, + { + "epoch": 5.54586129753915, + "grad_norm": 0.6600783467292786, + "learning_rate": 7.303048738798252e-08, + "loss": 0.5533, + "step": 4958 + }, + { + "epoch": 5.546979865771812, + "grad_norm": 0.6478392481803894, + "learning_rate": 7.267292539491738e-08, + "loss": 0.5135, + "step": 4959 + }, + { + "epoch": 5.5480984340044746, + "grad_norm": 0.6144479513168335, + "learning_rate": 7.231622796410764e-08, + "loss": 0.5001, + "step": 4960 + }, + { + "epoch": 5.549217002237136, + "grad_norm": 0.611020565032959, + "learning_rate": 7.19603952226014e-08, + "loss": 0.5207, + "step": 4961 + }, + { + "epoch": 5.550335570469799, + "grad_norm": 0.6341395974159241, + "learning_rate": 7.160542729713838e-08, + "loss": 0.5177, + "step": 4962 + }, + { + "epoch": 5.551454138702461, + "grad_norm": 0.6181929707527161, + "learning_rate": 7.125132431415105e-08, + "loss": 0.5035, + "step": 4963 + }, + { + "epoch": 5.552572706935123, + "grad_norm": 0.6109751462936401, + "learning_rate": 7.08980863997627e-08, + "loss": 0.5199, + "step": 4964 + }, + { + "epoch": 5.553691275167785, + "grad_norm": 0.6331272125244141, + "learning_rate": 7.054571367978963e-08, + "loss": 0.546, + "step": 4965 + }, + { + "epoch": 5.554809843400448, + "grad_norm": 0.631727933883667, + "learning_rate": 7.019420627973921e-08, + "loss": 0.5371, + "step": 4966 + }, + { + "epoch": 5.55592841163311, + "grad_norm": 0.6268221735954285, + "learning_rate": 6.984356432481132e-08, + "loss": 0.5141, + "step": 4967 + }, + { + "epoch": 5.557046979865772, + "grad_norm": 0.6341496109962463, + "learning_rate": 6.949378793989686e-08, + "loss": 0.5201, + "step": 4968 + }, + { + "epoch": 5.558165548098434, + "grad_norm": 0.6273724436759949, + "learning_rate": 6.914487724957896e-08, + "loss": 0.5187, + "step": 4969 + }, + { + "epoch": 5.559284116331096, + "grad_norm": 0.6069916486740112, + "learning_rate": 6.879683237813212e-08, + "loss": 0.5026, + "step": 4970 + }, + { + "epoch": 5.560402684563758, + "grad_norm": 0.6301685571670532, + "learning_rate": 6.844965344952215e-08, + "loss": 0.5015, + "step": 4971 + }, + { + "epoch": 5.561521252796421, + "grad_norm": 0.6159946918487549, + "learning_rate": 6.810334058740736e-08, + "loss": 0.5198, + "step": 4972 + }, + { + "epoch": 5.562639821029083, + "grad_norm": 0.6569334268569946, + "learning_rate": 6.775789391513631e-08, + "loss": 0.5099, + "step": 4973 + }, + { + "epoch": 5.563758389261745, + "grad_norm": 0.6048318147659302, + "learning_rate": 6.741331355575059e-08, + "loss": 0.5254, + "step": 4974 + }, + { + "epoch": 5.564876957494407, + "grad_norm": 0.6056896448135376, + "learning_rate": 6.706959963198173e-08, + "loss": 0.5354, + "step": 4975 + }, + { + "epoch": 5.565995525727069, + "grad_norm": 0.6168028712272644, + "learning_rate": 6.672675226625374e-08, + "loss": 0.5215, + "step": 4976 + }, + { + "epoch": 5.567114093959732, + "grad_norm": 0.6030023097991943, + "learning_rate": 6.638477158068119e-08, + "loss": 0.4766, + "step": 4977 + }, + { + "epoch": 5.568232662192393, + "grad_norm": 0.6350347399711609, + "learning_rate": 6.604365769707077e-08, + "loss": 0.5245, + "step": 4978 + }, + { + "epoch": 5.569351230425056, + "grad_norm": 0.6145987510681152, + "learning_rate": 6.570341073691977e-08, + "loss": 0.495, + "step": 4979 + }, + { + "epoch": 5.570469798657718, + "grad_norm": 0.6181846857070923, + "learning_rate": 6.53640308214168e-08, + "loss": 0.5045, + "step": 4980 + }, + { + "epoch": 5.5715883668903805, + "grad_norm": 0.6612157821655273, + "learning_rate": 6.50255180714418e-08, + "loss": 0.5412, + "step": 4981 + }, + { + "epoch": 5.572706935123042, + "grad_norm": 0.6166629791259766, + "learning_rate": 6.468787260756587e-08, + "loss": 0.5335, + "step": 4982 + }, + { + "epoch": 5.573825503355705, + "grad_norm": 0.6299104690551758, + "learning_rate": 6.435109455005139e-08, + "loss": 0.5498, + "step": 4983 + }, + { + "epoch": 5.574944071588367, + "grad_norm": 0.610446572303772, + "learning_rate": 6.401518401885076e-08, + "loss": 0.5113, + "step": 4984 + }, + { + "epoch": 5.576062639821029, + "grad_norm": 0.6277257800102234, + "learning_rate": 6.368014113360909e-08, + "loss": 0.5104, + "step": 4985 + }, + { + "epoch": 5.577181208053691, + "grad_norm": 0.61350417137146, + "learning_rate": 6.334596601366095e-08, + "loss": 0.5157, + "step": 4986 + }, + { + "epoch": 5.578299776286354, + "grad_norm": 0.6244810223579407, + "learning_rate": 6.301265877803276e-08, + "loss": 0.4983, + "step": 4987 + }, + { + "epoch": 5.5794183445190155, + "grad_norm": 0.6340211033821106, + "learning_rate": 6.268021954544095e-08, + "loss": 0.5154, + "step": 4988 + }, + { + "epoch": 5.580536912751678, + "grad_norm": 0.6098599433898926, + "learning_rate": 6.234864843429389e-08, + "loss": 0.5159, + "step": 4989 + }, + { + "epoch": 5.58165548098434, + "grad_norm": 0.633636474609375, + "learning_rate": 6.201794556268987e-08, + "loss": 0.5304, + "step": 4990 + }, + { + "epoch": 5.582774049217003, + "grad_norm": 0.6446368098258972, + "learning_rate": 6.168811104841826e-08, + "loss": 0.5358, + "step": 4991 + }, + { + "epoch": 5.583892617449664, + "grad_norm": 0.6196097731590271, + "learning_rate": 6.135914500895901e-08, + "loss": 0.504, + "step": 4992 + }, + { + "epoch": 5.585011185682326, + "grad_norm": 0.6259081959724426, + "learning_rate": 6.103104756148281e-08, + "loss": 0.5288, + "step": 4993 + }, + { + "epoch": 5.586129753914989, + "grad_norm": 0.6243703365325928, + "learning_rate": 6.070381882285148e-08, + "loss": 0.5341, + "step": 4994 + }, + { + "epoch": 5.587248322147651, + "grad_norm": 0.6189441084861755, + "learning_rate": 6.037745890961622e-08, + "loss": 0.5155, + "step": 4995 + }, + { + "epoch": 5.588366890380313, + "grad_norm": 0.6260862946510315, + "learning_rate": 6.005196793801988e-08, + "loss": 0.522, + "step": 4996 + }, + { + "epoch": 5.589485458612975, + "grad_norm": 0.6073001623153687, + "learning_rate": 5.972734602399499e-08, + "loss": 0.5332, + "step": 4997 + }, + { + "epoch": 5.590604026845638, + "grad_norm": 0.6502573490142822, + "learning_rate": 5.940359328316575e-08, + "loss": 0.5152, + "step": 4998 + }, + { + "epoch": 5.5917225950783, + "grad_norm": 0.6168399453163147, + "learning_rate": 5.908070983084518e-08, + "loss": 0.5174, + "step": 4999 + }, + { + "epoch": 5.592841163310962, + "grad_norm": 0.6154170036315918, + "learning_rate": 5.8758695782038245e-08, + "loss": 0.5149, + "step": 5000 + }, + { + "epoch": 5.593959731543624, + "grad_norm": 0.6312116980552673, + "learning_rate": 5.843755125143902e-08, + "loss": 0.521, + "step": 5001 + }, + { + "epoch": 5.5950782997762865, + "grad_norm": 0.6130902171134949, + "learning_rate": 5.81172763534324e-08, + "loss": 0.5045, + "step": 5002 + }, + { + "epoch": 5.596196868008948, + "grad_norm": 0.6383776068687439, + "learning_rate": 5.779787120209351e-08, + "loss": 0.5143, + "step": 5003 + }, + { + "epoch": 5.597315436241611, + "grad_norm": 0.623112142086029, + "learning_rate": 5.7479335911188036e-08, + "loss": 0.5008, + "step": 5004 + }, + { + "epoch": 5.598434004474273, + "grad_norm": 0.616506814956665, + "learning_rate": 5.716167059417077e-08, + "loss": 0.5117, + "step": 5005 + }, + { + "epoch": 5.599552572706935, + "grad_norm": 0.637446403503418, + "learning_rate": 5.684487536418787e-08, + "loss": 0.5132, + "step": 5006 + }, + { + "epoch": 5.600671140939597, + "grad_norm": 0.6461523175239563, + "learning_rate": 5.65289503340749e-08, + "loss": 0.5362, + "step": 5007 + }, + { + "epoch": 5.60178970917226, + "grad_norm": 0.6367152333259583, + "learning_rate": 5.6213895616357427e-08, + "loss": 0.5344, + "step": 5008 + }, + { + "epoch": 5.6029082774049215, + "grad_norm": 0.6232407093048096, + "learning_rate": 5.5899711323251504e-08, + "loss": 0.5209, + "step": 5009 + }, + { + "epoch": 5.604026845637584, + "grad_norm": 0.6245624423027039, + "learning_rate": 5.55863975666629e-08, + "loss": 0.5402, + "step": 5010 + }, + { + "epoch": 5.605145413870246, + "grad_norm": 0.6155311465263367, + "learning_rate": 5.527395445818762e-08, + "loss": 0.5296, + "step": 5011 + }, + { + "epoch": 5.6062639821029085, + "grad_norm": 0.6244901418685913, + "learning_rate": 5.496238210911026e-08, + "loss": 0.5289, + "step": 5012 + }, + { + "epoch": 5.60738255033557, + "grad_norm": 0.6335120797157288, + "learning_rate": 5.4651680630407324e-08, + "loss": 0.5254, + "step": 5013 + }, + { + "epoch": 5.608501118568233, + "grad_norm": 0.6470105648040771, + "learning_rate": 5.434185013274335e-08, + "loss": 0.5269, + "step": 5014 + }, + { + "epoch": 5.609619686800895, + "grad_norm": 0.6275774240493774, + "learning_rate": 5.4032890726473665e-08, + "loss": 0.5242, + "step": 5015 + }, + { + "epoch": 5.610738255033557, + "grad_norm": 0.6222971677780151, + "learning_rate": 5.3724802521643026e-08, + "loss": 0.515, + "step": 5016 + }, + { + "epoch": 5.611856823266219, + "grad_norm": 0.6246849894523621, + "learning_rate": 5.34175856279856e-08, + "loss": 0.5344, + "step": 5017 + }, + { + "epoch": 5.612975391498882, + "grad_norm": 0.6252410411834717, + "learning_rate": 5.311124015492608e-08, + "loss": 0.5526, + "step": 5018 + }, + { + "epoch": 5.614093959731544, + "grad_norm": 0.6372426748275757, + "learning_rate": 5.280576621157801e-08, + "loss": 0.528, + "step": 5019 + }, + { + "epoch": 5.615212527964205, + "grad_norm": 0.619666576385498, + "learning_rate": 5.2501163906744644e-08, + "loss": 0.4975, + "step": 5020 + }, + { + "epoch": 5.616331096196868, + "grad_norm": 0.6173959374427795, + "learning_rate": 5.219743334891836e-08, + "loss": 0.53, + "step": 5021 + }, + { + "epoch": 5.617449664429531, + "grad_norm": 0.6207226514816284, + "learning_rate": 5.189457464628234e-08, + "loss": 0.5091, + "step": 5022 + }, + { + "epoch": 5.618568232662192, + "grad_norm": 0.6433895230293274, + "learning_rate": 5.159258790670779e-08, + "loss": 0.5527, + "step": 5023 + }, + { + "epoch": 5.619686800894854, + "grad_norm": 0.6271979808807373, + "learning_rate": 5.1291473237756465e-08, + "loss": 0.5119, + "step": 5024 + }, + { + "epoch": 5.620805369127517, + "grad_norm": 0.6399113535881042, + "learning_rate": 5.0991230746678664e-08, + "loss": 0.5561, + "step": 5025 + }, + { + "epoch": 5.621923937360179, + "grad_norm": 0.6349020004272461, + "learning_rate": 5.069186054041414e-08, + "loss": 0.5163, + "step": 5026 + }, + { + "epoch": 5.623042505592841, + "grad_norm": 0.6422675251960754, + "learning_rate": 5.0393362725592864e-08, + "loss": 0.5286, + "step": 5027 + }, + { + "epoch": 5.624161073825503, + "grad_norm": 0.624281108379364, + "learning_rate": 5.009573740853313e-08, + "loss": 0.5312, + "step": 5028 + }, + { + "epoch": 5.625279642058166, + "grad_norm": 0.6274269223213196, + "learning_rate": 4.9798984695242356e-08, + "loss": 0.5091, + "step": 5029 + }, + { + "epoch": 5.626398210290827, + "grad_norm": 0.6433970928192139, + "learning_rate": 4.950310469141795e-08, + "loss": 0.5392, + "step": 5030 + }, + { + "epoch": 5.62751677852349, + "grad_norm": 0.6237949728965759, + "learning_rate": 4.920809750244587e-08, + "loss": 0.5206, + "step": 5031 + }, + { + "epoch": 5.628635346756152, + "grad_norm": 0.6306619644165039, + "learning_rate": 4.891396323340153e-08, + "loss": 0.5298, + "step": 5032 + }, + { + "epoch": 5.6297539149888145, + "grad_norm": 0.660186767578125, + "learning_rate": 4.8620701989049446e-08, + "loss": 0.5452, + "step": 5033 + }, + { + "epoch": 5.630872483221476, + "grad_norm": 0.6026687622070312, + "learning_rate": 4.832831387384246e-08, + "loss": 0.4944, + "step": 5034 + }, + { + "epoch": 5.631991051454139, + "grad_norm": 0.6350778937339783, + "learning_rate": 4.8036798991923925e-08, + "loss": 0.5322, + "step": 5035 + }, + { + "epoch": 5.633109619686801, + "grad_norm": 0.6439168453216553, + "learning_rate": 4.774615744712469e-08, + "loss": 0.5235, + "step": 5036 + }, + { + "epoch": 5.634228187919463, + "grad_norm": 0.6169254183769226, + "learning_rate": 4.7456389342964706e-08, + "loss": 0.5324, + "step": 5037 + }, + { + "epoch": 5.635346756152125, + "grad_norm": 0.5996044278144836, + "learning_rate": 4.7167494782653646e-08, + "loss": 0.4986, + "step": 5038 + }, + { + "epoch": 5.636465324384788, + "grad_norm": 0.6411183476448059, + "learning_rate": 4.687947386908975e-08, + "loss": 0.5232, + "step": 5039 + }, + { + "epoch": 5.6375838926174495, + "grad_norm": 0.6323792934417725, + "learning_rate": 4.659232670485958e-08, + "loss": 0.5418, + "step": 5040 + }, + { + "epoch": 5.638702460850112, + "grad_norm": 0.6202119588851929, + "learning_rate": 4.630605339223909e-08, + "loss": 0.4827, + "step": 5041 + }, + { + "epoch": 5.639821029082774, + "grad_norm": 0.6339359879493713, + "learning_rate": 4.6020654033192556e-08, + "loss": 0.5406, + "step": 5042 + }, + { + "epoch": 5.640939597315437, + "grad_norm": 0.6353793144226074, + "learning_rate": 4.5736128729373666e-08, + "loss": 0.5176, + "step": 5043 + }, + { + "epoch": 5.642058165548098, + "grad_norm": 0.6199580430984497, + "learning_rate": 4.5452477582123564e-08, + "loss": 0.5386, + "step": 5044 + }, + { + "epoch": 5.643176733780761, + "grad_norm": 0.6359405517578125, + "learning_rate": 4.516970069247284e-08, + "loss": 0.5512, + "step": 5045 + }, + { + "epoch": 5.644295302013423, + "grad_norm": 0.6409672498703003, + "learning_rate": 4.4887798161141195e-08, + "loss": 0.5126, + "step": 5046 + }, + { + "epoch": 5.6454138702460845, + "grad_norm": 0.641411304473877, + "learning_rate": 4.460677008853609e-08, + "loss": 0.5447, + "step": 5047 + }, + { + "epoch": 5.646532438478747, + "grad_norm": 0.6199569702148438, + "learning_rate": 4.432661657475329e-08, + "loss": 0.5085, + "step": 5048 + }, + { + "epoch": 5.64765100671141, + "grad_norm": 0.6433139443397522, + "learning_rate": 4.4047337719577976e-08, + "loss": 0.5325, + "step": 5049 + }, + { + "epoch": 5.648769574944072, + "grad_norm": 0.6354572772979736, + "learning_rate": 4.376893362248308e-08, + "loss": 0.5096, + "step": 5050 + }, + { + "epoch": 5.649888143176733, + "grad_norm": 0.6433454751968384, + "learning_rate": 4.3491404382630384e-08, + "loss": 0.5288, + "step": 5051 + }, + { + "epoch": 5.651006711409396, + "grad_norm": 0.6052621006965637, + "learning_rate": 4.3214750098869993e-08, + "loss": 0.5106, + "step": 5052 + }, + { + "epoch": 5.652125279642058, + "grad_norm": 0.6144117116928101, + "learning_rate": 4.293897086974003e-08, + "loss": 0.509, + "step": 5053 + }, + { + "epoch": 5.6532438478747205, + "grad_norm": 0.6225472688674927, + "learning_rate": 4.2664066793467184e-08, + "loss": 0.5143, + "step": 5054 + }, + { + "epoch": 5.654362416107382, + "grad_norm": 0.6080011129379272, + "learning_rate": 4.239003796796648e-08, + "loss": 0.4977, + "step": 5055 + }, + { + "epoch": 5.655480984340045, + "grad_norm": 0.6192302107810974, + "learning_rate": 4.211688449084123e-08, + "loss": 0.5296, + "step": 5056 + }, + { + "epoch": 5.656599552572707, + "grad_norm": 0.6476097106933594, + "learning_rate": 4.184460645938304e-08, + "loss": 0.5234, + "step": 5057 + }, + { + "epoch": 5.657718120805369, + "grad_norm": 0.6493718028068542, + "learning_rate": 4.157320397057158e-08, + "loss": 0.5397, + "step": 5058 + }, + { + "epoch": 5.658836689038031, + "grad_norm": 0.6480401158332825, + "learning_rate": 4.130267712107394e-08, + "loss": 0.5328, + "step": 5059 + }, + { + "epoch": 5.659955257270694, + "grad_norm": 0.6256687045097351, + "learning_rate": 4.1033026007247236e-08, + "loss": 0.5216, + "step": 5060 + }, + { + "epoch": 5.6610738255033555, + "grad_norm": 0.6230656504631042, + "learning_rate": 4.076425072513407e-08, + "loss": 0.5177, + "step": 5061 + }, + { + "epoch": 5.662192393736018, + "grad_norm": 0.622850239276886, + "learning_rate": 4.049635137046759e-08, + "loss": 0.5317, + "step": 5062 + }, + { + "epoch": 5.66331096196868, + "grad_norm": 0.6384648680686951, + "learning_rate": 4.022932803866703e-08, + "loss": 0.5122, + "step": 5063 + }, + { + "epoch": 5.6644295302013425, + "grad_norm": 0.6270167827606201, + "learning_rate": 3.996318082484102e-08, + "loss": 0.5281, + "step": 5064 + }, + { + "epoch": 5.665548098434004, + "grad_norm": 0.612293004989624, + "learning_rate": 3.969790982378513e-08, + "loss": 0.5193, + "step": 5065 + }, + { + "epoch": 5.666666666666667, + "grad_norm": 0.6229087710380554, + "learning_rate": 3.943351512998378e-08, + "loss": 0.5195, + "step": 5066 + }, + { + "epoch": 5.667785234899329, + "grad_norm": 0.6200931668281555, + "learning_rate": 3.91699968376083e-08, + "loss": 0.5051, + "step": 5067 + }, + { + "epoch": 5.668903803131991, + "grad_norm": 0.6646019220352173, + "learning_rate": 3.890735504051835e-08, + "loss": 0.5384, + "step": 5068 + }, + { + "epoch": 5.670022371364653, + "grad_norm": 0.6345646977424622, + "learning_rate": 3.864558983226158e-08, + "loss": 0.5162, + "step": 5069 + }, + { + "epoch": 5.671140939597316, + "grad_norm": 0.6333261728286743, + "learning_rate": 3.838470130607258e-08, + "loss": 0.5105, + "step": 5070 + }, + { + "epoch": 5.672259507829978, + "grad_norm": 0.6351928114891052, + "learning_rate": 3.812468955487508e-08, + "loss": 0.5266, + "step": 5071 + }, + { + "epoch": 5.67337807606264, + "grad_norm": 0.6328368782997131, + "learning_rate": 3.786555467127917e-08, + "loss": 0.5303, + "step": 5072 + }, + { + "epoch": 5.674496644295302, + "grad_norm": 0.6357427835464478, + "learning_rate": 3.760729674758351e-08, + "loss": 0.5323, + "step": 5073 + }, + { + "epoch": 5.675615212527964, + "grad_norm": 0.6096214652061462, + "learning_rate": 3.734991587577369e-08, + "loss": 0.5259, + "step": 5074 + }, + { + "epoch": 5.676733780760626, + "grad_norm": 0.6211285591125488, + "learning_rate": 3.70934121475236e-08, + "loss": 0.5092, + "step": 5075 + }, + { + "epoch": 5.677852348993289, + "grad_norm": 0.6393764615058899, + "learning_rate": 3.6837785654194604e-08, + "loss": 0.5212, + "step": 5076 + }, + { + "epoch": 5.678970917225951, + "grad_norm": 0.6334308981895447, + "learning_rate": 3.658303648683525e-08, + "loss": 0.5091, + "step": 5077 + }, + { + "epoch": 5.680089485458613, + "grad_norm": 0.6227877140045166, + "learning_rate": 3.6329164736181587e-08, + "loss": 0.5325, + "step": 5078 + }, + { + "epoch": 5.681208053691275, + "grad_norm": 0.604285717010498, + "learning_rate": 3.6076170492657106e-08, + "loss": 0.5078, + "step": 5079 + }, + { + "epoch": 5.682326621923937, + "grad_norm": 0.607947826385498, + "learning_rate": 3.582405384637361e-08, + "loss": 0.4834, + "step": 5080 + }, + { + "epoch": 5.6834451901566, + "grad_norm": 0.6142874956130981, + "learning_rate": 3.557281488712927e-08, + "loss": 0.5255, + "step": 5081 + }, + { + "epoch": 5.684563758389261, + "grad_norm": 0.6169115900993347, + "learning_rate": 3.5322453704410286e-08, + "loss": 0.4915, + "step": 5082 + }, + { + "epoch": 5.685682326621924, + "grad_norm": 0.6312294006347656, + "learning_rate": 3.5072970387389773e-08, + "loss": 0.5279, + "step": 5083 + }, + { + "epoch": 5.686800894854586, + "grad_norm": 0.6139187812805176, + "learning_rate": 3.4824365024928585e-08, + "loss": 0.5103, + "step": 5084 + }, + { + "epoch": 5.6879194630872485, + "grad_norm": 0.6018638610839844, + "learning_rate": 3.4576637705574514e-08, + "loss": 0.5077, + "step": 5085 + }, + { + "epoch": 5.68903803131991, + "grad_norm": 0.6147884130477905, + "learning_rate": 3.432978851756308e-08, + "loss": 0.4951, + "step": 5086 + }, + { + "epoch": 5.690156599552573, + "grad_norm": 0.6134939789772034, + "learning_rate": 3.40838175488159e-08, + "loss": 0.5166, + "step": 5087 + }, + { + "epoch": 5.691275167785235, + "grad_norm": 0.6277415752410889, + "learning_rate": 3.383872488694345e-08, + "loss": 0.5112, + "step": 5088 + }, + { + "epoch": 5.692393736017897, + "grad_norm": 0.6231713891029358, + "learning_rate": 3.3594510619242296e-08, + "loss": 0.5016, + "step": 5089 + }, + { + "epoch": 5.693512304250559, + "grad_norm": 0.6239691376686096, + "learning_rate": 3.33511748326959e-08, + "loss": 0.505, + "step": 5090 + }, + { + "epoch": 5.694630872483222, + "grad_norm": 0.6298589110374451, + "learning_rate": 3.310871761397605e-08, + "loss": 0.5459, + "step": 5091 + }, + { + "epoch": 5.6957494407158835, + "grad_norm": 0.6090074777603149, + "learning_rate": 3.2867139049440334e-08, + "loss": 0.497, + "step": 5092 + }, + { + "epoch": 5.696868008948546, + "grad_norm": 0.6235906481742859, + "learning_rate": 3.2626439225134086e-08, + "loss": 0.5278, + "step": 5093 + }, + { + "epoch": 5.697986577181208, + "grad_norm": 0.6487221717834473, + "learning_rate": 3.238661822678901e-08, + "loss": 0.5276, + "step": 5094 + }, + { + "epoch": 5.699105145413871, + "grad_norm": 0.6062212586402893, + "learning_rate": 3.2147676139825116e-08, + "loss": 0.5277, + "step": 5095 + }, + { + "epoch": 5.700223713646532, + "grad_norm": 0.6266761422157288, + "learning_rate": 3.190961304934764e-08, + "loss": 0.5045, + "step": 5096 + }, + { + "epoch": 5.701342281879195, + "grad_norm": 0.6370683908462524, + "learning_rate": 3.1672429040150164e-08, + "loss": 0.5122, + "step": 5097 + }, + { + "epoch": 5.702460850111857, + "grad_norm": 0.6165601015090942, + "learning_rate": 3.143612419671205e-08, + "loss": 0.4869, + "step": 5098 + }, + { + "epoch": 5.703579418344519, + "grad_norm": 0.6206772923469543, + "learning_rate": 3.120069860320069e-08, + "loss": 0.516, + "step": 5099 + }, + { + "epoch": 5.704697986577181, + "grad_norm": 0.620071291923523, + "learning_rate": 3.096615234346956e-08, + "loss": 0.5272, + "step": 5100 + }, + { + "epoch": 5.705816554809843, + "grad_norm": 0.6408053636550903, + "learning_rate": 3.073248550105851e-08, + "loss": 0.5205, + "step": 5101 + }, + { + "epoch": 5.706935123042506, + "grad_norm": 0.6162175536155701, + "learning_rate": 3.04996981591954e-08, + "loss": 0.5009, + "step": 5102 + }, + { + "epoch": 5.708053691275168, + "grad_norm": 0.6295656561851501, + "learning_rate": 3.026779040079336e-08, + "loss": 0.5121, + "step": 5103 + }, + { + "epoch": 5.70917225950783, + "grad_norm": 0.621673047542572, + "learning_rate": 3.0036762308453536e-08, + "loss": 0.5028, + "step": 5104 + }, + { + "epoch": 5.710290827740492, + "grad_norm": 0.6263106465339661, + "learning_rate": 2.980661396446316e-08, + "loss": 0.5103, + "step": 5105 + }, + { + "epoch": 5.7114093959731544, + "grad_norm": 0.627680242061615, + "learning_rate": 2.95773454507961e-08, + "loss": 0.5225, + "step": 5106 + }, + { + "epoch": 5.712527964205816, + "grad_norm": 0.6365804076194763, + "learning_rate": 2.9348956849112598e-08, + "loss": 0.5453, + "step": 5107 + }, + { + "epoch": 5.713646532438479, + "grad_norm": 0.6267880797386169, + "learning_rate": 2.912144824076063e-08, + "loss": 0.5054, + "step": 5108 + }, + { + "epoch": 5.714765100671141, + "grad_norm": 0.6227915287017822, + "learning_rate": 2.889481970677316e-08, + "loss": 0.5265, + "step": 5109 + }, + { + "epoch": 5.715883668903803, + "grad_norm": 0.6219407916069031, + "learning_rate": 2.866907132787089e-08, + "loss": 0.516, + "step": 5110 + }, + { + "epoch": 5.717002237136465, + "grad_norm": 0.6320464611053467, + "learning_rate": 2.844420318446034e-08, + "loss": 0.525, + "step": 5111 + }, + { + "epoch": 5.718120805369128, + "grad_norm": 0.6308215856552124, + "learning_rate": 2.8220215356634662e-08, + "loss": 0.4999, + "step": 5112 + }, + { + "epoch": 5.7192393736017895, + "grad_norm": 0.630021870136261, + "learning_rate": 2.799710792417393e-08, + "loss": 0.523, + "step": 5113 + }, + { + "epoch": 5.720357941834452, + "grad_norm": 0.6303372979164124, + "learning_rate": 2.7774880966544016e-08, + "loss": 0.5256, + "step": 5114 + }, + { + "epoch": 5.721476510067114, + "grad_norm": 0.6250762343406677, + "learning_rate": 2.7553534562897443e-08, + "loss": 0.542, + "step": 5115 + }, + { + "epoch": 5.7225950782997765, + "grad_norm": 0.6314460635185242, + "learning_rate": 2.733306879207337e-08, + "loss": 0.5252, + "step": 5116 + }, + { + "epoch": 5.723713646532438, + "grad_norm": 0.6296803951263428, + "learning_rate": 2.711348373259648e-08, + "loss": 0.5175, + "step": 5117 + }, + { + "epoch": 5.724832214765101, + "grad_norm": 0.6198580265045166, + "learning_rate": 2.6894779462678654e-08, + "loss": 0.5035, + "step": 5118 + }, + { + "epoch": 5.725950782997763, + "grad_norm": 0.6248620748519897, + "learning_rate": 2.6676956060217584e-08, + "loss": 0.517, + "step": 5119 + }, + { + "epoch": 5.727069351230425, + "grad_norm": 0.6448245048522949, + "learning_rate": 2.6460013602797595e-08, + "loss": 0.5053, + "step": 5120 + }, + { + "epoch": 5.728187919463087, + "grad_norm": 0.6312618255615234, + "learning_rate": 2.6243952167688546e-08, + "loss": 0.5432, + "step": 5121 + }, + { + "epoch": 5.72930648769575, + "grad_norm": 0.6253435015678406, + "learning_rate": 2.602877183184721e-08, + "loss": 0.5216, + "step": 5122 + }, + { + "epoch": 5.730425055928412, + "grad_norm": 0.625035285949707, + "learning_rate": 2.5814472671916168e-08, + "loss": 0.512, + "step": 5123 + }, + { + "epoch": 5.731543624161074, + "grad_norm": 0.6291087865829468, + "learning_rate": 2.5601054764224365e-08, + "loss": 0.5331, + "step": 5124 + }, + { + "epoch": 5.732662192393736, + "grad_norm": 0.6296796798706055, + "learning_rate": 2.538851818478655e-08, + "loss": 0.544, + "step": 5125 + }, + { + "epoch": 5.733780760626399, + "grad_norm": 0.6059831380844116, + "learning_rate": 2.517686300930383e-08, + "loss": 0.4851, + "step": 5126 + }, + { + "epoch": 5.73489932885906, + "grad_norm": 0.6191352009773254, + "learning_rate": 2.4966089313162856e-08, + "loss": 0.5241, + "step": 5127 + }, + { + "epoch": 5.736017897091722, + "grad_norm": 0.6260369420051575, + "learning_rate": 2.4756197171437457e-08, + "loss": 0.5166, + "step": 5128 + }, + { + "epoch": 5.737136465324385, + "grad_norm": 0.6300110816955566, + "learning_rate": 2.454718665888589e-08, + "loss": 0.514, + "step": 5129 + }, + { + "epoch": 5.7382550335570475, + "grad_norm": 0.635215699672699, + "learning_rate": 2.433905784995416e-08, + "loss": 0.5289, + "step": 5130 + }, + { + "epoch": 5.739373601789709, + "grad_norm": 0.6063308715820312, + "learning_rate": 2.4131810818772973e-08, + "loss": 0.5036, + "step": 5131 + }, + { + "epoch": 5.740492170022371, + "grad_norm": 0.6204779148101807, + "learning_rate": 2.392544563915883e-08, + "loss": 0.537, + "step": 5132 + }, + { + "epoch": 5.741610738255034, + "grad_norm": 0.6401379108428955, + "learning_rate": 2.3719962384615434e-08, + "loss": 0.5053, + "step": 5133 + }, + { + "epoch": 5.742729306487695, + "grad_norm": 0.6216762661933899, + "learning_rate": 2.3515361128330916e-08, + "loss": 0.5321, + "step": 5134 + }, + { + "epoch": 5.743847874720358, + "grad_norm": 0.6157628297805786, + "learning_rate": 2.3311641943180308e-08, + "loss": 0.5337, + "step": 5135 + }, + { + "epoch": 5.74496644295302, + "grad_norm": 0.6237843632698059, + "learning_rate": 2.3108804901723624e-08, + "loss": 0.5188, + "step": 5136 + }, + { + "epoch": 5.7460850111856825, + "grad_norm": 0.6436848640441895, + "learning_rate": 2.2906850076207243e-08, + "loss": 0.5229, + "step": 5137 + }, + { + "epoch": 5.747203579418344, + "grad_norm": 0.6411592960357666, + "learning_rate": 2.2705777538563344e-08, + "loss": 0.5357, + "step": 5138 + }, + { + "epoch": 5.748322147651007, + "grad_norm": 0.637458324432373, + "learning_rate": 2.250558736040992e-08, + "loss": 0.5166, + "step": 5139 + }, + { + "epoch": 5.749440715883669, + "grad_norm": 0.6265445947647095, + "learning_rate": 2.230627961304993e-08, + "loss": 0.5255, + "step": 5140 + }, + { + "epoch": 5.750559284116331, + "grad_norm": 0.6198586821556091, + "learning_rate": 2.2107854367472702e-08, + "loss": 0.5183, + "step": 5141 + }, + { + "epoch": 5.751677852348993, + "grad_norm": 0.627483606338501, + "learning_rate": 2.1910311694353093e-08, + "loss": 0.5242, + "step": 5142 + }, + { + "epoch": 5.752796420581656, + "grad_norm": 0.5981237888336182, + "learning_rate": 2.171365166405176e-08, + "loss": 0.5148, + "step": 5143 + }, + { + "epoch": 5.7539149888143175, + "grad_norm": 0.6447716951370239, + "learning_rate": 2.151787434661462e-08, + "loss": 0.5301, + "step": 5144 + }, + { + "epoch": 5.75503355704698, + "grad_norm": 0.6264230608940125, + "learning_rate": 2.132297981177367e-08, + "loss": 0.5297, + "step": 5145 + }, + { + "epoch": 5.756152125279642, + "grad_norm": 0.6270563006401062, + "learning_rate": 2.112896812894588e-08, + "loss": 0.5155, + "step": 5146 + }, + { + "epoch": 5.757270693512305, + "grad_norm": 0.6461653709411621, + "learning_rate": 2.0935839367234034e-08, + "loss": 0.5391, + "step": 5147 + }, + { + "epoch": 5.758389261744966, + "grad_norm": 0.6112791895866394, + "learning_rate": 2.074359359542699e-08, + "loss": 0.5012, + "step": 5148 + }, + { + "epoch": 5.759507829977629, + "grad_norm": 0.6347304582595825, + "learning_rate": 2.0552230881998035e-08, + "loss": 0.5434, + "step": 5149 + }, + { + "epoch": 5.760626398210291, + "grad_norm": 0.6254553198814392, + "learning_rate": 2.036175129510709e-08, + "loss": 0.5466, + "step": 5150 + }, + { + "epoch": 5.7617449664429525, + "grad_norm": 0.620561957359314, + "learning_rate": 2.017215490259822e-08, + "loss": 0.5148, + "step": 5151 + }, + { + "epoch": 5.762863534675615, + "grad_norm": 0.626850426197052, + "learning_rate": 1.998344177200212e-08, + "loss": 0.5343, + "step": 5152 + }, + { + "epoch": 5.763982102908278, + "grad_norm": 0.6148296594619751, + "learning_rate": 1.97956119705342e-08, + "loss": 0.4871, + "step": 5153 + }, + { + "epoch": 5.76510067114094, + "grad_norm": 0.6107701659202576, + "learning_rate": 1.9608665565095387e-08, + "loss": 0.4997, + "step": 5154 + }, + { + "epoch": 5.766219239373601, + "grad_norm": 0.6341060400009155, + "learning_rate": 1.9422602622272414e-08, + "loss": 0.5363, + "step": 5155 + }, + { + "epoch": 5.767337807606264, + "grad_norm": 0.6316196322441101, + "learning_rate": 1.9237423208336437e-08, + "loss": 0.5237, + "step": 5156 + }, + { + "epoch": 5.768456375838926, + "grad_norm": 0.6486344933509827, + "learning_rate": 1.9053127389244696e-08, + "loss": 0.5139, + "step": 5157 + }, + { + "epoch": 5.769574944071588, + "grad_norm": 0.6293314099311829, + "learning_rate": 1.8869715230639407e-08, + "loss": 0.52, + "step": 5158 + }, + { + "epoch": 5.77069351230425, + "grad_norm": 0.6096522212028503, + "learning_rate": 1.8687186797848034e-08, + "loss": 0.4861, + "step": 5159 + }, + { + "epoch": 5.771812080536913, + "grad_norm": 0.6466556787490845, + "learning_rate": 1.8505542155883295e-08, + "loss": 0.5326, + "step": 5160 + }, + { + "epoch": 5.772930648769575, + "grad_norm": 0.6275053024291992, + "learning_rate": 1.8324781369443157e-08, + "loss": 0.5098, + "step": 5161 + }, + { + "epoch": 5.774049217002237, + "grad_norm": 0.6209127306938171, + "learning_rate": 1.8144904502910844e-08, + "loss": 0.4968, + "step": 5162 + }, + { + "epoch": 5.775167785234899, + "grad_norm": 0.6278755068778992, + "learning_rate": 1.7965911620354548e-08, + "loss": 0.5219, + "step": 5163 + }, + { + "epoch": 5.776286353467562, + "grad_norm": 0.6085246205329895, + "learning_rate": 1.7787802785527996e-08, + "loss": 0.5141, + "step": 5164 + }, + { + "epoch": 5.7774049217002235, + "grad_norm": 0.6440350413322449, + "learning_rate": 1.7610578061869608e-08, + "loss": 0.5321, + "step": 5165 + }, + { + "epoch": 5.778523489932886, + "grad_norm": 0.6306285858154297, + "learning_rate": 1.7434237512502782e-08, + "loss": 0.5118, + "step": 5166 + }, + { + "epoch": 5.779642058165548, + "grad_norm": 0.64028000831604, + "learning_rate": 1.725878120023644e-08, + "loss": 0.511, + "step": 5167 + }, + { + "epoch": 5.7807606263982105, + "grad_norm": 0.6209614276885986, + "learning_rate": 1.708420918756476e-08, + "loss": 0.5167, + "step": 5168 + }, + { + "epoch": 5.781879194630872, + "grad_norm": 0.6197549104690552, + "learning_rate": 1.6910521536666058e-08, + "loss": 0.511, + "step": 5169 + }, + { + "epoch": 5.782997762863535, + "grad_norm": 0.6322024464607239, + "learning_rate": 1.673771830940446e-08, + "loss": 0.5315, + "step": 5170 + }, + { + "epoch": 5.784116331096197, + "grad_norm": 0.6358757615089417, + "learning_rate": 1.656579956732851e-08, + "loss": 0.542, + "step": 5171 + }, + { + "epoch": 5.785234899328859, + "grad_norm": 0.6236332654953003, + "learning_rate": 1.639476537167256e-08, + "loss": 0.5123, + "step": 5172 + }, + { + "epoch": 5.786353467561521, + "grad_norm": 0.615127682685852, + "learning_rate": 1.6224615783355102e-08, + "loss": 0.5158, + "step": 5173 + }, + { + "epoch": 5.787472035794184, + "grad_norm": 0.6324767470359802, + "learning_rate": 1.6055350862979325e-08, + "loss": 0.5246, + "step": 5174 + }, + { + "epoch": 5.7885906040268456, + "grad_norm": 0.6124656200408936, + "learning_rate": 1.58869706708345e-08, + "loss": 0.5088, + "step": 5175 + }, + { + "epoch": 5.789709172259508, + "grad_norm": 0.6163376569747925, + "learning_rate": 1.571947526689349e-08, + "loss": 0.5104, + "step": 5176 + }, + { + "epoch": 5.79082774049217, + "grad_norm": 0.6035485863685608, + "learning_rate": 1.555286471081524e-08, + "loss": 0.492, + "step": 5177 + }, + { + "epoch": 5.791946308724832, + "grad_norm": 0.6062201857566833, + "learning_rate": 1.5387139061942003e-08, + "loss": 0.5305, + "step": 5178 + }, + { + "epoch": 5.793064876957494, + "grad_norm": 0.6089250445365906, + "learning_rate": 1.522229837930267e-08, + "loss": 0.4853, + "step": 5179 + }, + { + "epoch": 5.794183445190157, + "grad_norm": 0.6465957760810852, + "learning_rate": 1.5058342721609164e-08, + "loss": 0.5424, + "step": 5180 + }, + { + "epoch": 5.795302013422819, + "grad_norm": 0.6409919857978821, + "learning_rate": 1.4895272147259498e-08, + "loss": 0.5251, + "step": 5181 + }, + { + "epoch": 5.796420581655481, + "grad_norm": 0.627995491027832, + "learning_rate": 1.4733086714336099e-08, + "loss": 0.5172, + "step": 5182 + }, + { + "epoch": 5.797539149888143, + "grad_norm": 0.6262001395225525, + "learning_rate": 1.4571786480605532e-08, + "loss": 0.5519, + "step": 5183 + }, + { + "epoch": 5.798657718120805, + "grad_norm": 0.6246026158332825, + "learning_rate": 1.4411371503519622e-08, + "loss": 0.5342, + "step": 5184 + }, + { + "epoch": 5.799776286353468, + "grad_norm": 0.6211978197097778, + "learning_rate": 1.4251841840214608e-08, + "loss": 0.4994, + "step": 5185 + }, + { + "epoch": 5.800894854586129, + "grad_norm": 0.6139915585517883, + "learning_rate": 1.4093197547511983e-08, + "loss": 0.5367, + "step": 5186 + }, + { + "epoch": 5.802013422818792, + "grad_norm": 0.6329701542854309, + "learning_rate": 1.3935438681917102e-08, + "loss": 0.528, + "step": 5187 + }, + { + "epoch": 5.803131991051454, + "grad_norm": 0.6159714460372925, + "learning_rate": 1.377856529962085e-08, + "loss": 0.5118, + "step": 5188 + }, + { + "epoch": 5.8042505592841165, + "grad_norm": 0.6241361498832703, + "learning_rate": 1.3622577456497422e-08, + "loss": 0.5358, + "step": 5189 + }, + { + "epoch": 5.805369127516778, + "grad_norm": 0.6353749632835388, + "learning_rate": 1.3467475208107095e-08, + "loss": 0.5067, + "step": 5190 + }, + { + "epoch": 5.806487695749441, + "grad_norm": 0.6253808736801147, + "learning_rate": 1.331325860969318e-08, + "loss": 0.4928, + "step": 5191 + }, + { + "epoch": 5.807606263982103, + "grad_norm": 0.6383617520332336, + "learning_rate": 1.315992771618535e-08, + "loss": 0.5145, + "step": 5192 + }, + { + "epoch": 5.808724832214765, + "grad_norm": 0.6297013759613037, + "learning_rate": 1.3007482582195752e-08, + "loss": 0.508, + "step": 5193 + }, + { + "epoch": 5.809843400447427, + "grad_norm": 0.6242703795433044, + "learning_rate": 1.2855923262022895e-08, + "loss": 0.5307, + "step": 5194 + }, + { + "epoch": 5.81096196868009, + "grad_norm": 0.6335163712501526, + "learning_rate": 1.2705249809648879e-08, + "loss": 0.5176, + "step": 5195 + }, + { + "epoch": 5.8120805369127515, + "grad_norm": 0.595480740070343, + "learning_rate": 1.255546227873966e-08, + "loss": 0.5021, + "step": 5196 + }, + { + "epoch": 5.813199105145414, + "grad_norm": 0.6208020448684692, + "learning_rate": 1.2406560722647565e-08, + "loss": 0.5267, + "step": 5197 + }, + { + "epoch": 5.814317673378076, + "grad_norm": 0.639262318611145, + "learning_rate": 1.2258545194407112e-08, + "loss": 0.5411, + "step": 5198 + }, + { + "epoch": 5.815436241610739, + "grad_norm": 0.6337928175926208, + "learning_rate": 1.2111415746738908e-08, + "loss": 0.5186, + "step": 5199 + }, + { + "epoch": 5.8165548098434, + "grad_norm": 0.6427651047706604, + "learning_rate": 1.1965172432046868e-08, + "loss": 0.5482, + "step": 5200 + }, + { + "epoch": 5.817673378076063, + "grad_norm": 0.6437339782714844, + "learning_rate": 1.1819815302420157e-08, + "loss": 0.5525, + "step": 5201 + }, + { + "epoch": 5.818791946308725, + "grad_norm": 0.6385146379470825, + "learning_rate": 1.1675344409631528e-08, + "loss": 0.533, + "step": 5202 + }, + { + "epoch": 5.819910514541387, + "grad_norm": 0.6076056957244873, + "learning_rate": 1.1531759805138987e-08, + "loss": 0.5113, + "step": 5203 + }, + { + "epoch": 5.821029082774049, + "grad_norm": 0.6523064374923706, + "learning_rate": 1.1389061540083568e-08, + "loss": 0.5326, + "step": 5204 + }, + { + "epoch": 5.822147651006711, + "grad_norm": 0.6465266346931458, + "learning_rate": 1.1247249665291837e-08, + "loss": 0.5325, + "step": 5205 + }, + { + "epoch": 5.823266219239374, + "grad_norm": 0.6119315028190613, + "learning_rate": 1.1106324231274224e-08, + "loss": 0.5084, + "step": 5206 + }, + { + "epoch": 5.824384787472036, + "grad_norm": 0.6254585385322571, + "learning_rate": 1.0966285288225298e-08, + "loss": 0.5125, + "step": 5207 + }, + { + "epoch": 5.825503355704698, + "grad_norm": 0.6371309757232666, + "learning_rate": 1.0827132886023772e-08, + "loss": 0.5279, + "step": 5208 + }, + { + "epoch": 5.82662192393736, + "grad_norm": 0.6140681505203247, + "learning_rate": 1.0688867074232778e-08, + "loss": 0.4994, + "step": 5209 + }, + { + "epoch": 5.827740492170022, + "grad_norm": 0.6336174011230469, + "learning_rate": 1.0551487902100143e-08, + "loss": 0.5262, + "step": 5210 + }, + { + "epoch": 5.828859060402684, + "grad_norm": 0.6236572861671448, + "learning_rate": 1.041499541855645e-08, + "loss": 0.523, + "step": 5211 + }, + { + "epoch": 5.829977628635347, + "grad_norm": 0.6266209483146667, + "learning_rate": 1.0279389672218366e-08, + "loss": 0.511, + "step": 5212 + }, + { + "epoch": 5.831096196868009, + "grad_norm": 0.6199517846107483, + "learning_rate": 1.0144670711385307e-08, + "loss": 0.5293, + "step": 5213 + }, + { + "epoch": 5.832214765100671, + "grad_norm": 0.632702648639679, + "learning_rate": 1.0010838584041394e-08, + "loss": 0.5178, + "step": 5214 + }, + { + "epoch": 5.833333333333333, + "grad_norm": 0.6273003816604614, + "learning_rate": 9.877893337854882e-09, + "loss": 0.5285, + "step": 5215 + }, + { + "epoch": 5.834451901565996, + "grad_norm": 0.6303425431251526, + "learning_rate": 9.745835020177619e-09, + "loss": 0.5285, + "step": 5216 + }, + { + "epoch": 5.8355704697986575, + "grad_norm": 0.6234087944030762, + "learning_rate": 9.614663678046698e-09, + "loss": 0.5285, + "step": 5217 + }, + { + "epoch": 5.83668903803132, + "grad_norm": 0.6359259486198425, + "learning_rate": 9.484379358181695e-09, + "loss": 0.5399, + "step": 5218 + }, + { + "epoch": 5.837807606263982, + "grad_norm": 0.6433168649673462, + "learning_rate": 9.354982106987986e-09, + "loss": 0.5142, + "step": 5219 + }, + { + "epoch": 5.8389261744966445, + "grad_norm": 0.6456252336502075, + "learning_rate": 9.22647197055343e-09, + "loss": 0.5376, + "step": 5220 + }, + { + "epoch": 5.840044742729306, + "grad_norm": 0.6399451494216919, + "learning_rate": 9.098848994650855e-09, + "loss": 0.5336, + "step": 5221 + }, + { + "epoch": 5.841163310961969, + "grad_norm": 0.615469753742218, + "learning_rate": 8.972113224736956e-09, + "loss": 0.5118, + "step": 5222 + }, + { + "epoch": 5.842281879194631, + "grad_norm": 0.6215211749076843, + "learning_rate": 8.84626470595229e-09, + "loss": 0.512, + "step": 5223 + }, + { + "epoch": 5.843400447427293, + "grad_norm": 0.5972394943237305, + "learning_rate": 8.721303483121002e-09, + "loss": 0.4822, + "step": 5224 + }, + { + "epoch": 5.844519015659955, + "grad_norm": 0.6272586584091187, + "learning_rate": 8.597229600752487e-09, + "loss": 0.5159, + "step": 5225 + }, + { + "epoch": 5.845637583892618, + "grad_norm": 0.6226951479911804, + "learning_rate": 8.474043103038343e-09, + "loss": 0.5394, + "step": 5226 + }, + { + "epoch": 5.8467561521252795, + "grad_norm": 0.6463093161582947, + "learning_rate": 8.351744033855413e-09, + "loss": 0.5242, + "step": 5227 + }, + { + "epoch": 5.847874720357942, + "grad_norm": 0.6293492913246155, + "learning_rate": 8.230332436764132e-09, + "loss": 0.5132, + "step": 5228 + }, + { + "epoch": 5.848993288590604, + "grad_norm": 0.6357239484786987, + "learning_rate": 8.109808355008242e-09, + "loss": 0.5174, + "step": 5229 + }, + { + "epoch": 5.850111856823267, + "grad_norm": 0.6456530690193176, + "learning_rate": 7.990171831516457e-09, + "loss": 0.5115, + "step": 5230 + }, + { + "epoch": 5.851230425055928, + "grad_norm": 0.6421802639961243, + "learning_rate": 7.87142290890025e-09, + "loss": 0.5221, + "step": 5231 + }, + { + "epoch": 5.85234899328859, + "grad_norm": 0.6310228705406189, + "learning_rate": 7.753561629455786e-09, + "loss": 0.514, + "step": 5232 + }, + { + "epoch": 5.853467561521253, + "grad_norm": 0.6247232556343079, + "learning_rate": 7.63658803516254e-09, + "loss": 0.5239, + "step": 5233 + }, + { + "epoch": 5.8545861297539155, + "grad_norm": 0.6036512851715088, + "learning_rate": 7.52050216768413e-09, + "loss": 0.5178, + "step": 5234 + }, + { + "epoch": 5.855704697986577, + "grad_norm": 0.632284939289093, + "learning_rate": 7.405304068368035e-09, + "loss": 0.5478, + "step": 5235 + }, + { + "epoch": 5.856823266219239, + "grad_norm": 0.6284492015838623, + "learning_rate": 7.2909937782450476e-09, + "loss": 0.5409, + "step": 5236 + }, + { + "epoch": 5.857941834451902, + "grad_norm": 0.6219998598098755, + "learning_rate": 7.177571338030098e-09, + "loss": 0.5154, + "step": 5237 + }, + { + "epoch": 5.859060402684563, + "grad_norm": 0.6336826682090759, + "learning_rate": 7.065036788122259e-09, + "loss": 0.5296, + "step": 5238 + }, + { + "epoch": 5.860178970917226, + "grad_norm": 0.6352509260177612, + "learning_rate": 6.953390168603358e-09, + "loss": 0.5221, + "step": 5239 + }, + { + "epoch": 5.861297539149888, + "grad_norm": 0.6311947703361511, + "learning_rate": 6.842631519239917e-09, + "loss": 0.5224, + "step": 5240 + }, + { + "epoch": 5.8624161073825505, + "grad_norm": 0.6172339916229248, + "learning_rate": 6.73276087948177e-09, + "loss": 0.4922, + "step": 5241 + }, + { + "epoch": 5.863534675615212, + "grad_norm": 0.6095704436302185, + "learning_rate": 6.623778288462335e-09, + "loss": 0.496, + "step": 5242 + }, + { + "epoch": 5.864653243847875, + "grad_norm": 0.6246845722198486, + "learning_rate": 6.515683784999172e-09, + "loss": 0.4954, + "step": 5243 + }, + { + "epoch": 5.865771812080537, + "grad_norm": 0.6196863651275635, + "learning_rate": 6.408477407592872e-09, + "loss": 0.5252, + "step": 5244 + }, + { + "epoch": 5.866890380313199, + "grad_norm": 0.6096441745758057, + "learning_rate": 6.30215919442817e-09, + "loss": 0.4739, + "step": 5245 + }, + { + "epoch": 5.868008948545861, + "grad_norm": 0.6055436134338379, + "learning_rate": 6.196729183373662e-09, + "loss": 0.5153, + "step": 5246 + }, + { + "epoch": 5.869127516778524, + "grad_norm": 0.6252949833869934, + "learning_rate": 6.092187411981254e-09, + "loss": 0.5156, + "step": 5247 + }, + { + "epoch": 5.8702460850111855, + "grad_norm": 0.6511937379837036, + "learning_rate": 5.988533917485884e-09, + "loss": 0.5366, + "step": 5248 + }, + { + "epoch": 5.871364653243848, + "grad_norm": 0.6183834075927734, + "learning_rate": 5.885768736807462e-09, + "loss": 0.5512, + "step": 5249 + }, + { + "epoch": 5.87248322147651, + "grad_norm": 0.629889726638794, + "learning_rate": 5.783891906548378e-09, + "loss": 0.5351, + "step": 5250 + }, + { + "epoch": 5.873601789709173, + "grad_norm": 0.6106861233711243, + "learning_rate": 5.682903462994882e-09, + "loss": 0.5144, + "step": 5251 + }, + { + "epoch": 5.874720357941834, + "grad_norm": 0.6361641883850098, + "learning_rate": 5.582803442117091e-09, + "loss": 0.5379, + "step": 5252 + }, + { + "epoch": 5.875838926174497, + "grad_norm": 0.6335686445236206, + "learning_rate": 5.483591879568706e-09, + "loss": 0.5332, + "step": 5253 + }, + { + "epoch": 5.876957494407159, + "grad_norm": 0.6138365268707275, + "learning_rate": 5.385268810686462e-09, + "loss": 0.5244, + "step": 5254 + }, + { + "epoch": 5.878076062639821, + "grad_norm": 0.6017154455184937, + "learning_rate": 5.2878342704909546e-09, + "loss": 0.518, + "step": 5255 + }, + { + "epoch": 5.879194630872483, + "grad_norm": 0.6192603707313538, + "learning_rate": 5.1912882936863695e-09, + "loss": 0.5265, + "step": 5256 + }, + { + "epoch": 5.880313199105146, + "grad_norm": 0.5921730995178223, + "learning_rate": 5.0956309146604764e-09, + "loss": 0.4876, + "step": 5257 + }, + { + "epoch": 5.881431767337808, + "grad_norm": 0.6136103868484497, + "learning_rate": 5.000862167484077e-09, + "loss": 0.5069, + "step": 5258 + }, + { + "epoch": 5.882550335570469, + "grad_norm": 0.6195256114006042, + "learning_rate": 4.9069820859121155e-09, + "loss": 0.5294, + "step": 5259 + }, + { + "epoch": 5.883668903803132, + "grad_norm": 0.6202780604362488, + "learning_rate": 4.813990703382565e-09, + "loss": 0.5015, + "step": 5260 + }, + { + "epoch": 5.884787472035795, + "grad_norm": 0.6053805351257324, + "learning_rate": 4.721888053016987e-09, + "loss": 0.4866, + "step": 5261 + }, + { + "epoch": 5.885906040268456, + "grad_norm": 0.6332124471664429, + "learning_rate": 4.630674167620253e-09, + "loss": 0.5265, + "step": 5262 + }, + { + "epoch": 5.887024608501118, + "grad_norm": 0.6163149476051331, + "learning_rate": 4.540349079680817e-09, + "loss": 0.5003, + "step": 5263 + }, + { + "epoch": 5.888143176733781, + "grad_norm": 0.6262108087539673, + "learning_rate": 4.450912821370723e-09, + "loss": 0.5111, + "step": 5264 + }, + { + "epoch": 5.889261744966443, + "grad_norm": 0.6393499374389648, + "learning_rate": 4.362365424545046e-09, + "loss": 0.5444, + "step": 5265 + }, + { + "epoch": 5.890380313199105, + "grad_norm": 0.6380621194839478, + "learning_rate": 4.274706920742721e-09, + "loss": 0.5494, + "step": 5266 + }, + { + "epoch": 5.891498881431767, + "grad_norm": 0.6467360258102417, + "learning_rate": 4.187937341185999e-09, + "loss": 0.5261, + "step": 5267 + }, + { + "epoch": 5.89261744966443, + "grad_norm": 0.6211003065109253, + "learning_rate": 4.102056716779601e-09, + "loss": 0.507, + "step": 5268 + }, + { + "epoch": 5.8937360178970915, + "grad_norm": 0.6351854801177979, + "learning_rate": 4.017065078113225e-09, + "loss": 0.5369, + "step": 5269 + }, + { + "epoch": 5.894854586129754, + "grad_norm": 0.6028649806976318, + "learning_rate": 3.932962455458489e-09, + "loss": 0.5238, + "step": 5270 + }, + { + "epoch": 5.895973154362416, + "grad_norm": 0.6461777091026306, + "learning_rate": 3.849748878771431e-09, + "loss": 0.5195, + "step": 5271 + }, + { + "epoch": 5.8970917225950785, + "grad_norm": 0.6289148330688477, + "learning_rate": 3.767424377690565e-09, + "loss": 0.5359, + "step": 5272 + }, + { + "epoch": 5.89821029082774, + "grad_norm": 0.6080505847930908, + "learning_rate": 3.6859889815385464e-09, + "loss": 0.5113, + "step": 5273 + }, + { + "epoch": 5.899328859060403, + "grad_norm": 0.6087728142738342, + "learning_rate": 3.605442719320229e-09, + "loss": 0.4829, + "step": 5274 + }, + { + "epoch": 5.900447427293065, + "grad_norm": 0.6288046836853027, + "learning_rate": 3.525785619725164e-09, + "loss": 0.5186, + "step": 5275 + }, + { + "epoch": 5.901565995525727, + "grad_norm": 0.6317957639694214, + "learning_rate": 3.447017711125378e-09, + "loss": 0.531, + "step": 5276 + }, + { + "epoch": 5.902684563758389, + "grad_norm": 0.6177731156349182, + "learning_rate": 3.369139021575929e-09, + "loss": 0.518, + "step": 5277 + }, + { + "epoch": 5.903803131991052, + "grad_norm": 0.6452380418777466, + "learning_rate": 3.2921495788160173e-09, + "loss": 0.516, + "step": 5278 + }, + { + "epoch": 5.9049217002237135, + "grad_norm": 0.6196131110191345, + "learning_rate": 3.2160494102675966e-09, + "loss": 0.5373, + "step": 5279 + }, + { + "epoch": 5.906040268456376, + "grad_norm": 0.6229770183563232, + "learning_rate": 3.1408385430356513e-09, + "loss": 0.5141, + "step": 5280 + }, + { + "epoch": 5.907158836689038, + "grad_norm": 0.6274929642677307, + "learning_rate": 3.0665170039087532e-09, + "loss": 0.5218, + "step": 5281 + }, + { + "epoch": 5.9082774049217, + "grad_norm": 0.6193037629127502, + "learning_rate": 2.9930848193587825e-09, + "loss": 0.5097, + "step": 5282 + }, + { + "epoch": 5.909395973154362, + "grad_norm": 0.6319540143013, + "learning_rate": 2.920542015540928e-09, + "loss": 0.5021, + "step": 5283 + }, + { + "epoch": 5.910514541387025, + "grad_norm": 0.6170830130577087, + "learning_rate": 2.8488886182928553e-09, + "loss": 0.5224, + "step": 5284 + }, + { + "epoch": 5.911633109619687, + "grad_norm": 0.619215726852417, + "learning_rate": 2.7781246531363717e-09, + "loss": 0.4958, + "step": 5285 + }, + { + "epoch": 5.912751677852349, + "grad_norm": 0.6256300806999207, + "learning_rate": 2.7082501452757594e-09, + "loss": 0.5257, + "step": 5286 + }, + { + "epoch": 5.913870246085011, + "grad_norm": 0.614615261554718, + "learning_rate": 2.639265119599166e-09, + "loss": 0.5157, + "step": 5287 + }, + { + "epoch": 5.914988814317674, + "grad_norm": 0.6297681331634521, + "learning_rate": 2.5711696006777697e-09, + "loss": 0.5236, + "step": 5288 + }, + { + "epoch": 5.916107382550336, + "grad_norm": 0.6178224682807922, + "learning_rate": 2.5039636127652258e-09, + "loss": 0.5399, + "step": 5289 + }, + { + "epoch": 5.917225950782997, + "grad_norm": 0.6354064345359802, + "learning_rate": 2.4376471797990518e-09, + "loss": 0.5156, + "step": 5290 + }, + { + "epoch": 5.91834451901566, + "grad_norm": 0.6362186670303345, + "learning_rate": 2.3722203253997987e-09, + "loss": 0.5096, + "step": 5291 + }, + { + "epoch": 5.919463087248322, + "grad_norm": 0.6331197619438171, + "learning_rate": 2.307683072871325e-09, + "loss": 0.5112, + "step": 5292 + }, + { + "epoch": 5.9205816554809845, + "grad_norm": 0.607315182685852, + "learning_rate": 2.244035445199966e-09, + "loss": 0.517, + "step": 5293 + }, + { + "epoch": 5.921700223713646, + "grad_norm": 0.6437689661979675, + "learning_rate": 2.1812774650561973e-09, + "loss": 0.5305, + "step": 5294 + }, + { + "epoch": 5.922818791946309, + "grad_norm": 0.6246294379234314, + "learning_rate": 2.119409154792695e-09, + "loss": 0.5113, + "step": 5295 + }, + { + "epoch": 5.923937360178971, + "grad_norm": 0.6368975639343262, + "learning_rate": 2.0584305364457214e-09, + "loss": 0.5221, + "step": 5296 + }, + { + "epoch": 5.925055928411633, + "grad_norm": 0.6122807264328003, + "learning_rate": 1.9983416317345683e-09, + "loss": 0.5139, + "step": 5297 + }, + { + "epoch": 5.926174496644295, + "grad_norm": 0.6223189830780029, + "learning_rate": 1.9391424620615605e-09, + "loss": 0.5069, + "step": 5298 + }, + { + "epoch": 5.927293064876958, + "grad_norm": 0.610745370388031, + "learning_rate": 1.8808330485123315e-09, + "loss": 0.529, + "step": 5299 + }, + { + "epoch": 5.9284116331096195, + "grad_norm": 0.6255449652671814, + "learning_rate": 1.8234134118552682e-09, + "loss": 0.4946, + "step": 5300 + }, + { + "epoch": 5.929530201342282, + "grad_norm": 0.5994164347648621, + "learning_rate": 1.766883572542344e-09, + "loss": 0.5165, + "step": 5301 + }, + { + "epoch": 5.930648769574944, + "grad_norm": 0.6134458780288696, + "learning_rate": 1.7112435507080084e-09, + "loss": 0.4978, + "step": 5302 + }, + { + "epoch": 5.931767337807607, + "grad_norm": 0.6228348612785339, + "learning_rate": 1.65649336617002e-09, + "loss": 0.5365, + "step": 5303 + }, + { + "epoch": 5.932885906040268, + "grad_norm": 0.632337212562561, + "learning_rate": 1.6026330384294464e-09, + "loss": 0.488, + "step": 5304 + }, + { + "epoch": 5.934004474272931, + "grad_norm": 0.635128915309906, + "learning_rate": 1.5496625866701087e-09, + "loss": 0.5293, + "step": 5305 + }, + { + "epoch": 5.935123042505593, + "grad_norm": 0.6232084035873413, + "learning_rate": 1.4975820297585818e-09, + "loss": 0.5079, + "step": 5306 + }, + { + "epoch": 5.936241610738255, + "grad_norm": 0.649949848651886, + "learning_rate": 1.4463913862455825e-09, + "loss": 0.539, + "step": 5307 + }, + { + "epoch": 5.937360178970917, + "grad_norm": 0.628314733505249, + "learning_rate": 1.3960906743634706e-09, + "loss": 0.5372, + "step": 5308 + }, + { + "epoch": 5.938478747203579, + "grad_norm": 0.6106909513473511, + "learning_rate": 1.3466799120287477e-09, + "loss": 0.5152, + "step": 5309 + }, + { + "epoch": 5.939597315436242, + "grad_norm": 0.6156041026115417, + "learning_rate": 1.2981591168401142e-09, + "loss": 0.5156, + "step": 5310 + }, + { + "epoch": 5.940715883668904, + "grad_norm": 0.6326484680175781, + "learning_rate": 1.2505283060798568e-09, + "loss": 0.5426, + "step": 5311 + }, + { + "epoch": 5.941834451901566, + "grad_norm": 0.6201187968254089, + "learning_rate": 1.203787496713016e-09, + "loss": 0.5181, + "step": 5312 + }, + { + "epoch": 5.942953020134228, + "grad_norm": 0.6157323122024536, + "learning_rate": 1.1579367053876633e-09, + "loss": 0.4929, + "step": 5313 + }, + { + "epoch": 5.94407158836689, + "grad_norm": 0.6356390714645386, + "learning_rate": 1.1129759484351798e-09, + "loss": 0.5273, + "step": 5314 + }, + { + "epoch": 5.945190156599552, + "grad_norm": 0.6241071820259094, + "learning_rate": 1.0689052418688672e-09, + "loss": 0.5373, + "step": 5315 + }, + { + "epoch": 5.946308724832215, + "grad_norm": 0.6367274522781372, + "learning_rate": 1.0257246013864464e-09, + "loss": 0.5198, + "step": 5316 + }, + { + "epoch": 5.947427293064877, + "grad_norm": 0.6525731682777405, + "learning_rate": 9.834340423678368e-10, + "loss": 0.5223, + "step": 5317 + }, + { + "epoch": 5.948545861297539, + "grad_norm": 0.6122352480888367, + "learning_rate": 9.42033579875712e-10, + "loss": 0.5204, + "step": 5318 + }, + { + "epoch": 5.949664429530201, + "grad_norm": 0.6140770316123962, + "learning_rate": 9.015232286563314e-10, + "loss": 0.5144, + "step": 5319 + }, + { + "epoch": 5.950782997762864, + "grad_norm": 0.6173872351646423, + "learning_rate": 8.619030031387088e-10, + "loss": 0.5259, + "step": 5320 + }, + { + "epoch": 5.9519015659955254, + "grad_norm": 0.6160332560539246, + "learning_rate": 8.231729174343339e-10, + "loss": 0.511, + "step": 5321 + }, + { + "epoch": 5.953020134228188, + "grad_norm": 0.6415054202079773, + "learning_rate": 7.853329853385606e-10, + "loss": 0.5127, + "step": 5322 + }, + { + "epoch": 5.95413870246085, + "grad_norm": 0.639754056930542, + "learning_rate": 7.483832203286634e-10, + "loss": 0.5267, + "step": 5323 + }, + { + "epoch": 5.9552572706935125, + "grad_norm": 0.6301175355911255, + "learning_rate": 7.123236355655039e-10, + "loss": 0.5029, + "step": 5324 + }, + { + "epoch": 5.956375838926174, + "grad_norm": 0.6206248998641968, + "learning_rate": 6.771542438929745e-10, + "loss": 0.4976, + "step": 5325 + }, + { + "epoch": 5.957494407158837, + "grad_norm": 0.6185736060142517, + "learning_rate": 6.428750578374443e-10, + "loss": 0.5078, + "step": 5326 + }, + { + "epoch": 5.958612975391499, + "grad_norm": 0.619414210319519, + "learning_rate": 6.094860896083132e-10, + "loss": 0.506, + "step": 5327 + }, + { + "epoch": 5.959731543624161, + "grad_norm": 0.6052616238594055, + "learning_rate": 5.76987351098568e-10, + "loss": 0.4929, + "step": 5328 + }, + { + "epoch": 5.960850111856823, + "grad_norm": 0.6281414031982422, + "learning_rate": 5.453788538828386e-10, + "loss": 0.522, + "step": 5329 + }, + { + "epoch": 5.961968680089486, + "grad_norm": 0.6182817220687866, + "learning_rate": 5.146606092198969e-10, + "loss": 0.5152, + "step": 5330 + }, + { + "epoch": 5.9630872483221475, + "grad_norm": 0.6360781788825989, + "learning_rate": 4.848326280507132e-10, + "loss": 0.5004, + "step": 5331 + }, + { + "epoch": 5.96420581655481, + "grad_norm": 0.6343223452568054, + "learning_rate": 4.5589492099956667e-10, + "loss": 0.5442, + "step": 5332 + }, + { + "epoch": 5.965324384787472, + "grad_norm": 0.6362835764884949, + "learning_rate": 4.2784749837349037e-10, + "loss": 0.5425, + "step": 5333 + }, + { + "epoch": 5.966442953020135, + "grad_norm": 0.6407535672187805, + "learning_rate": 4.0069037016199353e-10, + "loss": 0.5242, + "step": 5334 + }, + { + "epoch": 5.967561521252796, + "grad_norm": 0.6314567923545837, + "learning_rate": 3.7442354603789423e-10, + "loss": 0.5337, + "step": 5335 + }, + { + "epoch": 5.968680089485458, + "grad_norm": 0.6194212436676025, + "learning_rate": 3.490470353573194e-10, + "loss": 0.5034, + "step": 5336 + }, + { + "epoch": 5.969798657718121, + "grad_norm": 0.6163673400878906, + "learning_rate": 3.245608471588724e-10, + "loss": 0.5076, + "step": 5337 + }, + { + "epoch": 5.9709172259507834, + "grad_norm": 0.6216757297515869, + "learning_rate": 3.009649901633549e-10, + "loss": 0.5026, + "step": 5338 + }, + { + "epoch": 5.972035794183445, + "grad_norm": 0.6332550644874573, + "learning_rate": 2.782594727757104e-10, + "loss": 0.5388, + "step": 5339 + }, + { + "epoch": 5.973154362416107, + "grad_norm": 0.6377925276756287, + "learning_rate": 2.564443030828034e-10, + "loss": 0.5262, + "step": 5340 + }, + { + "epoch": 5.97427293064877, + "grad_norm": 0.6142863631248474, + "learning_rate": 2.3551948885480735e-10, + "loss": 0.514, + "step": 5341 + }, + { + "epoch": 5.975391498881431, + "grad_norm": 0.6186676025390625, + "learning_rate": 2.1548503754492689e-10, + "loss": 0.5305, + "step": 5342 + }, + { + "epoch": 5.976510067114094, + "grad_norm": 0.6159397959709167, + "learning_rate": 1.9634095628884297e-10, + "loss": 0.5304, + "step": 5343 + }, + { + "epoch": 5.977628635346756, + "grad_norm": 0.6136459112167358, + "learning_rate": 1.7808725190526788e-10, + "loss": 0.495, + "step": 5344 + }, + { + "epoch": 5.9787472035794185, + "grad_norm": 0.6465275883674622, + "learning_rate": 1.6072393089566762e-10, + "loss": 0.5324, + "step": 5345 + }, + { + "epoch": 5.97986577181208, + "grad_norm": 0.6192064881324768, + "learning_rate": 1.4425099944481713e-10, + "loss": 0.536, + "step": 5346 + }, + { + "epoch": 5.980984340044743, + "grad_norm": 0.6215797066688538, + "learning_rate": 1.2866846341968997e-10, + "loss": 0.513, + "step": 5347 + }, + { + "epoch": 5.982102908277405, + "grad_norm": 0.6283316612243652, + "learning_rate": 1.1397632837056861e-10, + "loss": 0.5165, + "step": 5348 + }, + { + "epoch": 5.983221476510067, + "grad_norm": 0.6479595899581909, + "learning_rate": 1.0017459953048924e-10, + "loss": 0.5349, + "step": 5349 + }, + { + "epoch": 5.984340044742729, + "grad_norm": 0.6275001764297485, + "learning_rate": 8.726328181551946e-11, + "loss": 0.5068, + "step": 5350 + }, + { + "epoch": 5.985458612975392, + "grad_norm": 0.6299359202384949, + "learning_rate": 7.524237982392546e-11, + "loss": 0.5095, + "step": 5351 + }, + { + "epoch": 5.9865771812080535, + "grad_norm": 0.6552029848098755, + "learning_rate": 6.411189783783744e-11, + "loss": 0.5414, + "step": 5352 + }, + { + "epoch": 5.987695749440716, + "grad_norm": 0.6292834877967834, + "learning_rate": 5.3871839821306724e-11, + "loss": 0.5241, + "step": 5353 + }, + { + "epoch": 5.988814317673378, + "grad_norm": 0.6433687210083008, + "learning_rate": 4.45222094216935e-11, + "loss": 0.5338, + "step": 5354 + }, + { + "epoch": 5.989932885906041, + "grad_norm": 0.6214168667793274, + "learning_rate": 3.606300996938927e-11, + "loss": 0.5383, + "step": 5355 + }, + { + "epoch": 5.991051454138702, + "grad_norm": 0.6196667551994324, + "learning_rate": 2.8494244476984234e-11, + "loss": 0.5377, + "step": 5356 + }, + { + "epoch": 5.992170022371365, + "grad_norm": 0.6240503191947937, + "learning_rate": 2.1815915640654993e-11, + "loss": 0.5061, + "step": 5357 + }, + { + "epoch": 5.993288590604027, + "grad_norm": 0.632733941078186, + "learning_rate": 1.6028025839054385e-11, + "loss": 0.5507, + "step": 5358 + }, + { + "epoch": 5.994407158836689, + "grad_norm": 0.6091477274894714, + "learning_rate": 1.1130577133311449e-11, + "loss": 0.5107, + "step": 5359 + }, + { + "epoch": 5.995525727069351, + "grad_norm": 0.6358879208564758, + "learning_rate": 7.123571268419227e-12, + "loss": 0.5254, + "step": 5360 + }, + { + "epoch": 5.996644295302014, + "grad_norm": 0.6285417079925537, + "learning_rate": 4.0070096710143055e-12, + "loss": 0.5151, + "step": 5361 + }, + { + "epoch": 5.997762863534676, + "grad_norm": 0.6216637492179871, + "learning_rate": 1.7808934513197096e-12, + "loss": 0.5365, + "step": 5362 + }, + { + "epoch": 5.998881431767337, + "grad_norm": 0.6240010857582092, + "learning_rate": 4.4522340258978945e-13, + "loss": 0.5231, + "step": 5363 + }, + { + "epoch": 6.0, + "grad_norm": 0.6094616651535034, + "learning_rate": 0.0, + "loss": 0.5097, + "step": 5364 + } + ], + "logging_steps": 1, + "max_steps": 5364, + "num_input_tokens_seen": 0, + "num_train_epochs": 6, + "save_steps": 894, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.2015940394895278e+19, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}