diff --git "a/checkpoint-18830/trainer_state.json" "b/checkpoint-18830/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-18830/trainer_state.json" @@ -0,0 +1,131843 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 4.9988052568697725, + "eval_steps": 500, + "global_step": 18830, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00026549847338377806, + "grad_norm": 1.4133170389245202, + "learning_rate": 5.0000000000000004e-08, + "loss": 0.9082, + "step": 1 + }, + { + "epoch": 0.0005309969467675561, + "grad_norm": 1.4044375677052006, + "learning_rate": 1.0000000000000001e-07, + "loss": 0.9046, + "step": 2 + }, + { + "epoch": 0.0007964954201513341, + "grad_norm": 1.3102037815788463, + "learning_rate": 1.5000000000000002e-07, + "loss": 0.871, + "step": 3 + }, + { + "epoch": 0.0010619938935351122, + "grad_norm": 1.3366471282054486, + "learning_rate": 2.0000000000000002e-07, + "loss": 0.8782, + "step": 4 + }, + { + "epoch": 0.0013274923669188902, + "grad_norm": 1.3635185920921395, + "learning_rate": 2.5000000000000004e-07, + "loss": 0.9146, + "step": 5 + }, + { + "epoch": 0.0015929908403026682, + "grad_norm": 1.4014578584835566, + "learning_rate": 3.0000000000000004e-07, + "loss": 0.911, + "step": 6 + }, + { + "epoch": 0.0018584893136864462, + "grad_norm": 1.3549748295314084, + "learning_rate": 3.5000000000000004e-07, + "loss": 0.8819, + "step": 7 + }, + { + "epoch": 0.0021239877870702245, + "grad_norm": 1.3135635642099046, + "learning_rate": 4.0000000000000003e-07, + "loss": 0.893, + "step": 8 + }, + { + "epoch": 0.0023894862604540022, + "grad_norm": 1.2926227219867545, + "learning_rate": 4.5000000000000003e-07, + "loss": 0.8847, + "step": 9 + }, + { + "epoch": 0.0026549847338377805, + "grad_norm": 1.251755492618311, + "learning_rate": 5.000000000000001e-07, + "loss": 0.8509, + "step": 10 + }, + { + "epoch": 0.0029204832072215587, + "grad_norm": 1.3284034155858548, + "learning_rate": 5.5e-07, + "loss": 0.8776, + "step": 11 + }, + { + "epoch": 0.0031859816806053365, + "grad_norm": 1.3011186118234537, + "learning_rate": 6.000000000000001e-07, + "loss": 0.8948, + "step": 12 + }, + { + "epoch": 0.0034514801539891147, + "grad_norm": 1.2341704129726287, + "learning_rate": 6.5e-07, + "loss": 0.8831, + "step": 13 + }, + { + "epoch": 0.0037169786273728925, + "grad_norm": 1.2514230159054196, + "learning_rate": 7.000000000000001e-07, + "loss": 0.8712, + "step": 14 + }, + { + "epoch": 0.00398247710075667, + "grad_norm": 1.1987764538870713, + "learning_rate": 7.5e-07, + "loss": 0.8823, + "step": 15 + }, + { + "epoch": 0.004247975574140449, + "grad_norm": 1.2672326543360966, + "learning_rate": 8.000000000000001e-07, + "loss": 0.8759, + "step": 16 + }, + { + "epoch": 0.004513474047524227, + "grad_norm": 1.2320004691691897, + "learning_rate": 8.500000000000001e-07, + "loss": 0.8802, + "step": 17 + }, + { + "epoch": 0.0047789725209080045, + "grad_norm": 1.1676684676908597, + "learning_rate": 9.000000000000001e-07, + "loss": 0.854, + "step": 18 + }, + { + "epoch": 0.005044470994291783, + "grad_norm": 1.1050533839495187, + "learning_rate": 9.500000000000001e-07, + "loss": 0.8827, + "step": 19 + }, + { + "epoch": 0.005309969467675561, + "grad_norm": 1.0752828509499528, + "learning_rate": 1.0000000000000002e-06, + "loss": 0.8508, + "step": 20 + }, + { + "epoch": 0.005575467941059339, + "grad_norm": 1.0120183920536465, + "learning_rate": 1.0500000000000001e-06, + "loss": 0.8603, + "step": 21 + }, + { + "epoch": 0.005840966414443117, + "grad_norm": 0.994980565363457, + "learning_rate": 1.1e-06, + "loss": 0.8676, + "step": 22 + }, + { + "epoch": 0.006106464887826895, + "grad_norm": 0.8764882048485012, + "learning_rate": 1.1500000000000002e-06, + "loss": 0.8489, + "step": 23 + }, + { + "epoch": 0.006371963361210673, + "grad_norm": 0.8550001678984946, + "learning_rate": 1.2000000000000002e-06, + "loss": 0.8422, + "step": 24 + }, + { + "epoch": 0.006637461834594451, + "grad_norm": 0.7578437022717379, + "learning_rate": 1.25e-06, + "loss": 0.8232, + "step": 25 + }, + { + "epoch": 0.006902960307978229, + "grad_norm": 0.6992717109150458, + "learning_rate": 1.3e-06, + "loss": 0.8427, + "step": 26 + }, + { + "epoch": 0.007168458781362007, + "grad_norm": 0.6689093642420254, + "learning_rate": 1.3500000000000002e-06, + "loss": 0.7961, + "step": 27 + }, + { + "epoch": 0.007433957254745785, + "grad_norm": 0.6399612312421933, + "learning_rate": 1.4000000000000001e-06, + "loss": 0.8, + "step": 28 + }, + { + "epoch": 0.007699455728129564, + "grad_norm": 0.572695626409587, + "learning_rate": 1.45e-06, + "loss": 0.7684, + "step": 29 + }, + { + "epoch": 0.00796495420151334, + "grad_norm": 0.6000840388722052, + "learning_rate": 1.5e-06, + "loss": 0.8152, + "step": 30 + }, + { + "epoch": 0.00823045267489712, + "grad_norm": 0.5831757904515951, + "learning_rate": 1.5500000000000002e-06, + "loss": 0.825, + "step": 31 + }, + { + "epoch": 0.008495951148280898, + "grad_norm": 0.5376764754780556, + "learning_rate": 1.6000000000000001e-06, + "loss": 0.7904, + "step": 32 + }, + { + "epoch": 0.008761449621664676, + "grad_norm": 0.544894260928256, + "learning_rate": 1.6500000000000003e-06, + "loss": 0.7981, + "step": 33 + }, + { + "epoch": 0.009026948095048453, + "grad_norm": 0.5044098805336641, + "learning_rate": 1.7000000000000002e-06, + "loss": 0.8049, + "step": 34 + }, + { + "epoch": 0.009292446568432231, + "grad_norm": 0.5105087348699734, + "learning_rate": 1.75e-06, + "loss": 0.8189, + "step": 35 + }, + { + "epoch": 0.009557945041816009, + "grad_norm": 0.5211090086716426, + "learning_rate": 1.8000000000000001e-06, + "loss": 0.7974, + "step": 36 + }, + { + "epoch": 0.009823443515199787, + "grad_norm": 0.5131137326253041, + "learning_rate": 1.85e-06, + "loss": 0.7662, + "step": 37 + }, + { + "epoch": 0.010088941988583566, + "grad_norm": 0.49907189290405146, + "learning_rate": 1.9000000000000002e-06, + "loss": 0.747, + "step": 38 + }, + { + "epoch": 0.010354440461967344, + "grad_norm": 0.47771973021953945, + "learning_rate": 1.9500000000000004e-06, + "loss": 0.7463, + "step": 39 + }, + { + "epoch": 0.010619938935351122, + "grad_norm": 0.4785638397033305, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.8147, + "step": 40 + }, + { + "epoch": 0.0108854374087349, + "grad_norm": 0.4558671534536234, + "learning_rate": 2.05e-06, + "loss": 0.7741, + "step": 41 + }, + { + "epoch": 0.011150935882118677, + "grad_norm": 0.44950473746412367, + "learning_rate": 2.1000000000000002e-06, + "loss": 0.7629, + "step": 42 + }, + { + "epoch": 0.011416434355502455, + "grad_norm": 0.4443003313102011, + "learning_rate": 2.15e-06, + "loss": 0.7771, + "step": 43 + }, + { + "epoch": 0.011681932828886235, + "grad_norm": 0.3910766012475798, + "learning_rate": 2.2e-06, + "loss": 0.7125, + "step": 44 + }, + { + "epoch": 0.011947431302270013, + "grad_norm": 0.4008723150350752, + "learning_rate": 2.25e-06, + "loss": 0.7306, + "step": 45 + }, + { + "epoch": 0.01221292977565379, + "grad_norm": 0.3754687761913381, + "learning_rate": 2.3000000000000004e-06, + "loss": 0.7331, + "step": 46 + }, + { + "epoch": 0.012478428249037568, + "grad_norm": 0.36958796475999844, + "learning_rate": 2.35e-06, + "loss": 0.7114, + "step": 47 + }, + { + "epoch": 0.012743926722421346, + "grad_norm": 0.36066586543548546, + "learning_rate": 2.4000000000000003e-06, + "loss": 0.7646, + "step": 48 + }, + { + "epoch": 0.013009425195805124, + "grad_norm": 0.377952015244848, + "learning_rate": 2.4500000000000003e-06, + "loss": 0.7574, + "step": 49 + }, + { + "epoch": 0.013274923669188901, + "grad_norm": 0.35033265192734125, + "learning_rate": 2.5e-06, + "loss": 0.7175, + "step": 50 + }, + { + "epoch": 0.013540422142572681, + "grad_norm": 0.32967169997783835, + "learning_rate": 2.55e-06, + "loss": 0.7142, + "step": 51 + }, + { + "epoch": 0.013805920615956459, + "grad_norm": 0.3128738432355881, + "learning_rate": 2.6e-06, + "loss": 0.772, + "step": 52 + }, + { + "epoch": 0.014071419089340237, + "grad_norm": 0.3121467711239632, + "learning_rate": 2.6500000000000005e-06, + "loss": 0.7435, + "step": 53 + }, + { + "epoch": 0.014336917562724014, + "grad_norm": 0.30406168256466826, + "learning_rate": 2.7000000000000004e-06, + "loss": 0.7307, + "step": 54 + }, + { + "epoch": 0.014602416036107792, + "grad_norm": 0.30253139985404487, + "learning_rate": 2.7500000000000004e-06, + "loss": 0.764, + "step": 55 + }, + { + "epoch": 0.01486791450949157, + "grad_norm": 0.2932645101598996, + "learning_rate": 2.8000000000000003e-06, + "loss": 0.7545, + "step": 56 + }, + { + "epoch": 0.015133412982875348, + "grad_norm": 0.28874763194548436, + "learning_rate": 2.85e-06, + "loss": 0.7204, + "step": 57 + }, + { + "epoch": 0.015398911456259127, + "grad_norm": 0.3043644175395429, + "learning_rate": 2.9e-06, + "loss": 0.7431, + "step": 58 + }, + { + "epoch": 0.015664409929642905, + "grad_norm": 0.29277216204800016, + "learning_rate": 2.95e-06, + "loss": 0.6851, + "step": 59 + }, + { + "epoch": 0.01592990840302668, + "grad_norm": 0.2999037795611065, + "learning_rate": 3e-06, + "loss": 0.7458, + "step": 60 + }, + { + "epoch": 0.01619540687641046, + "grad_norm": 0.28603682629407434, + "learning_rate": 3.05e-06, + "loss": 0.743, + "step": 61 + }, + { + "epoch": 0.01646090534979424, + "grad_norm": 0.28812321057239954, + "learning_rate": 3.1000000000000004e-06, + "loss": 0.7232, + "step": 62 + }, + { + "epoch": 0.016726403823178016, + "grad_norm": 0.2941773752215629, + "learning_rate": 3.1500000000000003e-06, + "loss": 0.7664, + "step": 63 + }, + { + "epoch": 0.016991902296561796, + "grad_norm": 0.28683774669729817, + "learning_rate": 3.2000000000000003e-06, + "loss": 0.7103, + "step": 64 + }, + { + "epoch": 0.01725740076994557, + "grad_norm": 0.268635186496577, + "learning_rate": 3.2500000000000002e-06, + "loss": 0.7105, + "step": 65 + }, + { + "epoch": 0.01752289924332935, + "grad_norm": 0.27286564198875624, + "learning_rate": 3.3000000000000006e-06, + "loss": 0.7309, + "step": 66 + }, + { + "epoch": 0.017788397716713127, + "grad_norm": 0.27685825537587966, + "learning_rate": 3.3500000000000005e-06, + "loss": 0.6991, + "step": 67 + }, + { + "epoch": 0.018053896190096907, + "grad_norm": 0.26542577831742614, + "learning_rate": 3.4000000000000005e-06, + "loss": 0.7258, + "step": 68 + }, + { + "epoch": 0.018319394663480686, + "grad_norm": 0.25455554558379656, + "learning_rate": 3.45e-06, + "loss": 0.6762, + "step": 69 + }, + { + "epoch": 0.018584893136864462, + "grad_norm": 0.26551700128237027, + "learning_rate": 3.5e-06, + "loss": 0.7118, + "step": 70 + }, + { + "epoch": 0.018850391610248242, + "grad_norm": 0.26405604223669027, + "learning_rate": 3.5500000000000003e-06, + "loss": 0.74, + "step": 71 + }, + { + "epoch": 0.019115890083632018, + "grad_norm": 0.2682710044475517, + "learning_rate": 3.6000000000000003e-06, + "loss": 0.7176, + "step": 72 + }, + { + "epoch": 0.019381388557015797, + "grad_norm": 0.26835845009771225, + "learning_rate": 3.65e-06, + "loss": 0.7033, + "step": 73 + }, + { + "epoch": 0.019646887030399574, + "grad_norm": 0.27186708180710134, + "learning_rate": 3.7e-06, + "loss": 0.7531, + "step": 74 + }, + { + "epoch": 0.019912385503783353, + "grad_norm": 0.26739347296891547, + "learning_rate": 3.7500000000000005e-06, + "loss": 0.7331, + "step": 75 + }, + { + "epoch": 0.020177883977167133, + "grad_norm": 0.2620288921624337, + "learning_rate": 3.8000000000000005e-06, + "loss": 0.6777, + "step": 76 + }, + { + "epoch": 0.02044338245055091, + "grad_norm": 0.27361490719897, + "learning_rate": 3.85e-06, + "loss": 0.766, + "step": 77 + }, + { + "epoch": 0.020708880923934688, + "grad_norm": 0.2532185746262512, + "learning_rate": 3.900000000000001e-06, + "loss": 0.7097, + "step": 78 + }, + { + "epoch": 0.020974379397318464, + "grad_norm": 0.2567082880948565, + "learning_rate": 3.95e-06, + "loss": 0.6954, + "step": 79 + }, + { + "epoch": 0.021239877870702244, + "grad_norm": 0.2566665605997135, + "learning_rate": 4.000000000000001e-06, + "loss": 0.7383, + "step": 80 + }, + { + "epoch": 0.021505376344086023, + "grad_norm": 0.25736368043811986, + "learning_rate": 4.05e-06, + "loss": 0.7464, + "step": 81 + }, + { + "epoch": 0.0217708748174698, + "grad_norm": 0.2524846582503469, + "learning_rate": 4.1e-06, + "loss": 0.7511, + "step": 82 + }, + { + "epoch": 0.02203637329085358, + "grad_norm": 0.25627792119137593, + "learning_rate": 4.15e-06, + "loss": 0.745, + "step": 83 + }, + { + "epoch": 0.022301871764237355, + "grad_norm": 0.2637730516534178, + "learning_rate": 4.2000000000000004e-06, + "loss": 0.75, + "step": 84 + }, + { + "epoch": 0.022567370237621134, + "grad_norm": 0.25510395075205117, + "learning_rate": 4.25e-06, + "loss": 0.6979, + "step": 85 + }, + { + "epoch": 0.02283286871100491, + "grad_norm": 0.2890559540691289, + "learning_rate": 4.3e-06, + "loss": 0.7075, + "step": 86 + }, + { + "epoch": 0.02309836718438869, + "grad_norm": 0.2509012922405937, + "learning_rate": 4.350000000000001e-06, + "loss": 0.7087, + "step": 87 + }, + { + "epoch": 0.02336386565777247, + "grad_norm": 0.25550464739881684, + "learning_rate": 4.4e-06, + "loss": 0.7016, + "step": 88 + }, + { + "epoch": 0.023629364131156246, + "grad_norm": 0.2556178702239233, + "learning_rate": 4.450000000000001e-06, + "loss": 0.6977, + "step": 89 + }, + { + "epoch": 0.023894862604540025, + "grad_norm": 0.24845133311886222, + "learning_rate": 4.5e-06, + "loss": 0.6902, + "step": 90 + }, + { + "epoch": 0.0241603610779238, + "grad_norm": 0.2487833268099019, + "learning_rate": 4.5500000000000005e-06, + "loss": 0.7192, + "step": 91 + }, + { + "epoch": 0.02442585955130758, + "grad_norm": 0.2523603689649045, + "learning_rate": 4.600000000000001e-06, + "loss": 0.6753, + "step": 92 + }, + { + "epoch": 0.024691358024691357, + "grad_norm": 0.24635668385065182, + "learning_rate": 4.65e-06, + "loss": 0.6986, + "step": 93 + }, + { + "epoch": 0.024956856498075136, + "grad_norm": 0.26107306703307287, + "learning_rate": 4.7e-06, + "loss": 0.7201, + "step": 94 + }, + { + "epoch": 0.025222354971458916, + "grad_norm": 0.24664587374623534, + "learning_rate": 4.75e-06, + "loss": 0.6522, + "step": 95 + }, + { + "epoch": 0.025487853444842692, + "grad_norm": 0.2520881808354904, + "learning_rate": 4.800000000000001e-06, + "loss": 0.7131, + "step": 96 + }, + { + "epoch": 0.02575335191822647, + "grad_norm": 0.25598702780366006, + "learning_rate": 4.85e-06, + "loss": 0.7467, + "step": 97 + }, + { + "epoch": 0.026018850391610247, + "grad_norm": 0.2616550173587426, + "learning_rate": 4.9000000000000005e-06, + "loss": 0.7336, + "step": 98 + }, + { + "epoch": 0.026284348864994027, + "grad_norm": 0.25891740874006586, + "learning_rate": 4.95e-06, + "loss": 0.7221, + "step": 99 + }, + { + "epoch": 0.026549847338377803, + "grad_norm": 0.26549726719468714, + "learning_rate": 5e-06, + "loss": 0.7071, + "step": 100 + }, + { + "epoch": 0.026815345811761582, + "grad_norm": 0.25874316429601724, + "learning_rate": 4.99999997562194e-06, + "loss": 0.755, + "step": 101 + }, + { + "epoch": 0.027080844285145362, + "grad_norm": 0.2511929232702433, + "learning_rate": 4.999999902487759e-06, + "loss": 0.7244, + "step": 102 + }, + { + "epoch": 0.027346342758529138, + "grad_norm": 0.2558497677742771, + "learning_rate": 4.999999780597459e-06, + "loss": 0.7185, + "step": 103 + }, + { + "epoch": 0.027611841231912918, + "grad_norm": 0.25423090267132625, + "learning_rate": 4.999999609951042e-06, + "loss": 0.7142, + "step": 104 + }, + { + "epoch": 0.027877339705296694, + "grad_norm": 0.25520571574035367, + "learning_rate": 4.9999993905485115e-06, + "loss": 0.7254, + "step": 105 + }, + { + "epoch": 0.028142838178680473, + "grad_norm": 0.2577386731233315, + "learning_rate": 4.999999122389871e-06, + "loss": 0.7032, + "step": 106 + }, + { + "epoch": 0.02840833665206425, + "grad_norm": 0.24645487363162205, + "learning_rate": 4.999998805475128e-06, + "loss": 0.677, + "step": 107 + }, + { + "epoch": 0.02867383512544803, + "grad_norm": 0.2505991297766978, + "learning_rate": 4.999998439804287e-06, + "loss": 0.7279, + "step": 108 + }, + { + "epoch": 0.028939333598831808, + "grad_norm": 0.24612817265821374, + "learning_rate": 4.999998025377354e-06, + "loss": 0.6821, + "step": 109 + }, + { + "epoch": 0.029204832072215584, + "grad_norm": 0.2559063653494428, + "learning_rate": 4.99999756219434e-06, + "loss": 0.7362, + "step": 110 + }, + { + "epoch": 0.029470330545599364, + "grad_norm": 0.2529304236380953, + "learning_rate": 4.999997050255252e-06, + "loss": 0.7276, + "step": 111 + }, + { + "epoch": 0.02973582901898314, + "grad_norm": 0.25661217078094695, + "learning_rate": 4.9999964895601e-06, + "loss": 0.7101, + "step": 112 + }, + { + "epoch": 0.03000132749236692, + "grad_norm": 0.25593901891463355, + "learning_rate": 4.999995880108896e-06, + "loss": 0.7209, + "step": 113 + }, + { + "epoch": 0.030266825965750695, + "grad_norm": 0.2468092183814833, + "learning_rate": 4.999995221901651e-06, + "loss": 0.674, + "step": 114 + }, + { + "epoch": 0.030532324439134475, + "grad_norm": 0.25989065906974634, + "learning_rate": 4.9999945149383785e-06, + "loss": 0.6863, + "step": 115 + }, + { + "epoch": 0.030797822912518254, + "grad_norm": 0.2501700260472302, + "learning_rate": 4.999993759219091e-06, + "loss": 0.7129, + "step": 116 + }, + { + "epoch": 0.03106332138590203, + "grad_norm": 0.2477100927680114, + "learning_rate": 4.999992954743805e-06, + "loss": 0.6745, + "step": 117 + }, + { + "epoch": 0.03132881985928581, + "grad_norm": 0.25731972201226244, + "learning_rate": 4.999992101512536e-06, + "loss": 0.7153, + "step": 118 + }, + { + "epoch": 0.03159431833266959, + "grad_norm": 0.25648861667989575, + "learning_rate": 4.999991199525299e-06, + "loss": 0.6942, + "step": 119 + }, + { + "epoch": 0.03185981680605336, + "grad_norm": 0.25543098235740036, + "learning_rate": 4.999990248782113e-06, + "loss": 0.7018, + "step": 120 + }, + { + "epoch": 0.03212531527943714, + "grad_norm": 0.2573491732459776, + "learning_rate": 4.999989249282995e-06, + "loss": 0.721, + "step": 121 + }, + { + "epoch": 0.03239081375282092, + "grad_norm": 0.24897639401490088, + "learning_rate": 4.999988201027967e-06, + "loss": 0.6665, + "step": 122 + }, + { + "epoch": 0.0326563122262047, + "grad_norm": 0.25353835471321695, + "learning_rate": 4.999987104017048e-06, + "loss": 0.6535, + "step": 123 + }, + { + "epoch": 0.03292181069958848, + "grad_norm": 0.25324921439833153, + "learning_rate": 4.999985958250259e-06, + "loss": 0.6652, + "step": 124 + }, + { + "epoch": 0.03318730917297225, + "grad_norm": 0.2531800167423538, + "learning_rate": 4.999984763727622e-06, + "loss": 0.6899, + "step": 125 + }, + { + "epoch": 0.03345280764635603, + "grad_norm": 0.25245484453871886, + "learning_rate": 4.999983520449162e-06, + "loss": 0.7255, + "step": 126 + }, + { + "epoch": 0.03371830611973981, + "grad_norm": 0.24636543509983638, + "learning_rate": 4.999982228414902e-06, + "loss": 0.7039, + "step": 127 + }, + { + "epoch": 0.03398380459312359, + "grad_norm": 0.2596679023066606, + "learning_rate": 4.999980887624868e-06, + "loss": 0.7585, + "step": 128 + }, + { + "epoch": 0.03424930306650737, + "grad_norm": 0.2544471820006133, + "learning_rate": 4.999979498079085e-06, + "loss": 0.7226, + "step": 129 + }, + { + "epoch": 0.03451480153989114, + "grad_norm": 0.25541840400438764, + "learning_rate": 4.999978059777582e-06, + "loss": 0.7151, + "step": 130 + }, + { + "epoch": 0.03478030001327492, + "grad_norm": 0.25798389533674054, + "learning_rate": 4.999976572720384e-06, + "loss": 0.7209, + "step": 131 + }, + { + "epoch": 0.0350457984866587, + "grad_norm": 0.25504242933602034, + "learning_rate": 4.999975036907523e-06, + "loss": 0.7069, + "step": 132 + }, + { + "epoch": 0.03531129696004248, + "grad_norm": 0.2509343509012928, + "learning_rate": 4.999973452339028e-06, + "loss": 0.7057, + "step": 133 + }, + { + "epoch": 0.035576795433426255, + "grad_norm": 0.2513894881269105, + "learning_rate": 4.99997181901493e-06, + "loss": 0.7033, + "step": 134 + }, + { + "epoch": 0.035842293906810034, + "grad_norm": 0.2584358731096311, + "learning_rate": 4.9999701369352595e-06, + "loss": 0.6961, + "step": 135 + }, + { + "epoch": 0.036107792380193814, + "grad_norm": 0.25256173256109765, + "learning_rate": 4.99996840610005e-06, + "loss": 0.7059, + "step": 136 + }, + { + "epoch": 0.03637329085357759, + "grad_norm": 0.27139559456131557, + "learning_rate": 4.999966626509336e-06, + "loss": 0.7153, + "step": 137 + }, + { + "epoch": 0.03663878932696137, + "grad_norm": 0.26317533069134746, + "learning_rate": 4.999964798163153e-06, + "loss": 0.728, + "step": 138 + }, + { + "epoch": 0.036904287800345145, + "grad_norm": 0.2523361779869315, + "learning_rate": 4.9999629210615345e-06, + "loss": 0.6856, + "step": 139 + }, + { + "epoch": 0.037169786273728925, + "grad_norm": 0.25623775469647725, + "learning_rate": 4.999960995204518e-06, + "loss": 0.6928, + "step": 140 + }, + { + "epoch": 0.037435284747112704, + "grad_norm": 0.2606127602795901, + "learning_rate": 4.999959020592141e-06, + "loss": 0.7049, + "step": 141 + }, + { + "epoch": 0.037700783220496484, + "grad_norm": 0.25596727411162024, + "learning_rate": 4.999956997224443e-06, + "loss": 0.7175, + "step": 142 + }, + { + "epoch": 0.03796628169388026, + "grad_norm": 0.26383740632476566, + "learning_rate": 4.999954925101462e-06, + "loss": 0.6857, + "step": 143 + }, + { + "epoch": 0.038231780167264036, + "grad_norm": 0.26891725396566324, + "learning_rate": 4.99995280422324e-06, + "loss": 0.6928, + "step": 144 + }, + { + "epoch": 0.038497278640647815, + "grad_norm": 0.24704940977635542, + "learning_rate": 4.999950634589815e-06, + "loss": 0.6439, + "step": 145 + }, + { + "epoch": 0.038762777114031595, + "grad_norm": 0.25891770187188756, + "learning_rate": 4.9999484162012335e-06, + "loss": 0.7084, + "step": 146 + }, + { + "epoch": 0.039028275587415374, + "grad_norm": 0.2747765622828544, + "learning_rate": 4.999946149057536e-06, + "loss": 0.71, + "step": 147 + }, + { + "epoch": 0.03929377406079915, + "grad_norm": 0.2629632566296953, + "learning_rate": 4.999943833158769e-06, + "loss": 0.7176, + "step": 148 + }, + { + "epoch": 0.03955927253418293, + "grad_norm": 0.26013386317185416, + "learning_rate": 4.999941468504975e-06, + "loss": 0.7096, + "step": 149 + }, + { + "epoch": 0.039824771007566706, + "grad_norm": 0.2602432005734629, + "learning_rate": 4.999939055096203e-06, + "loss": 0.7376, + "step": 150 + }, + { + "epoch": 0.040090269480950486, + "grad_norm": 0.255755143030168, + "learning_rate": 4.999936592932497e-06, + "loss": 0.7107, + "step": 151 + }, + { + "epoch": 0.040355767954334265, + "grad_norm": 0.26459441298069336, + "learning_rate": 4.9999340820139065e-06, + "loss": 0.6986, + "step": 152 + }, + { + "epoch": 0.04062126642771804, + "grad_norm": 0.2580175240666269, + "learning_rate": 4.999931522340482e-06, + "loss": 0.6941, + "step": 153 + }, + { + "epoch": 0.04088676490110182, + "grad_norm": 0.2548066104486532, + "learning_rate": 4.999928913912271e-06, + "loss": 0.7073, + "step": 154 + }, + { + "epoch": 0.0411522633744856, + "grad_norm": 0.2566359870885854, + "learning_rate": 4.999926256729325e-06, + "loss": 0.695, + "step": 155 + }, + { + "epoch": 0.041417761847869376, + "grad_norm": 0.2549048866184483, + "learning_rate": 4.9999235507916966e-06, + "loss": 0.6859, + "step": 156 + }, + { + "epoch": 0.041683260321253156, + "grad_norm": 0.2632131747724388, + "learning_rate": 4.999920796099438e-06, + "loss": 0.7316, + "step": 157 + }, + { + "epoch": 0.04194875879463693, + "grad_norm": 0.2545162111395541, + "learning_rate": 4.9999179926526035e-06, + "loss": 0.7016, + "step": 158 + }, + { + "epoch": 0.04221425726802071, + "grad_norm": 0.2705893544135468, + "learning_rate": 4.999915140451245e-06, + "loss": 0.7202, + "step": 159 + }, + { + "epoch": 0.04247975574140449, + "grad_norm": 0.26185974573938753, + "learning_rate": 4.999912239495424e-06, + "loss": 0.6786, + "step": 160 + }, + { + "epoch": 0.04274525421478827, + "grad_norm": 0.26128486781124644, + "learning_rate": 4.9999092897851905e-06, + "loss": 0.69, + "step": 161 + }, + { + "epoch": 0.043010752688172046, + "grad_norm": 0.27173877465739715, + "learning_rate": 4.9999062913206065e-06, + "loss": 0.7127, + "step": 162 + }, + { + "epoch": 0.04327625116155582, + "grad_norm": 0.2623584024641717, + "learning_rate": 4.999903244101728e-06, + "loss": 0.679, + "step": 163 + }, + { + "epoch": 0.0435417496349396, + "grad_norm": 0.26064668777175376, + "learning_rate": 4.999900148128617e-06, + "loss": 0.7104, + "step": 164 + }, + { + "epoch": 0.04380724810832338, + "grad_norm": 0.2620858617788341, + "learning_rate": 4.999897003401331e-06, + "loss": 0.7191, + "step": 165 + }, + { + "epoch": 0.04407274658170716, + "grad_norm": 0.2592932893814249, + "learning_rate": 4.999893809919934e-06, + "loss": 0.6995, + "step": 166 + }, + { + "epoch": 0.04433824505509093, + "grad_norm": 0.26211056697252927, + "learning_rate": 4.999890567684485e-06, + "loss": 0.6786, + "step": 167 + }, + { + "epoch": 0.04460374352847471, + "grad_norm": 0.26822117591960615, + "learning_rate": 4.99988727669505e-06, + "loss": 0.7037, + "step": 168 + }, + { + "epoch": 0.04486924200185849, + "grad_norm": 0.2717335949331894, + "learning_rate": 4.999883936951693e-06, + "loss": 0.6964, + "step": 169 + }, + { + "epoch": 0.04513474047524227, + "grad_norm": 0.26646268613952157, + "learning_rate": 4.999880548454478e-06, + "loss": 0.6981, + "step": 170 + }, + { + "epoch": 0.04540023894862605, + "grad_norm": 0.26895211181167006, + "learning_rate": 4.999877111203471e-06, + "loss": 0.7185, + "step": 171 + }, + { + "epoch": 0.04566573742200982, + "grad_norm": 0.26596194964860426, + "learning_rate": 4.99987362519874e-06, + "loss": 0.6934, + "step": 172 + }, + { + "epoch": 0.0459312358953936, + "grad_norm": 0.2638074081092255, + "learning_rate": 4.999870090440352e-06, + "loss": 0.6726, + "step": 173 + }, + { + "epoch": 0.04619673436877738, + "grad_norm": 0.2685766772130088, + "learning_rate": 4.999866506928377e-06, + "loss": 0.7, + "step": 174 + }, + { + "epoch": 0.04646223284216116, + "grad_norm": 0.26125994565790756, + "learning_rate": 4.999862874662885e-06, + "loss": 0.6729, + "step": 175 + }, + { + "epoch": 0.04672773131554494, + "grad_norm": 0.2585324922913285, + "learning_rate": 4.999859193643946e-06, + "loss": 0.6717, + "step": 176 + }, + { + "epoch": 0.04699322978892871, + "grad_norm": 0.2719323796706383, + "learning_rate": 4.999855463871631e-06, + "loss": 0.7109, + "step": 177 + }, + { + "epoch": 0.04725872826231249, + "grad_norm": 0.28112974850028344, + "learning_rate": 4.999851685346015e-06, + "loss": 0.7035, + "step": 178 + }, + { + "epoch": 0.04752422673569627, + "grad_norm": 0.263044613805147, + "learning_rate": 4.9998478580671694e-06, + "loss": 0.6709, + "step": 179 + }, + { + "epoch": 0.04778972520908005, + "grad_norm": 0.26340405459611804, + "learning_rate": 4.99984398203517e-06, + "loss": 0.6843, + "step": 180 + }, + { + "epoch": 0.04805522368246382, + "grad_norm": 0.2717026054338667, + "learning_rate": 4.999840057250092e-06, + "loss": 0.7074, + "step": 181 + }, + { + "epoch": 0.0483207221558476, + "grad_norm": 0.26018460501123286, + "learning_rate": 4.999836083712014e-06, + "loss": 0.6874, + "step": 182 + }, + { + "epoch": 0.04858622062923138, + "grad_norm": 0.2962560946041906, + "learning_rate": 4.99983206142101e-06, + "loss": 0.6984, + "step": 183 + }, + { + "epoch": 0.04885171910261516, + "grad_norm": 0.26542573047174617, + "learning_rate": 4.999827990377161e-06, + "loss": 0.6746, + "step": 184 + }, + { + "epoch": 0.04911721757599894, + "grad_norm": 0.2627478069344486, + "learning_rate": 4.999823870580545e-06, + "loss": 0.6983, + "step": 185 + }, + { + "epoch": 0.04938271604938271, + "grad_norm": 0.2695317200701367, + "learning_rate": 4.999819702031243e-06, + "loss": 0.6904, + "step": 186 + }, + { + "epoch": 0.04964821452276649, + "grad_norm": 0.2692151455364245, + "learning_rate": 4.999815484729337e-06, + "loss": 0.7121, + "step": 187 + }, + { + "epoch": 0.04991371299615027, + "grad_norm": 0.2670591361605896, + "learning_rate": 4.999811218674907e-06, + "loss": 0.6779, + "step": 188 + }, + { + "epoch": 0.05017921146953405, + "grad_norm": 0.2665842036825122, + "learning_rate": 4.999806903868039e-06, + "loss": 0.6778, + "step": 189 + }, + { + "epoch": 0.05044470994291783, + "grad_norm": 0.26415858006089876, + "learning_rate": 4.999802540308815e-06, + "loss": 0.6705, + "step": 190 + }, + { + "epoch": 0.050710208416301604, + "grad_norm": 0.2699353100740373, + "learning_rate": 4.999798127997322e-06, + "loss": 0.6625, + "step": 191 + }, + { + "epoch": 0.050975706889685383, + "grad_norm": 0.276360043882742, + "learning_rate": 4.999793666933643e-06, + "loss": 0.7146, + "step": 192 + }, + { + "epoch": 0.05124120536306916, + "grad_norm": 0.27731778712619215, + "learning_rate": 4.999789157117868e-06, + "loss": 0.7244, + "step": 193 + }, + { + "epoch": 0.05150670383645294, + "grad_norm": 0.28021136346995323, + "learning_rate": 4.999784598550085e-06, + "loss": 0.7223, + "step": 194 + }, + { + "epoch": 0.051772202309836715, + "grad_norm": 0.27363447721763196, + "learning_rate": 4.99977999123038e-06, + "loss": 0.6751, + "step": 195 + }, + { + "epoch": 0.052037700783220495, + "grad_norm": 0.27293263360243547, + "learning_rate": 4.999775335158845e-06, + "loss": 0.666, + "step": 196 + }, + { + "epoch": 0.052303199256604274, + "grad_norm": 0.26892809104063314, + "learning_rate": 4.999770630335571e-06, + "loss": 0.6627, + "step": 197 + }, + { + "epoch": 0.052568697729988054, + "grad_norm": 0.2655011939742062, + "learning_rate": 4.999765876760649e-06, + "loss": 0.6376, + "step": 198 + }, + { + "epoch": 0.05283419620337183, + "grad_norm": 0.27378030490933, + "learning_rate": 4.999761074434172e-06, + "loss": 0.699, + "step": 199 + }, + { + "epoch": 0.053099694676755606, + "grad_norm": 0.2797580892313466, + "learning_rate": 4.999756223356233e-06, + "loss": 0.6817, + "step": 200 + }, + { + "epoch": 0.053365193150139385, + "grad_norm": 0.27676547571532695, + "learning_rate": 4.999751323526928e-06, + "loss": 0.7059, + "step": 201 + }, + { + "epoch": 0.053630691623523165, + "grad_norm": 0.2800957514600047, + "learning_rate": 4.999746374946351e-06, + "loss": 0.7007, + "step": 202 + }, + { + "epoch": 0.053896190096906944, + "grad_norm": 0.2658748217580449, + "learning_rate": 4.9997413776146006e-06, + "loss": 0.7133, + "step": 203 + }, + { + "epoch": 0.054161688570290724, + "grad_norm": 0.27165720985917746, + "learning_rate": 4.999736331531772e-06, + "loss": 0.7057, + "step": 204 + }, + { + "epoch": 0.054427187043674496, + "grad_norm": 0.27110457882180444, + "learning_rate": 4.9997312366979644e-06, + "loss": 0.6923, + "step": 205 + }, + { + "epoch": 0.054692685517058276, + "grad_norm": 0.26608928525249387, + "learning_rate": 4.999726093113277e-06, + "loss": 0.6808, + "step": 206 + }, + { + "epoch": 0.054958183990442055, + "grad_norm": 0.27284820287434014, + "learning_rate": 4.9997209007778115e-06, + "loss": 0.6769, + "step": 207 + }, + { + "epoch": 0.055223682463825835, + "grad_norm": 0.2754244175594125, + "learning_rate": 4.9997156596916675e-06, + "loss": 0.6949, + "step": 208 + }, + { + "epoch": 0.05548918093720961, + "grad_norm": 0.280054498798005, + "learning_rate": 4.999710369854948e-06, + "loss": 0.7043, + "step": 209 + }, + { + "epoch": 0.05575467941059339, + "grad_norm": 0.27321589443694444, + "learning_rate": 4.9997050312677555e-06, + "loss": 0.6894, + "step": 210 + }, + { + "epoch": 0.05602017788397717, + "grad_norm": 0.2783392162014966, + "learning_rate": 4.999699643930194e-06, + "loss": 0.6887, + "step": 211 + }, + { + "epoch": 0.056285676357360946, + "grad_norm": 0.27440601906566764, + "learning_rate": 4.99969420784237e-06, + "loss": 0.6768, + "step": 212 + }, + { + "epoch": 0.056551174830744726, + "grad_norm": 0.2734307791646763, + "learning_rate": 4.999688723004388e-06, + "loss": 0.6838, + "step": 213 + }, + { + "epoch": 0.0568166733041285, + "grad_norm": 0.27394668345734086, + "learning_rate": 4.999683189416357e-06, + "loss": 0.6711, + "step": 214 + }, + { + "epoch": 0.05708217177751228, + "grad_norm": 0.28115025983760983, + "learning_rate": 4.999677607078382e-06, + "loss": 0.7068, + "step": 215 + }, + { + "epoch": 0.05734767025089606, + "grad_norm": 0.26899994312737147, + "learning_rate": 4.999671975990574e-06, + "loss": 0.6776, + "step": 216 + }, + { + "epoch": 0.05761316872427984, + "grad_norm": 0.2817314356169039, + "learning_rate": 4.999666296153042e-06, + "loss": 0.7044, + "step": 217 + }, + { + "epoch": 0.057878667197663616, + "grad_norm": 0.27902869905165584, + "learning_rate": 4.999660567565897e-06, + "loss": 0.6744, + "step": 218 + }, + { + "epoch": 0.05814416567104739, + "grad_norm": 0.27022592081224195, + "learning_rate": 4.9996547902292514e-06, + "loss": 0.7199, + "step": 219 + }, + { + "epoch": 0.05840966414443117, + "grad_norm": 0.27208455882650584, + "learning_rate": 4.999648964143217e-06, + "loss": 0.6908, + "step": 220 + }, + { + "epoch": 0.05867516261781495, + "grad_norm": 0.2737163124143047, + "learning_rate": 4.999643089307907e-06, + "loss": 0.6652, + "step": 221 + }, + { + "epoch": 0.05894066109119873, + "grad_norm": 0.2688280646468157, + "learning_rate": 4.999637165723437e-06, + "loss": 0.705, + "step": 222 + }, + { + "epoch": 0.05920615956458251, + "grad_norm": 0.26927072313433303, + "learning_rate": 4.999631193389922e-06, + "loss": 0.6798, + "step": 223 + }, + { + "epoch": 0.05947165803796628, + "grad_norm": 0.26469848070271984, + "learning_rate": 4.999625172307479e-06, + "loss": 0.6719, + "step": 224 + }, + { + "epoch": 0.05973715651135006, + "grad_norm": 0.27456219654052877, + "learning_rate": 4.9996191024762235e-06, + "loss": 0.6746, + "step": 225 + }, + { + "epoch": 0.06000265498473384, + "grad_norm": 0.27871327711391836, + "learning_rate": 4.999612983896277e-06, + "loss": 0.7, + "step": 226 + }, + { + "epoch": 0.06026815345811762, + "grad_norm": 0.27110710717724956, + "learning_rate": 4.999606816567756e-06, + "loss": 0.6953, + "step": 227 + }, + { + "epoch": 0.06053365193150139, + "grad_norm": 0.2755922210275337, + "learning_rate": 4.999600600490783e-06, + "loss": 0.6938, + "step": 228 + }, + { + "epoch": 0.06079915040488517, + "grad_norm": 0.28005633334714464, + "learning_rate": 4.999594335665477e-06, + "loss": 0.7015, + "step": 229 + }, + { + "epoch": 0.06106464887826895, + "grad_norm": 0.2825554641299085, + "learning_rate": 4.999588022091963e-06, + "loss": 0.6875, + "step": 230 + }, + { + "epoch": 0.06133014735165273, + "grad_norm": 0.28230308518275543, + "learning_rate": 4.999581659770362e-06, + "loss": 0.674, + "step": 231 + }, + { + "epoch": 0.06159564582503651, + "grad_norm": 0.28030786780792255, + "learning_rate": 4.999575248700797e-06, + "loss": 0.6727, + "step": 232 + }, + { + "epoch": 0.06186114429842028, + "grad_norm": 0.28322905286732064, + "learning_rate": 4.999568788883396e-06, + "loss": 0.6759, + "step": 233 + }, + { + "epoch": 0.06212664277180406, + "grad_norm": 0.28063162337237035, + "learning_rate": 4.999562280318284e-06, + "loss": 0.6507, + "step": 234 + }, + { + "epoch": 0.06239214124518784, + "grad_norm": 0.2803099234303004, + "learning_rate": 4.999555723005588e-06, + "loss": 0.6762, + "step": 235 + }, + { + "epoch": 0.06265763971857162, + "grad_norm": 0.29590886163225577, + "learning_rate": 4.9995491169454345e-06, + "loss": 0.7273, + "step": 236 + }, + { + "epoch": 0.0629231381919554, + "grad_norm": 0.2824885396276655, + "learning_rate": 4.999542462137953e-06, + "loss": 0.6792, + "step": 237 + }, + { + "epoch": 0.06318863666533918, + "grad_norm": 0.2816431483123683, + "learning_rate": 4.999535758583274e-06, + "loss": 0.69, + "step": 238 + }, + { + "epoch": 0.06345413513872296, + "grad_norm": 0.2884562386922022, + "learning_rate": 4.999529006281527e-06, + "loss": 0.7017, + "step": 239 + }, + { + "epoch": 0.06371963361210672, + "grad_norm": 0.28273311581933935, + "learning_rate": 4.999522205232846e-06, + "loss": 0.7186, + "step": 240 + }, + { + "epoch": 0.0639851320854905, + "grad_norm": 0.2783167115742715, + "learning_rate": 4.999515355437361e-06, + "loss": 0.6668, + "step": 241 + }, + { + "epoch": 0.06425063055887428, + "grad_norm": 0.2816825131413215, + "learning_rate": 4.999508456895207e-06, + "loss": 0.6843, + "step": 242 + }, + { + "epoch": 0.06451612903225806, + "grad_norm": 0.2801353687084795, + "learning_rate": 4.999501509606518e-06, + "loss": 0.6968, + "step": 243 + }, + { + "epoch": 0.06478162750564184, + "grad_norm": 0.2835734150455514, + "learning_rate": 4.9994945135714305e-06, + "loss": 0.6862, + "step": 244 + }, + { + "epoch": 0.06504712597902562, + "grad_norm": 0.27741725505020737, + "learning_rate": 4.999487468790079e-06, + "loss": 0.6647, + "step": 245 + }, + { + "epoch": 0.0653126244524094, + "grad_norm": 0.26681654360115015, + "learning_rate": 4.999480375262603e-06, + "loss": 0.643, + "step": 246 + }, + { + "epoch": 0.06557812292579318, + "grad_norm": 0.2789170195456692, + "learning_rate": 4.999473232989139e-06, + "loss": 0.7272, + "step": 247 + }, + { + "epoch": 0.06584362139917696, + "grad_norm": 0.27742709910456664, + "learning_rate": 4.999466041969828e-06, + "loss": 0.7021, + "step": 248 + }, + { + "epoch": 0.06610911987256073, + "grad_norm": 0.2911592528989723, + "learning_rate": 4.999458802204809e-06, + "loss": 0.7011, + "step": 249 + }, + { + "epoch": 0.0663746183459445, + "grad_norm": 0.2704968024069899, + "learning_rate": 4.999451513694224e-06, + "loss": 0.6421, + "step": 250 + }, + { + "epoch": 0.06664011681932829, + "grad_norm": 0.28089468200732376, + "learning_rate": 4.999444176438214e-06, + "loss": 0.7055, + "step": 251 + }, + { + "epoch": 0.06690561529271206, + "grad_norm": 0.2767204642624905, + "learning_rate": 4.999436790436924e-06, + "loss": 0.6503, + "step": 252 + }, + { + "epoch": 0.06717111376609584, + "grad_norm": 0.2922773807732511, + "learning_rate": 4.999429355690496e-06, + "loss": 0.7075, + "step": 253 + }, + { + "epoch": 0.06743661223947962, + "grad_norm": 0.2825676930712853, + "learning_rate": 4.999421872199076e-06, + "loss": 0.6934, + "step": 254 + }, + { + "epoch": 0.0677021107128634, + "grad_norm": 0.2875336732999721, + "learning_rate": 4.99941433996281e-06, + "loss": 0.6761, + "step": 255 + }, + { + "epoch": 0.06796760918624718, + "grad_norm": 0.29005402811931774, + "learning_rate": 4.999406758981845e-06, + "loss": 0.6926, + "step": 256 + }, + { + "epoch": 0.06823310765963096, + "grad_norm": 0.28761923552882396, + "learning_rate": 4.99939912925633e-06, + "loss": 0.7085, + "step": 257 + }, + { + "epoch": 0.06849860613301474, + "grad_norm": 0.28727780363272737, + "learning_rate": 4.999391450786409e-06, + "loss": 0.6802, + "step": 258 + }, + { + "epoch": 0.06876410460639851, + "grad_norm": 0.28205517649545725, + "learning_rate": 4.999383723572238e-06, + "loss": 0.6708, + "step": 259 + }, + { + "epoch": 0.06902960307978229, + "grad_norm": 0.28491849512760403, + "learning_rate": 4.999375947613963e-06, + "loss": 0.6706, + "step": 260 + }, + { + "epoch": 0.06929510155316607, + "grad_norm": 0.2829201389826739, + "learning_rate": 4.999368122911739e-06, + "loss": 0.6926, + "step": 261 + }, + { + "epoch": 0.06956060002654985, + "grad_norm": 0.2878773147526592, + "learning_rate": 4.9993602494657166e-06, + "loss": 0.6837, + "step": 262 + }, + { + "epoch": 0.06982609849993363, + "grad_norm": 0.28952392627424006, + "learning_rate": 4.999352327276049e-06, + "loss": 0.6871, + "step": 263 + }, + { + "epoch": 0.0700915969733174, + "grad_norm": 0.2780781868581628, + "learning_rate": 4.999344356342893e-06, + "loss": 0.6665, + "step": 264 + }, + { + "epoch": 0.07035709544670118, + "grad_norm": 0.2837049791196424, + "learning_rate": 4.999336336666402e-06, + "loss": 0.6594, + "step": 265 + }, + { + "epoch": 0.07062259392008496, + "grad_norm": 0.2821916104605583, + "learning_rate": 4.999328268246732e-06, + "loss": 0.7093, + "step": 266 + }, + { + "epoch": 0.07088809239346874, + "grad_norm": 0.2837950250863991, + "learning_rate": 4.999320151084043e-06, + "loss": 0.6961, + "step": 267 + }, + { + "epoch": 0.07115359086685251, + "grad_norm": 0.2832447193649773, + "learning_rate": 4.99931198517849e-06, + "loss": 0.703, + "step": 268 + }, + { + "epoch": 0.07141908934023629, + "grad_norm": 0.2812061141776504, + "learning_rate": 4.999303770530235e-06, + "loss": 0.6981, + "step": 269 + }, + { + "epoch": 0.07168458781362007, + "grad_norm": 0.26950781781942745, + "learning_rate": 4.9992955071394365e-06, + "loss": 0.6412, + "step": 270 + }, + { + "epoch": 0.07195008628700385, + "grad_norm": 0.28606325968475227, + "learning_rate": 4.999287195006257e-06, + "loss": 0.6802, + "step": 271 + }, + { + "epoch": 0.07221558476038763, + "grad_norm": 0.28217180124962293, + "learning_rate": 4.999278834130857e-06, + "loss": 0.6704, + "step": 272 + }, + { + "epoch": 0.0724810832337714, + "grad_norm": 0.2799545605253567, + "learning_rate": 4.999270424513402e-06, + "loss": 0.6739, + "step": 273 + }, + { + "epoch": 0.07274658170715519, + "grad_norm": 0.2948366125568834, + "learning_rate": 4.999261966154053e-06, + "loss": 0.6996, + "step": 274 + }, + { + "epoch": 0.07301208018053897, + "grad_norm": 0.28526141854662124, + "learning_rate": 4.999253459052977e-06, + "loss": 0.6786, + "step": 275 + }, + { + "epoch": 0.07327757865392275, + "grad_norm": 0.2851150240842429, + "learning_rate": 4.99924490321034e-06, + "loss": 0.6933, + "step": 276 + }, + { + "epoch": 0.07354307712730652, + "grad_norm": 0.28820089750565403, + "learning_rate": 4.999236298626307e-06, + "loss": 0.681, + "step": 277 + }, + { + "epoch": 0.07380857560069029, + "grad_norm": 0.2846131009168272, + "learning_rate": 4.999227645301047e-06, + "loss": 0.6672, + "step": 278 + }, + { + "epoch": 0.07407407407407407, + "grad_norm": 0.2928403434083448, + "learning_rate": 4.999218943234729e-06, + "loss": 0.6856, + "step": 279 + }, + { + "epoch": 0.07433957254745785, + "grad_norm": 0.29824152148219807, + "learning_rate": 4.999210192427523e-06, + "loss": 0.6768, + "step": 280 + }, + { + "epoch": 0.07460507102084163, + "grad_norm": 0.2840864286946336, + "learning_rate": 4.999201392879598e-06, + "loss": 0.6833, + "step": 281 + }, + { + "epoch": 0.07487056949422541, + "grad_norm": 0.28658971140766376, + "learning_rate": 4.999192544591128e-06, + "loss": 0.6767, + "step": 282 + }, + { + "epoch": 0.07513606796760919, + "grad_norm": 0.28735745839725857, + "learning_rate": 4.999183647562283e-06, + "loss": 0.6775, + "step": 283 + }, + { + "epoch": 0.07540156644099297, + "grad_norm": 0.28250300851086513, + "learning_rate": 4.999174701793239e-06, + "loss": 0.6529, + "step": 284 + }, + { + "epoch": 0.07566706491437675, + "grad_norm": 0.2872554066283219, + "learning_rate": 4.999165707284169e-06, + "loss": 0.6548, + "step": 285 + }, + { + "epoch": 0.07593256338776053, + "grad_norm": 0.29246682414100555, + "learning_rate": 4.999156664035248e-06, + "loss": 0.6973, + "step": 286 + }, + { + "epoch": 0.07619806186114429, + "grad_norm": 0.2900221113157214, + "learning_rate": 4.999147572046654e-06, + "loss": 0.6885, + "step": 287 + }, + { + "epoch": 0.07646356033452807, + "grad_norm": 0.2944016883960358, + "learning_rate": 4.9991384313185635e-06, + "loss": 0.6914, + "step": 288 + }, + { + "epoch": 0.07672905880791185, + "grad_norm": 0.2878252740936143, + "learning_rate": 4.9991292418511536e-06, + "loss": 0.6938, + "step": 289 + }, + { + "epoch": 0.07699455728129563, + "grad_norm": 0.2929662685483102, + "learning_rate": 4.999120003644606e-06, + "loss": 0.6996, + "step": 290 + }, + { + "epoch": 0.07726005575467941, + "grad_norm": 0.3000498782620618, + "learning_rate": 4.999110716699097e-06, + "loss": 0.6668, + "step": 291 + }, + { + "epoch": 0.07752555422806319, + "grad_norm": 0.2935909465153773, + "learning_rate": 4.999101381014813e-06, + "loss": 0.6538, + "step": 292 + }, + { + "epoch": 0.07779105270144697, + "grad_norm": 0.29460974819047786, + "learning_rate": 4.999091996591931e-06, + "loss": 0.6922, + "step": 293 + }, + { + "epoch": 0.07805655117483075, + "grad_norm": 0.2888230041157458, + "learning_rate": 4.999082563430637e-06, + "loss": 0.6873, + "step": 294 + }, + { + "epoch": 0.07832204964821453, + "grad_norm": 0.2946906069535359, + "learning_rate": 4.9990730815311144e-06, + "loss": 0.6367, + "step": 295 + }, + { + "epoch": 0.0785875481215983, + "grad_norm": 0.29255274301221523, + "learning_rate": 4.999063550893548e-06, + "loss": 0.6142, + "step": 296 + }, + { + "epoch": 0.07885304659498207, + "grad_norm": 0.2932435372434208, + "learning_rate": 4.999053971518123e-06, + "loss": 0.6708, + "step": 297 + }, + { + "epoch": 0.07911854506836585, + "grad_norm": 0.29885523057090346, + "learning_rate": 4.999044343405027e-06, + "loss": 0.6917, + "step": 298 + }, + { + "epoch": 0.07938404354174963, + "grad_norm": 0.295942948386189, + "learning_rate": 4.999034666554448e-06, + "loss": 0.6771, + "step": 299 + }, + { + "epoch": 0.07964954201513341, + "grad_norm": 0.3141460477901498, + "learning_rate": 4.999024940966575e-06, + "loss": 0.6724, + "step": 300 + }, + { + "epoch": 0.07991504048851719, + "grad_norm": 0.29660845458593904, + "learning_rate": 4.999015166641595e-06, + "loss": 0.6973, + "step": 301 + }, + { + "epoch": 0.08018053896190097, + "grad_norm": 0.2970126436601212, + "learning_rate": 4.999005343579703e-06, + "loss": 0.6648, + "step": 302 + }, + { + "epoch": 0.08044603743528475, + "grad_norm": 0.29744317293632544, + "learning_rate": 4.9989954717810865e-06, + "loss": 0.6891, + "step": 303 + }, + { + "epoch": 0.08071153590866853, + "grad_norm": 0.2901410497684151, + "learning_rate": 4.99898555124594e-06, + "loss": 0.6735, + "step": 304 + }, + { + "epoch": 0.08097703438205231, + "grad_norm": 0.29798459544629646, + "learning_rate": 4.9989755819744565e-06, + "loss": 0.6539, + "step": 305 + }, + { + "epoch": 0.08124253285543608, + "grad_norm": 0.2929904256856655, + "learning_rate": 4.998965563966831e-06, + "loss": 0.6562, + "step": 306 + }, + { + "epoch": 0.08150803132881985, + "grad_norm": 0.2937270688048953, + "learning_rate": 4.998955497223258e-06, + "loss": 0.663, + "step": 307 + }, + { + "epoch": 0.08177352980220363, + "grad_norm": 0.2931852456506224, + "learning_rate": 4.9989453817439345e-06, + "loss": 0.6823, + "step": 308 + }, + { + "epoch": 0.08203902827558741, + "grad_norm": 0.2912478762560809, + "learning_rate": 4.998935217529058e-06, + "loss": 0.6713, + "step": 309 + }, + { + "epoch": 0.0823045267489712, + "grad_norm": 0.30838628503323395, + "learning_rate": 4.998925004578826e-06, + "loss": 0.6922, + "step": 310 + }, + { + "epoch": 0.08257002522235497, + "grad_norm": 0.2967990541380092, + "learning_rate": 4.9989147428934385e-06, + "loss": 0.6448, + "step": 311 + }, + { + "epoch": 0.08283552369573875, + "grad_norm": 0.2953837409606851, + "learning_rate": 4.998904432473095e-06, + "loss": 0.6708, + "step": 312 + }, + { + "epoch": 0.08310102216912253, + "grad_norm": 0.29801587193664514, + "learning_rate": 4.998894073317996e-06, + "loss": 0.6945, + "step": 313 + }, + { + "epoch": 0.08336652064250631, + "grad_norm": 0.29225932313117575, + "learning_rate": 4.998883665428345e-06, + "loss": 0.6492, + "step": 314 + }, + { + "epoch": 0.08363201911589008, + "grad_norm": 0.29959881567048147, + "learning_rate": 4.998873208804343e-06, + "loss": 0.6806, + "step": 315 + }, + { + "epoch": 0.08389751758927386, + "grad_norm": 0.29960446019233394, + "learning_rate": 4.998862703446196e-06, + "loss": 0.6573, + "step": 316 + }, + { + "epoch": 0.08416301606265764, + "grad_norm": 0.2946962090420051, + "learning_rate": 4.998852149354108e-06, + "loss": 0.656, + "step": 317 + }, + { + "epoch": 0.08442851453604142, + "grad_norm": 0.307819842583063, + "learning_rate": 4.998841546528284e-06, + "loss": 0.664, + "step": 318 + }, + { + "epoch": 0.0846940130094252, + "grad_norm": 0.2940226762339422, + "learning_rate": 4.998830894968934e-06, + "loss": 0.6504, + "step": 319 + }, + { + "epoch": 0.08495951148280897, + "grad_norm": 0.3010880006100906, + "learning_rate": 4.9988201946762615e-06, + "loss": 0.6499, + "step": 320 + }, + { + "epoch": 0.08522500995619275, + "grad_norm": 0.29970401482842796, + "learning_rate": 4.998809445650476e-06, + "loss": 0.678, + "step": 321 + }, + { + "epoch": 0.08549050842957653, + "grad_norm": 0.3121225152546364, + "learning_rate": 4.99879864789179e-06, + "loss": 0.6821, + "step": 322 + }, + { + "epoch": 0.08575600690296031, + "grad_norm": 0.3040775724714788, + "learning_rate": 4.998787801400412e-06, + "loss": 0.6874, + "step": 323 + }, + { + "epoch": 0.08602150537634409, + "grad_norm": 0.2974808855776655, + "learning_rate": 4.998776906176553e-06, + "loss": 0.7015, + "step": 324 + }, + { + "epoch": 0.08628700384972786, + "grad_norm": 0.29641748546634916, + "learning_rate": 4.998765962220428e-06, + "loss": 0.6909, + "step": 325 + }, + { + "epoch": 0.08655250232311164, + "grad_norm": 0.28834263017403083, + "learning_rate": 4.998754969532247e-06, + "loss": 0.6707, + "step": 326 + }, + { + "epoch": 0.08681800079649542, + "grad_norm": 0.30874417666950094, + "learning_rate": 4.998743928112226e-06, + "loss": 0.6893, + "step": 327 + }, + { + "epoch": 0.0870834992698792, + "grad_norm": 0.3023373305283153, + "learning_rate": 4.998732837960582e-06, + "loss": 0.7246, + "step": 328 + }, + { + "epoch": 0.08734899774326298, + "grad_norm": 0.2986985311476253, + "learning_rate": 4.998721699077528e-06, + "loss": 0.6878, + "step": 329 + }, + { + "epoch": 0.08761449621664676, + "grad_norm": 0.30676169244288776, + "learning_rate": 4.998710511463284e-06, + "loss": 0.6941, + "step": 330 + }, + { + "epoch": 0.08787999469003054, + "grad_norm": 0.30787895260330744, + "learning_rate": 4.998699275118066e-06, + "loss": 0.6797, + "step": 331 + }, + { + "epoch": 0.08814549316341432, + "grad_norm": 0.2923955549513281, + "learning_rate": 4.998687990042096e-06, + "loss": 0.6725, + "step": 332 + }, + { + "epoch": 0.0884109916367981, + "grad_norm": 0.29892236532536787, + "learning_rate": 4.998676656235591e-06, + "loss": 0.66, + "step": 333 + }, + { + "epoch": 0.08867649011018186, + "grad_norm": 0.29981490267370287, + "learning_rate": 4.998665273698774e-06, + "loss": 0.6555, + "step": 334 + }, + { + "epoch": 0.08894198858356564, + "grad_norm": 0.2937419301980639, + "learning_rate": 4.998653842431865e-06, + "loss": 0.6445, + "step": 335 + }, + { + "epoch": 0.08920748705694942, + "grad_norm": 0.2900337493308749, + "learning_rate": 4.99864236243509e-06, + "loss": 0.6574, + "step": 336 + }, + { + "epoch": 0.0894729855303332, + "grad_norm": 0.29057820345685953, + "learning_rate": 4.998630833708671e-06, + "loss": 0.6315, + "step": 337 + }, + { + "epoch": 0.08973848400371698, + "grad_norm": 0.29714366006288034, + "learning_rate": 4.998619256252832e-06, + "loss": 0.6929, + "step": 338 + }, + { + "epoch": 0.09000398247710076, + "grad_norm": 0.2991233791943958, + "learning_rate": 4.998607630067801e-06, + "loss": 0.6658, + "step": 339 + }, + { + "epoch": 0.09026948095048454, + "grad_norm": 0.3024435971160214, + "learning_rate": 4.998595955153803e-06, + "loss": 0.674, + "step": 340 + }, + { + "epoch": 0.09053497942386832, + "grad_norm": 0.28226917416513464, + "learning_rate": 4.998584231511067e-06, + "loss": 0.6463, + "step": 341 + }, + { + "epoch": 0.0908004778972521, + "grad_norm": 0.3001637870421723, + "learning_rate": 4.99857245913982e-06, + "loss": 0.692, + "step": 342 + }, + { + "epoch": 0.09106597637063586, + "grad_norm": 0.29754947156555495, + "learning_rate": 4.998560638040292e-06, + "loss": 0.6416, + "step": 343 + }, + { + "epoch": 0.09133147484401964, + "grad_norm": 0.30565826213510777, + "learning_rate": 4.998548768212716e-06, + "loss": 0.6686, + "step": 344 + }, + { + "epoch": 0.09159697331740342, + "grad_norm": 0.2998450329350609, + "learning_rate": 4.998536849657319e-06, + "loss": 0.6331, + "step": 345 + }, + { + "epoch": 0.0918624717907872, + "grad_norm": 0.3048929107122252, + "learning_rate": 4.998524882374338e-06, + "loss": 0.6475, + "step": 346 + }, + { + "epoch": 0.09212797026417098, + "grad_norm": 0.30975926255815844, + "learning_rate": 4.9985128663640035e-06, + "loss": 0.6633, + "step": 347 + }, + { + "epoch": 0.09239346873755476, + "grad_norm": 0.3168439032314714, + "learning_rate": 4.99850080162655e-06, + "loss": 0.6846, + "step": 348 + }, + { + "epoch": 0.09265896721093854, + "grad_norm": 0.29738742881475877, + "learning_rate": 4.9984886881622146e-06, + "loss": 0.6838, + "step": 349 + }, + { + "epoch": 0.09292446568432232, + "grad_norm": 0.2963089371897067, + "learning_rate": 4.998476525971232e-06, + "loss": 0.6365, + "step": 350 + }, + { + "epoch": 0.0931899641577061, + "grad_norm": 0.3120127203908184, + "learning_rate": 4.998464315053841e-06, + "loss": 0.6847, + "step": 351 + }, + { + "epoch": 0.09345546263108988, + "grad_norm": 0.30644369963379364, + "learning_rate": 4.998452055410276e-06, + "loss": 0.6804, + "step": 352 + }, + { + "epoch": 0.09372096110447364, + "grad_norm": 0.3166463471381308, + "learning_rate": 4.998439747040781e-06, + "loss": 0.6245, + "step": 353 + }, + { + "epoch": 0.09398645957785742, + "grad_norm": 0.30056253488107737, + "learning_rate": 4.9984273899455924e-06, + "loss": 0.6619, + "step": 354 + }, + { + "epoch": 0.0942519580512412, + "grad_norm": 0.29551938579257553, + "learning_rate": 4.998414984124953e-06, + "loss": 0.6793, + "step": 355 + }, + { + "epoch": 0.09451745652462498, + "grad_norm": 0.30577293764977503, + "learning_rate": 4.998402529579104e-06, + "loss": 0.685, + "step": 356 + }, + { + "epoch": 0.09478295499800876, + "grad_norm": 0.3136027154810347, + "learning_rate": 4.998390026308288e-06, + "loss": 0.6522, + "step": 357 + }, + { + "epoch": 0.09504845347139254, + "grad_norm": 0.3097738609974774, + "learning_rate": 4.998377474312751e-06, + "loss": 0.6835, + "step": 358 + }, + { + "epoch": 0.09531395194477632, + "grad_norm": 0.29881152239543424, + "learning_rate": 4.998364873592734e-06, + "loss": 0.6532, + "step": 359 + }, + { + "epoch": 0.0955794504181601, + "grad_norm": 0.30803760769027594, + "learning_rate": 4.998352224148487e-06, + "loss": 0.7106, + "step": 360 + }, + { + "epoch": 0.09584494889154388, + "grad_norm": 0.3146232150958707, + "learning_rate": 4.998339525980254e-06, + "loss": 0.7225, + "step": 361 + }, + { + "epoch": 0.09611044736492765, + "grad_norm": 0.30707379260872547, + "learning_rate": 4.998326779088283e-06, + "loss": 0.6821, + "step": 362 + }, + { + "epoch": 0.09637594583831142, + "grad_norm": 0.308760345968221, + "learning_rate": 4.998313983472823e-06, + "loss": 0.6844, + "step": 363 + }, + { + "epoch": 0.0966414443116952, + "grad_norm": 0.3042905996368259, + "learning_rate": 4.998301139134123e-06, + "loss": 0.6572, + "step": 364 + }, + { + "epoch": 0.09690694278507898, + "grad_norm": 0.30086485484817665, + "learning_rate": 4.998288246072434e-06, + "loss": 0.6521, + "step": 365 + }, + { + "epoch": 0.09717244125846276, + "grad_norm": 0.30234981928885984, + "learning_rate": 4.9982753042880076e-06, + "loss": 0.684, + "step": 366 + }, + { + "epoch": 0.09743793973184654, + "grad_norm": 0.30072324839807785, + "learning_rate": 4.998262313781096e-06, + "loss": 0.6916, + "step": 367 + }, + { + "epoch": 0.09770343820523032, + "grad_norm": 0.3078805070343228, + "learning_rate": 4.998249274551952e-06, + "loss": 0.6932, + "step": 368 + }, + { + "epoch": 0.0979689366786141, + "grad_norm": 0.3106113547728943, + "learning_rate": 4.9982361866008306e-06, + "loss": 0.6991, + "step": 369 + }, + { + "epoch": 0.09823443515199788, + "grad_norm": 0.3069038054416314, + "learning_rate": 4.998223049927987e-06, + "loss": 0.7023, + "step": 370 + }, + { + "epoch": 0.09849993362538166, + "grad_norm": 0.30210262430895374, + "learning_rate": 4.998209864533677e-06, + "loss": 0.6651, + "step": 371 + }, + { + "epoch": 0.09876543209876543, + "grad_norm": 0.31127245322981645, + "learning_rate": 4.998196630418159e-06, + "loss": 0.6575, + "step": 372 + }, + { + "epoch": 0.0990309305721492, + "grad_norm": 0.30865037988370164, + "learning_rate": 4.9981833475816886e-06, + "loss": 0.6797, + "step": 373 + }, + { + "epoch": 0.09929642904553299, + "grad_norm": 0.3056326792569424, + "learning_rate": 4.998170016024526e-06, + "loss": 0.6576, + "step": 374 + }, + { + "epoch": 0.09956192751891677, + "grad_norm": 0.30830297591571204, + "learning_rate": 4.9981566357469325e-06, + "loss": 0.6684, + "step": 375 + }, + { + "epoch": 0.09982742599230054, + "grad_norm": 0.31003718799398505, + "learning_rate": 4.998143206749168e-06, + "loss": 0.6819, + "step": 376 + }, + { + "epoch": 0.10009292446568432, + "grad_norm": 0.3104855221548017, + "learning_rate": 4.998129729031495e-06, + "loss": 0.6804, + "step": 377 + }, + { + "epoch": 0.1003584229390681, + "grad_norm": 0.3079010557016698, + "learning_rate": 4.998116202594175e-06, + "loss": 0.6551, + "step": 378 + }, + { + "epoch": 0.10062392141245188, + "grad_norm": 0.2990541945875933, + "learning_rate": 4.998102627437472e-06, + "loss": 0.6565, + "step": 379 + }, + { + "epoch": 0.10088941988583566, + "grad_norm": 0.31555616099171835, + "learning_rate": 4.998089003561653e-06, + "loss": 0.6869, + "step": 380 + }, + { + "epoch": 0.10115491835921943, + "grad_norm": 0.2949840548334904, + "learning_rate": 4.99807533096698e-06, + "loss": 0.6668, + "step": 381 + }, + { + "epoch": 0.10142041683260321, + "grad_norm": 0.3158090992458187, + "learning_rate": 4.9980616096537235e-06, + "loss": 0.6974, + "step": 382 + }, + { + "epoch": 0.10168591530598699, + "grad_norm": 0.30802755793824466, + "learning_rate": 4.998047839622149e-06, + "loss": 0.6868, + "step": 383 + }, + { + "epoch": 0.10195141377937077, + "grad_norm": 0.30873942546894295, + "learning_rate": 4.998034020872526e-06, + "loss": 0.6974, + "step": 384 + }, + { + "epoch": 0.10221691225275455, + "grad_norm": 0.31268243841582133, + "learning_rate": 4.9980201534051214e-06, + "loss": 0.7004, + "step": 385 + }, + { + "epoch": 0.10248241072613833, + "grad_norm": 0.30737025093264747, + "learning_rate": 4.998006237220209e-06, + "loss": 0.6492, + "step": 386 + }, + { + "epoch": 0.1027479091995221, + "grad_norm": 0.3067303152805426, + "learning_rate": 4.997992272318058e-06, + "loss": 0.6869, + "step": 387 + }, + { + "epoch": 0.10301340767290589, + "grad_norm": 0.3125290276549197, + "learning_rate": 4.997978258698942e-06, + "loss": 0.6474, + "step": 388 + }, + { + "epoch": 0.10327890614628966, + "grad_norm": 0.30298746561845363, + "learning_rate": 4.9979641963631344e-06, + "loss": 0.6521, + "step": 389 + }, + { + "epoch": 0.10354440461967343, + "grad_norm": 0.3185722182882203, + "learning_rate": 4.997950085310907e-06, + "loss": 0.7171, + "step": 390 + }, + { + "epoch": 0.10380990309305721, + "grad_norm": 0.3056792173121192, + "learning_rate": 4.997935925542539e-06, + "loss": 0.663, + "step": 391 + }, + { + "epoch": 0.10407540156644099, + "grad_norm": 0.3162449557105779, + "learning_rate": 4.997921717058304e-06, + "loss": 0.684, + "step": 392 + }, + { + "epoch": 0.10434090003982477, + "grad_norm": 0.3114840922452039, + "learning_rate": 4.997907459858479e-06, + "loss": 0.6651, + "step": 393 + }, + { + "epoch": 0.10460639851320855, + "grad_norm": 0.3160506470893945, + "learning_rate": 4.997893153943343e-06, + "loss": 0.7194, + "step": 394 + }, + { + "epoch": 0.10487189698659233, + "grad_norm": 0.3106758543108377, + "learning_rate": 4.997878799313174e-06, + "loss": 0.6599, + "step": 395 + }, + { + "epoch": 0.10513739545997611, + "grad_norm": 0.3086954997584318, + "learning_rate": 4.997864395968252e-06, + "loss": 0.6911, + "step": 396 + }, + { + "epoch": 0.10540289393335989, + "grad_norm": 0.3114318196879341, + "learning_rate": 4.99784994390886e-06, + "loss": 0.6593, + "step": 397 + }, + { + "epoch": 0.10566839240674367, + "grad_norm": 0.31406721651161573, + "learning_rate": 4.9978354431352785e-06, + "loss": 0.6338, + "step": 398 + }, + { + "epoch": 0.10593389088012745, + "grad_norm": 0.3177758176653506, + "learning_rate": 4.997820893647789e-06, + "loss": 0.7053, + "step": 399 + }, + { + "epoch": 0.10619938935351121, + "grad_norm": 0.3027956837248792, + "learning_rate": 4.997806295446676e-06, + "loss": 0.6804, + "step": 400 + }, + { + "epoch": 0.10646488782689499, + "grad_norm": 0.3020081713951576, + "learning_rate": 4.9977916485322255e-06, + "loss": 0.6412, + "step": 401 + }, + { + "epoch": 0.10673038630027877, + "grad_norm": 0.32617568438218675, + "learning_rate": 4.997776952904723e-06, + "loss": 0.6756, + "step": 402 + }, + { + "epoch": 0.10699588477366255, + "grad_norm": 0.30597090699286184, + "learning_rate": 4.9977622085644525e-06, + "loss": 0.6723, + "step": 403 + }, + { + "epoch": 0.10726138324704633, + "grad_norm": 0.3144766515591238, + "learning_rate": 4.997747415511705e-06, + "loss": 0.6653, + "step": 404 + }, + { + "epoch": 0.10752688172043011, + "grad_norm": 0.31378962465347404, + "learning_rate": 4.997732573746765e-06, + "loss": 0.6941, + "step": 405 + }, + { + "epoch": 0.10779238019381389, + "grad_norm": 0.31097367436263995, + "learning_rate": 4.997717683269926e-06, + "loss": 0.624, + "step": 406 + }, + { + "epoch": 0.10805787866719767, + "grad_norm": 0.31114714417062406, + "learning_rate": 4.997702744081477e-06, + "loss": 0.6889, + "step": 407 + }, + { + "epoch": 0.10832337714058145, + "grad_norm": 0.3278466064162668, + "learning_rate": 4.9976877561817086e-06, + "loss": 0.6977, + "step": 408 + }, + { + "epoch": 0.10858887561396521, + "grad_norm": 0.31784295440346955, + "learning_rate": 4.997672719570913e-06, + "loss": 0.6588, + "step": 409 + }, + { + "epoch": 0.10885437408734899, + "grad_norm": 0.31856573937167837, + "learning_rate": 4.997657634249384e-06, + "loss": 0.712, + "step": 410 + }, + { + "epoch": 0.10911987256073277, + "grad_norm": 0.3086321126823493, + "learning_rate": 4.997642500217417e-06, + "loss": 0.694, + "step": 411 + }, + { + "epoch": 0.10938537103411655, + "grad_norm": 0.3233180475823364, + "learning_rate": 4.997627317475305e-06, + "loss": 0.6972, + "step": 412 + }, + { + "epoch": 0.10965086950750033, + "grad_norm": 0.31087689547222547, + "learning_rate": 4.997612086023345e-06, + "loss": 0.653, + "step": 413 + }, + { + "epoch": 0.10991636798088411, + "grad_norm": 0.3176995483680813, + "learning_rate": 4.997596805861835e-06, + "loss": 0.6807, + "step": 414 + }, + { + "epoch": 0.11018186645426789, + "grad_norm": 0.31386266013051656, + "learning_rate": 4.997581476991071e-06, + "loss": 0.6618, + "step": 415 + }, + { + "epoch": 0.11044736492765167, + "grad_norm": 0.3152387993743806, + "learning_rate": 4.997566099411354e-06, + "loss": 0.686, + "step": 416 + }, + { + "epoch": 0.11071286340103545, + "grad_norm": 0.33351140546506874, + "learning_rate": 4.997550673122983e-06, + "loss": 0.6772, + "step": 417 + }, + { + "epoch": 0.11097836187441922, + "grad_norm": 0.3186827810679576, + "learning_rate": 4.997535198126258e-06, + "loss": 0.662, + "step": 418 + }, + { + "epoch": 0.111243860347803, + "grad_norm": 0.31009634384997614, + "learning_rate": 4.997519674421483e-06, + "loss": 0.6655, + "step": 419 + }, + { + "epoch": 0.11150935882118677, + "grad_norm": 0.3268830658834569, + "learning_rate": 4.997504102008957e-06, + "loss": 0.693, + "step": 420 + }, + { + "epoch": 0.11177485729457055, + "grad_norm": 0.30954548187647873, + "learning_rate": 4.997488480888988e-06, + "loss": 0.671, + "step": 421 + }, + { + "epoch": 0.11204035576795433, + "grad_norm": 0.30894394142603065, + "learning_rate": 4.997472811061879e-06, + "loss": 0.6645, + "step": 422 + }, + { + "epoch": 0.11230585424133811, + "grad_norm": 0.31069961094923226, + "learning_rate": 4.997457092527934e-06, + "loss": 0.6619, + "step": 423 + }, + { + "epoch": 0.11257135271472189, + "grad_norm": 0.31267417835999584, + "learning_rate": 4.997441325287461e-06, + "loss": 0.6625, + "step": 424 + }, + { + "epoch": 0.11283685118810567, + "grad_norm": 0.31286268667247885, + "learning_rate": 4.997425509340767e-06, + "loss": 0.6898, + "step": 425 + }, + { + "epoch": 0.11310234966148945, + "grad_norm": 0.3133878370932067, + "learning_rate": 4.997409644688162e-06, + "loss": 0.6498, + "step": 426 + }, + { + "epoch": 0.11336784813487323, + "grad_norm": 0.30696063627302694, + "learning_rate": 4.997393731329954e-06, + "loss": 0.6828, + "step": 427 + }, + { + "epoch": 0.113633346608257, + "grad_norm": 0.30531189678202303, + "learning_rate": 4.997377769266454e-06, + "loss": 0.6434, + "step": 428 + }, + { + "epoch": 0.11389884508164078, + "grad_norm": 0.30062330676572224, + "learning_rate": 4.997361758497972e-06, + "loss": 0.6208, + "step": 429 + }, + { + "epoch": 0.11416434355502456, + "grad_norm": 0.2946536636599198, + "learning_rate": 4.997345699024821e-06, + "loss": 0.6227, + "step": 430 + }, + { + "epoch": 0.11442984202840834, + "grad_norm": 0.3069850692030948, + "learning_rate": 4.9973295908473146e-06, + "loss": 0.6576, + "step": 431 + }, + { + "epoch": 0.11469534050179211, + "grad_norm": 0.31148890419045955, + "learning_rate": 4.997313433965767e-06, + "loss": 0.6678, + "step": 432 + }, + { + "epoch": 0.1149608389751759, + "grad_norm": 0.324529924104944, + "learning_rate": 4.997297228380492e-06, + "loss": 0.7094, + "step": 433 + }, + { + "epoch": 0.11522633744855967, + "grad_norm": 0.3155063789449674, + "learning_rate": 4.997280974091808e-06, + "loss": 0.6392, + "step": 434 + }, + { + "epoch": 0.11549183592194345, + "grad_norm": 0.3120980596384352, + "learning_rate": 4.997264671100029e-06, + "loss": 0.6454, + "step": 435 + }, + { + "epoch": 0.11575733439532723, + "grad_norm": 0.31139341298472606, + "learning_rate": 4.997248319405475e-06, + "loss": 0.6834, + "step": 436 + }, + { + "epoch": 0.116022832868711, + "grad_norm": 0.3220778362463673, + "learning_rate": 4.997231919008466e-06, + "loss": 0.6931, + "step": 437 + }, + { + "epoch": 0.11628833134209478, + "grad_norm": 0.3140339484239711, + "learning_rate": 4.997215469909319e-06, + "loss": 0.6612, + "step": 438 + }, + { + "epoch": 0.11655382981547856, + "grad_norm": 0.3130511631833597, + "learning_rate": 4.997198972108356e-06, + "loss": 0.6604, + "step": 439 + }, + { + "epoch": 0.11681932828886234, + "grad_norm": 0.31838552167395007, + "learning_rate": 4.997182425605899e-06, + "loss": 0.6819, + "step": 440 + }, + { + "epoch": 0.11708482676224612, + "grad_norm": 0.3231736062338604, + "learning_rate": 4.9971658304022705e-06, + "loss": 0.7176, + "step": 441 + }, + { + "epoch": 0.1173503252356299, + "grad_norm": 0.31796270168634794, + "learning_rate": 4.997149186497795e-06, + "loss": 0.6435, + "step": 442 + }, + { + "epoch": 0.11761582370901368, + "grad_norm": 0.31256477602255317, + "learning_rate": 4.997132493892796e-06, + "loss": 0.6753, + "step": 443 + }, + { + "epoch": 0.11788132218239745, + "grad_norm": 0.3187404644494588, + "learning_rate": 4.997115752587598e-06, + "loss": 0.6952, + "step": 444 + }, + { + "epoch": 0.11814682065578123, + "grad_norm": 0.3297731732543038, + "learning_rate": 4.99709896258253e-06, + "loss": 0.6911, + "step": 445 + }, + { + "epoch": 0.11841231912916501, + "grad_norm": 0.31247350710985655, + "learning_rate": 4.997082123877918e-06, + "loss": 0.6803, + "step": 446 + }, + { + "epoch": 0.11867781760254878, + "grad_norm": 0.3297760667742294, + "learning_rate": 4.997065236474092e-06, + "loss": 0.6922, + "step": 447 + }, + { + "epoch": 0.11894331607593256, + "grad_norm": 0.30422819180050326, + "learning_rate": 4.997048300371377e-06, + "loss": 0.6466, + "step": 448 + }, + { + "epoch": 0.11920881454931634, + "grad_norm": 0.3199451761919143, + "learning_rate": 4.997031315570109e-06, + "loss": 0.6569, + "step": 449 + }, + { + "epoch": 0.11947431302270012, + "grad_norm": 0.32478782472338646, + "learning_rate": 4.997014282070615e-06, + "loss": 0.6674, + "step": 450 + }, + { + "epoch": 0.1197398114960839, + "grad_norm": 0.3316660848846396, + "learning_rate": 4.9969971998732294e-06, + "loss": 0.7, + "step": 451 + }, + { + "epoch": 0.12000530996946768, + "grad_norm": 0.31690024112628196, + "learning_rate": 4.996980068978285e-06, + "loss": 0.6643, + "step": 452 + }, + { + "epoch": 0.12027080844285146, + "grad_norm": 0.3267511446290268, + "learning_rate": 4.996962889386116e-06, + "loss": 0.6668, + "step": 453 + }, + { + "epoch": 0.12053630691623524, + "grad_norm": 0.31948431087726, + "learning_rate": 4.9969456610970555e-06, + "loss": 0.6709, + "step": 454 + }, + { + "epoch": 0.12080180538961902, + "grad_norm": 0.32208238773189896, + "learning_rate": 4.996928384111442e-06, + "loss": 0.6874, + "step": 455 + }, + { + "epoch": 0.12106730386300278, + "grad_norm": 0.3148108500947568, + "learning_rate": 4.996911058429611e-06, + "loss": 0.6862, + "step": 456 + }, + { + "epoch": 0.12133280233638656, + "grad_norm": 0.3119940304171294, + "learning_rate": 4.996893684051901e-06, + "loss": 0.6406, + "step": 457 + }, + { + "epoch": 0.12159830080977034, + "grad_norm": 0.3232865917452015, + "learning_rate": 4.996876260978651e-06, + "loss": 0.7013, + "step": 458 + }, + { + "epoch": 0.12186379928315412, + "grad_norm": 0.3252102414886518, + "learning_rate": 4.9968587892102e-06, + "loss": 0.6559, + "step": 459 + }, + { + "epoch": 0.1221292977565379, + "grad_norm": 0.3269747897933381, + "learning_rate": 4.99684126874689e-06, + "loss": 0.6155, + "step": 460 + }, + { + "epoch": 0.12239479622992168, + "grad_norm": 0.3179398124119427, + "learning_rate": 4.996823699589062e-06, + "loss": 0.6369, + "step": 461 + }, + { + "epoch": 0.12266029470330546, + "grad_norm": 0.3280930124454813, + "learning_rate": 4.996806081737058e-06, + "loss": 0.7001, + "step": 462 + }, + { + "epoch": 0.12292579317668924, + "grad_norm": 0.3370923358901789, + "learning_rate": 4.996788415191223e-06, + "loss": 0.6858, + "step": 463 + }, + { + "epoch": 0.12319129165007302, + "grad_norm": 0.3295220353946801, + "learning_rate": 4.9967706999519e-06, + "loss": 0.6602, + "step": 464 + }, + { + "epoch": 0.12345679012345678, + "grad_norm": 0.3152774696864616, + "learning_rate": 4.996752936019435e-06, + "loss": 0.6837, + "step": 465 + }, + { + "epoch": 0.12372228859684056, + "grad_norm": 0.3239072427931697, + "learning_rate": 4.996735123394175e-06, + "loss": 0.7113, + "step": 466 + }, + { + "epoch": 0.12398778707022434, + "grad_norm": 0.31285507970163806, + "learning_rate": 4.996717262076467e-06, + "loss": 0.6255, + "step": 467 + }, + { + "epoch": 0.12425328554360812, + "grad_norm": 0.3425777000470296, + "learning_rate": 4.996699352066659e-06, + "loss": 0.6906, + "step": 468 + }, + { + "epoch": 0.1245187840169919, + "grad_norm": 0.3226226899231145, + "learning_rate": 4.9966813933651005e-06, + "loss": 0.6753, + "step": 469 + }, + { + "epoch": 0.12478428249037568, + "grad_norm": 0.31435598476016197, + "learning_rate": 4.996663385972143e-06, + "loss": 0.6144, + "step": 470 + }, + { + "epoch": 0.12504978096375946, + "grad_norm": 0.3193846061113134, + "learning_rate": 4.996645329888135e-06, + "loss": 0.6238, + "step": 471 + }, + { + "epoch": 0.12531527943714324, + "grad_norm": 0.31361568411029606, + "learning_rate": 4.996627225113432e-06, + "loss": 0.6552, + "step": 472 + }, + { + "epoch": 0.12558077791052702, + "grad_norm": 0.309880081345077, + "learning_rate": 4.996609071648384e-06, + "loss": 0.6661, + "step": 473 + }, + { + "epoch": 0.1258462763839108, + "grad_norm": 0.32315101335746027, + "learning_rate": 4.996590869493347e-06, + "loss": 0.7002, + "step": 474 + }, + { + "epoch": 0.12611177485729458, + "grad_norm": 0.3149890808320547, + "learning_rate": 4.996572618648675e-06, + "loss": 0.6614, + "step": 475 + }, + { + "epoch": 0.12637727333067836, + "grad_norm": 0.3322823792242008, + "learning_rate": 4.996554319114724e-06, + "loss": 0.6387, + "step": 476 + }, + { + "epoch": 0.12664277180406214, + "grad_norm": 0.3371307654023024, + "learning_rate": 4.996535970891852e-06, + "loss": 0.6466, + "step": 477 + }, + { + "epoch": 0.12690827027744592, + "grad_norm": 0.32510819181993517, + "learning_rate": 4.9965175739804165e-06, + "loss": 0.7028, + "step": 478 + }, + { + "epoch": 0.1271737687508297, + "grad_norm": 0.3288285496065907, + "learning_rate": 4.996499128380775e-06, + "loss": 0.6783, + "step": 479 + }, + { + "epoch": 0.12743926722421345, + "grad_norm": 0.3090508055553086, + "learning_rate": 4.996480634093287e-06, + "loss": 0.6767, + "step": 480 + }, + { + "epoch": 0.12770476569759723, + "grad_norm": 0.32731228442771315, + "learning_rate": 4.9964620911183154e-06, + "loss": 0.6456, + "step": 481 + }, + { + "epoch": 0.127970264170981, + "grad_norm": 0.323293904647429, + "learning_rate": 4.996443499456221e-06, + "loss": 0.6439, + "step": 482 + }, + { + "epoch": 0.1282357626443648, + "grad_norm": 0.3316511232865672, + "learning_rate": 4.996424859107365e-06, + "loss": 0.7088, + "step": 483 + }, + { + "epoch": 0.12850126111774857, + "grad_norm": 0.32521408570883387, + "learning_rate": 4.9964061700721124e-06, + "loss": 0.6177, + "step": 484 + }, + { + "epoch": 0.12876675959113235, + "grad_norm": 0.3316638909021025, + "learning_rate": 4.996387432350827e-06, + "loss": 0.6778, + "step": 485 + }, + { + "epoch": 0.12903225806451613, + "grad_norm": 0.3299686641445629, + "learning_rate": 4.996368645943874e-06, + "loss": 0.6897, + "step": 486 + }, + { + "epoch": 0.1292977565378999, + "grad_norm": 0.32230058363824565, + "learning_rate": 4.996349810851621e-06, + "loss": 0.6731, + "step": 487 + }, + { + "epoch": 0.12956325501128368, + "grad_norm": 0.31931572173577116, + "learning_rate": 4.996330927074434e-06, + "loss": 0.6659, + "step": 488 + }, + { + "epoch": 0.12982875348466746, + "grad_norm": 0.3400639593522898, + "learning_rate": 4.996311994612683e-06, + "loss": 0.658, + "step": 489 + }, + { + "epoch": 0.13009425195805124, + "grad_norm": 0.3169094401640569, + "learning_rate": 4.996293013466735e-06, + "loss": 0.6712, + "step": 490 + }, + { + "epoch": 0.13035975043143502, + "grad_norm": 0.32357816964838454, + "learning_rate": 4.996273983636961e-06, + "loss": 0.6663, + "step": 491 + }, + { + "epoch": 0.1306252489048188, + "grad_norm": 0.32940241014043, + "learning_rate": 4.996254905123734e-06, + "loss": 0.6579, + "step": 492 + }, + { + "epoch": 0.13089074737820258, + "grad_norm": 0.3193910004394228, + "learning_rate": 4.9962357779274225e-06, + "loss": 0.6527, + "step": 493 + }, + { + "epoch": 0.13115624585158636, + "grad_norm": 0.3432696084572552, + "learning_rate": 4.996216602048402e-06, + "loss": 0.6496, + "step": 494 + }, + { + "epoch": 0.13142174432497014, + "grad_norm": 0.33351583885342173, + "learning_rate": 4.996197377487047e-06, + "loss": 0.6813, + "step": 495 + }, + { + "epoch": 0.13168724279835392, + "grad_norm": 0.34081496762738256, + "learning_rate": 4.996178104243731e-06, + "loss": 0.6772, + "step": 496 + }, + { + "epoch": 0.1319527412717377, + "grad_norm": 0.33491355000505957, + "learning_rate": 4.9961587823188294e-06, + "loss": 0.662, + "step": 497 + }, + { + "epoch": 0.13221823974512145, + "grad_norm": 0.33301061936692605, + "learning_rate": 4.996139411712721e-06, + "loss": 0.6918, + "step": 498 + }, + { + "epoch": 0.13248373821850523, + "grad_norm": 0.3400842904392137, + "learning_rate": 4.996119992425782e-06, + "loss": 0.6647, + "step": 499 + }, + { + "epoch": 0.132749236691889, + "grad_norm": 0.3102669536299804, + "learning_rate": 4.9961005244583915e-06, + "loss": 0.6541, + "step": 500 + }, + { + "epoch": 0.1330147351652728, + "grad_norm": 0.328665079455763, + "learning_rate": 4.99608100781093e-06, + "loss": 0.6683, + "step": 501 + }, + { + "epoch": 0.13328023363865657, + "grad_norm": 0.33072326613560726, + "learning_rate": 4.996061442483776e-06, + "loss": 0.6707, + "step": 502 + }, + { + "epoch": 0.13354573211204035, + "grad_norm": 0.3359127670041347, + "learning_rate": 4.996041828477314e-06, + "loss": 0.6756, + "step": 503 + }, + { + "epoch": 0.13381123058542413, + "grad_norm": 0.34455880718341786, + "learning_rate": 4.996022165791924e-06, + "loss": 0.6917, + "step": 504 + }, + { + "epoch": 0.1340767290588079, + "grad_norm": 0.32900522206732946, + "learning_rate": 4.99600245442799e-06, + "loss": 0.6471, + "step": 505 + }, + { + "epoch": 0.1343422275321917, + "grad_norm": 0.3104006173223105, + "learning_rate": 4.995982694385898e-06, + "loss": 0.6592, + "step": 506 + }, + { + "epoch": 0.13460772600557547, + "grad_norm": 0.3134989782521258, + "learning_rate": 4.995962885666031e-06, + "loss": 0.6551, + "step": 507 + }, + { + "epoch": 0.13487322447895925, + "grad_norm": 0.33422565458562004, + "learning_rate": 4.9959430282687785e-06, + "loss": 0.6428, + "step": 508 + }, + { + "epoch": 0.13513872295234303, + "grad_norm": 0.32444176124655094, + "learning_rate": 4.995923122194523e-06, + "loss": 0.6525, + "step": 509 + }, + { + "epoch": 0.1354042214257268, + "grad_norm": 0.33384089903040765, + "learning_rate": 4.995903167443658e-06, + "loss": 0.6828, + "step": 510 + }, + { + "epoch": 0.13566971989911059, + "grad_norm": 0.34164211670121414, + "learning_rate": 4.99588316401657e-06, + "loss": 0.6701, + "step": 511 + }, + { + "epoch": 0.13593521837249437, + "grad_norm": 0.3323764019600334, + "learning_rate": 4.995863111913648e-06, + "loss": 0.6707, + "step": 512 + }, + { + "epoch": 0.13620071684587814, + "grad_norm": 0.3228690168718786, + "learning_rate": 4.9958430111352855e-06, + "loss": 0.6482, + "step": 513 + }, + { + "epoch": 0.13646621531926192, + "grad_norm": 0.3210241856808829, + "learning_rate": 4.995822861681873e-06, + "loss": 0.6383, + "step": 514 + }, + { + "epoch": 0.1367317137926457, + "grad_norm": 0.32995295199460256, + "learning_rate": 4.995802663553803e-06, + "loss": 0.6645, + "step": 515 + }, + { + "epoch": 0.13699721226602948, + "grad_norm": 0.32683288941278976, + "learning_rate": 4.995782416751472e-06, + "loss": 0.6647, + "step": 516 + }, + { + "epoch": 0.13726271073941324, + "grad_norm": 0.3196919907438924, + "learning_rate": 4.995762121275272e-06, + "loss": 0.6881, + "step": 517 + }, + { + "epoch": 0.13752820921279701, + "grad_norm": 0.33328796423808194, + "learning_rate": 4.9957417771256e-06, + "loss": 0.6933, + "step": 518 + }, + { + "epoch": 0.1377937076861808, + "grad_norm": 0.3349629222708952, + "learning_rate": 4.995721384302853e-06, + "loss": 0.6785, + "step": 519 + }, + { + "epoch": 0.13805920615956457, + "grad_norm": 0.34465169753892805, + "learning_rate": 4.995700942807429e-06, + "loss": 0.6657, + "step": 520 + }, + { + "epoch": 0.13832470463294835, + "grad_norm": 0.3135898169384583, + "learning_rate": 4.995680452639725e-06, + "loss": 0.6412, + "step": 521 + }, + { + "epoch": 0.13859020310633213, + "grad_norm": 0.35602453516277327, + "learning_rate": 4.995659913800143e-06, + "loss": 0.6343, + "step": 522 + }, + { + "epoch": 0.1388557015797159, + "grad_norm": 0.3204032153635913, + "learning_rate": 4.995639326289081e-06, + "loss": 0.675, + "step": 523 + }, + { + "epoch": 0.1391212000530997, + "grad_norm": 0.3277898414945331, + "learning_rate": 4.995618690106942e-06, + "loss": 0.6886, + "step": 524 + }, + { + "epoch": 0.13938669852648347, + "grad_norm": 0.3393355770985635, + "learning_rate": 4.995598005254129e-06, + "loss": 0.6717, + "step": 525 + }, + { + "epoch": 0.13965219699986725, + "grad_norm": 0.32347596138398255, + "learning_rate": 4.995577271731044e-06, + "loss": 0.644, + "step": 526 + }, + { + "epoch": 0.13991769547325103, + "grad_norm": 0.3266478883557009, + "learning_rate": 4.995556489538091e-06, + "loss": 0.6767, + "step": 527 + }, + { + "epoch": 0.1401831939466348, + "grad_norm": 0.3381961614423876, + "learning_rate": 4.995535658675678e-06, + "loss": 0.6889, + "step": 528 + }, + { + "epoch": 0.1404486924200186, + "grad_norm": 0.3283413859213309, + "learning_rate": 4.995514779144208e-06, + "loss": 0.6768, + "step": 529 + }, + { + "epoch": 0.14071419089340237, + "grad_norm": 0.3281449282605594, + "learning_rate": 4.995493850944091e-06, + "loss": 0.6835, + "step": 530 + }, + { + "epoch": 0.14097968936678615, + "grad_norm": 0.32330376989056236, + "learning_rate": 4.9954728740757336e-06, + "loss": 0.6586, + "step": 531 + }, + { + "epoch": 0.14124518784016993, + "grad_norm": 0.34094390843593775, + "learning_rate": 4.995451848539545e-06, + "loss": 0.662, + "step": 532 + }, + { + "epoch": 0.1415106863135537, + "grad_norm": 0.31373142731202014, + "learning_rate": 4.9954307743359355e-06, + "loss": 0.6439, + "step": 533 + }, + { + "epoch": 0.1417761847869375, + "grad_norm": 0.33183275332426326, + "learning_rate": 4.995409651465316e-06, + "loss": 0.6567, + "step": 534 + }, + { + "epoch": 0.14204168326032127, + "grad_norm": 0.3390225618878591, + "learning_rate": 4.995388479928098e-06, + "loss": 0.6446, + "step": 535 + }, + { + "epoch": 0.14230718173370502, + "grad_norm": 0.32369220295055845, + "learning_rate": 4.995367259724696e-06, + "loss": 0.6731, + "step": 536 + }, + { + "epoch": 0.1425726802070888, + "grad_norm": 0.33924915117232796, + "learning_rate": 4.9953459908555215e-06, + "loss": 0.6767, + "step": 537 + }, + { + "epoch": 0.14283817868047258, + "grad_norm": 0.3521701665111978, + "learning_rate": 4.995324673320991e-06, + "loss": 0.657, + "step": 538 + }, + { + "epoch": 0.14310367715385636, + "grad_norm": 0.3343091569613087, + "learning_rate": 4.99530330712152e-06, + "loss": 0.6953, + "step": 539 + }, + { + "epoch": 0.14336917562724014, + "grad_norm": 0.33082562613574634, + "learning_rate": 4.995281892257525e-06, + "loss": 0.6555, + "step": 540 + }, + { + "epoch": 0.14363467410062392, + "grad_norm": 0.3360100839446358, + "learning_rate": 4.9952604287294236e-06, + "loss": 0.6534, + "step": 541 + }, + { + "epoch": 0.1439001725740077, + "grad_norm": 0.3196775960507627, + "learning_rate": 4.995238916537634e-06, + "loss": 0.6713, + "step": 542 + }, + { + "epoch": 0.14416567104739147, + "grad_norm": 0.33537763471186866, + "learning_rate": 4.995217355682576e-06, + "loss": 0.676, + "step": 543 + }, + { + "epoch": 0.14443116952077525, + "grad_norm": 0.34609653703087356, + "learning_rate": 4.9951957461646705e-06, + "loss": 0.6683, + "step": 544 + }, + { + "epoch": 0.14469666799415903, + "grad_norm": 0.35264005734812376, + "learning_rate": 4.995174087984339e-06, + "loss": 0.6619, + "step": 545 + }, + { + "epoch": 0.1449621664675428, + "grad_norm": 0.3245915297253958, + "learning_rate": 4.995152381142003e-06, + "loss": 0.66, + "step": 546 + }, + { + "epoch": 0.1452276649409266, + "grad_norm": 0.342351782543807, + "learning_rate": 4.995130625638087e-06, + "loss": 0.6909, + "step": 547 + }, + { + "epoch": 0.14549316341431037, + "grad_norm": 0.34685925800788514, + "learning_rate": 4.995108821473014e-06, + "loss": 0.6886, + "step": 548 + }, + { + "epoch": 0.14575866188769415, + "grad_norm": 0.33818507932197633, + "learning_rate": 4.99508696864721e-06, + "loss": 0.7143, + "step": 549 + }, + { + "epoch": 0.14602416036107793, + "grad_norm": 0.32524686758940563, + "learning_rate": 4.995065067161101e-06, + "loss": 0.6731, + "step": 550 + }, + { + "epoch": 0.1462896588344617, + "grad_norm": 0.32694976818783966, + "learning_rate": 4.9950431170151145e-06, + "loss": 0.6794, + "step": 551 + }, + { + "epoch": 0.1465551573078455, + "grad_norm": 0.33462789426067663, + "learning_rate": 4.995021118209679e-06, + "loss": 0.665, + "step": 552 + }, + { + "epoch": 0.14682065578122927, + "grad_norm": 0.34655446891220265, + "learning_rate": 4.994999070745222e-06, + "loss": 0.6797, + "step": 553 + }, + { + "epoch": 0.14708615425461305, + "grad_norm": 0.326030150865353, + "learning_rate": 4.994976974622175e-06, + "loss": 0.6561, + "step": 554 + }, + { + "epoch": 0.1473516527279968, + "grad_norm": 0.3414446163531893, + "learning_rate": 4.9949548298409665e-06, + "loss": 0.6751, + "step": 555 + }, + { + "epoch": 0.14761715120138058, + "grad_norm": 0.3539416165824423, + "learning_rate": 4.994932636402032e-06, + "loss": 0.6449, + "step": 556 + }, + { + "epoch": 0.14788264967476436, + "grad_norm": 0.3316388913017657, + "learning_rate": 4.994910394305801e-06, + "loss": 0.6387, + "step": 557 + }, + { + "epoch": 0.14814814814814814, + "grad_norm": 0.32747030798146964, + "learning_rate": 4.994888103552709e-06, + "loss": 0.6707, + "step": 558 + }, + { + "epoch": 0.14841364662153192, + "grad_norm": 0.37322744776731953, + "learning_rate": 4.994865764143192e-06, + "loss": 0.6819, + "step": 559 + }, + { + "epoch": 0.1486791450949157, + "grad_norm": 0.33672340444534893, + "learning_rate": 4.994843376077683e-06, + "loss": 0.6607, + "step": 560 + }, + { + "epoch": 0.14894464356829948, + "grad_norm": 0.3298195074729293, + "learning_rate": 4.994820939356619e-06, + "loss": 0.6569, + "step": 561 + }, + { + "epoch": 0.14921014204168326, + "grad_norm": 0.3498886832736364, + "learning_rate": 4.994798453980439e-06, + "loss": 0.65, + "step": 562 + }, + { + "epoch": 0.14947564051506704, + "grad_norm": 0.3371708895587616, + "learning_rate": 4.994775919949582e-06, + "loss": 0.6969, + "step": 563 + }, + { + "epoch": 0.14974113898845082, + "grad_norm": 0.3342634727667992, + "learning_rate": 4.994753337264484e-06, + "loss": 0.6767, + "step": 564 + }, + { + "epoch": 0.1500066374618346, + "grad_norm": 0.3260166590996332, + "learning_rate": 4.994730705925589e-06, + "loss": 0.6432, + "step": 565 + }, + { + "epoch": 0.15027213593521838, + "grad_norm": 0.3291001951665862, + "learning_rate": 4.994708025933337e-06, + "loss": 0.6365, + "step": 566 + }, + { + "epoch": 0.15053763440860216, + "grad_norm": 0.3310069934220363, + "learning_rate": 4.994685297288171e-06, + "loss": 0.6412, + "step": 567 + }, + { + "epoch": 0.15080313288198594, + "grad_norm": 0.3323823688607964, + "learning_rate": 4.994662519990533e-06, + "loss": 0.6277, + "step": 568 + }, + { + "epoch": 0.15106863135536971, + "grad_norm": 0.33134140774263565, + "learning_rate": 4.9946396940408674e-06, + "loss": 0.6284, + "step": 569 + }, + { + "epoch": 0.1513341298287535, + "grad_norm": 0.3366495795396674, + "learning_rate": 4.994616819439621e-06, + "loss": 0.6159, + "step": 570 + }, + { + "epoch": 0.15159962830213727, + "grad_norm": 0.34612904693930246, + "learning_rate": 4.994593896187238e-06, + "loss": 0.6755, + "step": 571 + }, + { + "epoch": 0.15186512677552105, + "grad_norm": 0.3325808273913825, + "learning_rate": 4.9945709242841656e-06, + "loss": 0.6744, + "step": 572 + }, + { + "epoch": 0.1521306252489048, + "grad_norm": 0.33475212875983545, + "learning_rate": 4.994547903730853e-06, + "loss": 0.6713, + "step": 573 + }, + { + "epoch": 0.15239612372228858, + "grad_norm": 0.3419898738743717, + "learning_rate": 4.994524834527748e-06, + "loss": 0.6991, + "step": 574 + }, + { + "epoch": 0.15266162219567236, + "grad_norm": 0.3359194326215708, + "learning_rate": 4.994501716675303e-06, + "loss": 0.635, + "step": 575 + }, + { + "epoch": 0.15292712066905614, + "grad_norm": 0.3310403527739758, + "learning_rate": 4.994478550173965e-06, + "loss": 0.6285, + "step": 576 + }, + { + "epoch": 0.15319261914243992, + "grad_norm": 0.32681290994687995, + "learning_rate": 4.994455335024188e-06, + "loss": 0.6506, + "step": 577 + }, + { + "epoch": 0.1534581176158237, + "grad_norm": 0.335434627448189, + "learning_rate": 4.9944320712264254e-06, + "loss": 0.6811, + "step": 578 + }, + { + "epoch": 0.15372361608920748, + "grad_norm": 0.3315355703841914, + "learning_rate": 4.994408758781129e-06, + "loss": 0.6841, + "step": 579 + }, + { + "epoch": 0.15398911456259126, + "grad_norm": 0.453893517786475, + "learning_rate": 4.994385397688756e-06, + "loss": 0.5861, + "step": 580 + }, + { + "epoch": 0.15425461303597504, + "grad_norm": 0.35520556295894407, + "learning_rate": 4.994361987949759e-06, + "loss": 0.6726, + "step": 581 + }, + { + "epoch": 0.15452011150935882, + "grad_norm": 0.3597665130331063, + "learning_rate": 4.994338529564597e-06, + "loss": 0.6865, + "step": 582 + }, + { + "epoch": 0.1547856099827426, + "grad_norm": 0.3598421860319184, + "learning_rate": 4.994315022533725e-06, + "loss": 0.6689, + "step": 583 + }, + { + "epoch": 0.15505110845612638, + "grad_norm": 0.3335391109966114, + "learning_rate": 4.994291466857604e-06, + "loss": 0.6286, + "step": 584 + }, + { + "epoch": 0.15531660692951016, + "grad_norm": 0.35510010063761116, + "learning_rate": 4.994267862536691e-06, + "loss": 0.6747, + "step": 585 + }, + { + "epoch": 0.15558210540289394, + "grad_norm": 0.345721931026568, + "learning_rate": 4.994244209571449e-06, + "loss": 0.6901, + "step": 586 + }, + { + "epoch": 0.15584760387627772, + "grad_norm": 0.3329197069443996, + "learning_rate": 4.994220507962338e-06, + "loss": 0.6862, + "step": 587 + }, + { + "epoch": 0.1561131023496615, + "grad_norm": 0.3260484470975118, + "learning_rate": 4.99419675770982e-06, + "loss": 0.6617, + "step": 588 + }, + { + "epoch": 0.15637860082304528, + "grad_norm": 0.33040165821619644, + "learning_rate": 4.9941729588143586e-06, + "loss": 0.6654, + "step": 589 + }, + { + "epoch": 0.15664409929642906, + "grad_norm": 0.3479610458896437, + "learning_rate": 4.994149111276417e-06, + "loss": 0.6471, + "step": 590 + }, + { + "epoch": 0.15690959776981284, + "grad_norm": 0.3371334049835157, + "learning_rate": 4.994125215096462e-06, + "loss": 0.6469, + "step": 591 + }, + { + "epoch": 0.1571750962431966, + "grad_norm": 0.3356169532422175, + "learning_rate": 4.994101270274957e-06, + "loss": 0.6708, + "step": 592 + }, + { + "epoch": 0.15744059471658037, + "grad_norm": 0.33618341704576754, + "learning_rate": 4.994077276812372e-06, + "loss": 0.6622, + "step": 593 + }, + { + "epoch": 0.15770609318996415, + "grad_norm": 0.33281710205959186, + "learning_rate": 4.994053234709173e-06, + "loss": 0.6735, + "step": 594 + }, + { + "epoch": 0.15797159166334793, + "grad_norm": 0.3571218236860454, + "learning_rate": 4.99402914396583e-06, + "loss": 0.6611, + "step": 595 + }, + { + "epoch": 0.1582370901367317, + "grad_norm": 0.3339588523505776, + "learning_rate": 4.994005004582811e-06, + "loss": 0.6447, + "step": 596 + }, + { + "epoch": 0.15850258861011549, + "grad_norm": 0.3199764131162705, + "learning_rate": 4.993980816560589e-06, + "loss": 0.6297, + "step": 597 + }, + { + "epoch": 0.15876808708349927, + "grad_norm": 0.37021465038246504, + "learning_rate": 4.993956579899635e-06, + "loss": 0.6348, + "step": 598 + }, + { + "epoch": 0.15903358555688304, + "grad_norm": 0.3270374220899211, + "learning_rate": 4.993932294600421e-06, + "loss": 0.6613, + "step": 599 + }, + { + "epoch": 0.15929908403026682, + "grad_norm": 0.33111837493896173, + "learning_rate": 4.9939079606634214e-06, + "loss": 0.6502, + "step": 600 + }, + { + "epoch": 0.1595645825036506, + "grad_norm": 0.3311759761687824, + "learning_rate": 4.99388357808911e-06, + "loss": 0.6713, + "step": 601 + }, + { + "epoch": 0.15983008097703438, + "grad_norm": 0.3365585056743753, + "learning_rate": 4.993859146877963e-06, + "loss": 0.653, + "step": 602 + }, + { + "epoch": 0.16009557945041816, + "grad_norm": 0.34668796669168633, + "learning_rate": 4.993834667030456e-06, + "loss": 0.7061, + "step": 603 + }, + { + "epoch": 0.16036107792380194, + "grad_norm": 0.32664222917788455, + "learning_rate": 4.993810138547067e-06, + "loss": 0.6335, + "step": 604 + }, + { + "epoch": 0.16062657639718572, + "grad_norm": 0.33890684011208744, + "learning_rate": 4.993785561428275e-06, + "loss": 0.6339, + "step": 605 + }, + { + "epoch": 0.1608920748705695, + "grad_norm": 0.35136215572926227, + "learning_rate": 4.993760935674559e-06, + "loss": 0.6745, + "step": 606 + }, + { + "epoch": 0.16115757334395328, + "grad_norm": 0.3203309936280434, + "learning_rate": 4.993736261286399e-06, + "loss": 0.6084, + "step": 607 + }, + { + "epoch": 0.16142307181733706, + "grad_norm": 0.3452582678950179, + "learning_rate": 4.993711538264275e-06, + "loss": 0.6735, + "step": 608 + }, + { + "epoch": 0.16168857029072084, + "grad_norm": 0.3354994569214719, + "learning_rate": 4.993686766608672e-06, + "loss": 0.6628, + "step": 609 + }, + { + "epoch": 0.16195406876410462, + "grad_norm": 0.334237430848364, + "learning_rate": 4.99366194632007e-06, + "loss": 0.6781, + "step": 610 + }, + { + "epoch": 0.16221956723748837, + "grad_norm": 0.3313359117252434, + "learning_rate": 4.993637077398955e-06, + "loss": 0.6456, + "step": 611 + }, + { + "epoch": 0.16248506571087215, + "grad_norm": 0.34060902405580806, + "learning_rate": 4.993612159845812e-06, + "loss": 0.6532, + "step": 612 + }, + { + "epoch": 0.16275056418425593, + "grad_norm": 0.3255501853576438, + "learning_rate": 4.993587193661126e-06, + "loss": 0.606, + "step": 613 + }, + { + "epoch": 0.1630160626576397, + "grad_norm": 0.3507908793390629, + "learning_rate": 4.993562178845384e-06, + "loss": 0.6698, + "step": 614 + }, + { + "epoch": 0.1632815611310235, + "grad_norm": 0.34343872187243774, + "learning_rate": 4.993537115399075e-06, + "loss": 0.6901, + "step": 615 + }, + { + "epoch": 0.16354705960440727, + "grad_norm": 0.33413101033583026, + "learning_rate": 4.993512003322687e-06, + "loss": 0.6741, + "step": 616 + }, + { + "epoch": 0.16381255807779105, + "grad_norm": 0.34354965288666184, + "learning_rate": 4.9934868426167095e-06, + "loss": 0.6784, + "step": 617 + }, + { + "epoch": 0.16407805655117483, + "grad_norm": 0.34372422112041834, + "learning_rate": 4.993461633281633e-06, + "loss": 0.6296, + "step": 618 + }, + { + "epoch": 0.1643435550245586, + "grad_norm": 0.3334250118427838, + "learning_rate": 4.993436375317951e-06, + "loss": 0.5963, + "step": 619 + }, + { + "epoch": 0.1646090534979424, + "grad_norm": 0.37412542083164385, + "learning_rate": 4.993411068726153e-06, + "loss": 0.6439, + "step": 620 + }, + { + "epoch": 0.16487455197132617, + "grad_norm": 0.34083625580390686, + "learning_rate": 4.993385713506735e-06, + "loss": 0.6488, + "step": 621 + }, + { + "epoch": 0.16514005044470995, + "grad_norm": 0.32910774835941825, + "learning_rate": 4.9933603096601904e-06, + "loss": 0.636, + "step": 622 + }, + { + "epoch": 0.16540554891809373, + "grad_norm": 0.38172097171318353, + "learning_rate": 4.993334857187015e-06, + "loss": 0.6628, + "step": 623 + }, + { + "epoch": 0.1656710473914775, + "grad_norm": 0.37361327501254793, + "learning_rate": 4.993309356087706e-06, + "loss": 0.6785, + "step": 624 + }, + { + "epoch": 0.16593654586486128, + "grad_norm": 0.3354208369872467, + "learning_rate": 4.993283806362759e-06, + "loss": 0.688, + "step": 625 + }, + { + "epoch": 0.16620204433824506, + "grad_norm": 0.33450601458476165, + "learning_rate": 4.9932582080126734e-06, + "loss": 0.7019, + "step": 626 + }, + { + "epoch": 0.16646754281162884, + "grad_norm": 0.4924071901734736, + "learning_rate": 4.9932325610379485e-06, + "loss": 0.6463, + "step": 627 + }, + { + "epoch": 0.16673304128501262, + "grad_norm": 0.3301536451004917, + "learning_rate": 4.993206865439084e-06, + "loss": 0.6683, + "step": 628 + }, + { + "epoch": 0.1669985397583964, + "grad_norm": 0.35214494911561767, + "learning_rate": 4.993181121216582e-06, + "loss": 0.6737, + "step": 629 + }, + { + "epoch": 0.16726403823178015, + "grad_norm": 0.3996492927577036, + "learning_rate": 4.9931553283709425e-06, + "loss": 0.6626, + "step": 630 + }, + { + "epoch": 0.16752953670516393, + "grad_norm": 0.3472386845192621, + "learning_rate": 4.99312948690267e-06, + "loss": 0.647, + "step": 631 + }, + { + "epoch": 0.1677950351785477, + "grad_norm": 0.3284775082396095, + "learning_rate": 4.993103596812269e-06, + "loss": 0.6481, + "step": 632 + }, + { + "epoch": 0.1680605336519315, + "grad_norm": 0.3420332625692997, + "learning_rate": 4.993077658100243e-06, + "loss": 0.692, + "step": 633 + }, + { + "epoch": 0.16832603212531527, + "grad_norm": 0.3747111543822854, + "learning_rate": 4.993051670767098e-06, + "loss": 0.6747, + "step": 634 + }, + { + "epoch": 0.16859153059869905, + "grad_norm": 0.34727129132010565, + "learning_rate": 4.993025634813342e-06, + "loss": 0.6625, + "step": 635 + }, + { + "epoch": 0.16885702907208283, + "grad_norm": 0.33104329412752553, + "learning_rate": 4.992999550239482e-06, + "loss": 0.674, + "step": 636 + }, + { + "epoch": 0.1691225275454666, + "grad_norm": 0.34417715343099703, + "learning_rate": 4.992973417046028e-06, + "loss": 0.6273, + "step": 637 + }, + { + "epoch": 0.1693880260188504, + "grad_norm": 0.389161476418092, + "learning_rate": 4.992947235233488e-06, + "loss": 0.6468, + "step": 638 + }, + { + "epoch": 0.16965352449223417, + "grad_norm": 0.3382559855533501, + "learning_rate": 4.992921004802372e-06, + "loss": 0.5964, + "step": 639 + }, + { + "epoch": 0.16991902296561795, + "grad_norm": 0.34217674700171413, + "learning_rate": 4.992894725753193e-06, + "loss": 0.6689, + "step": 640 + }, + { + "epoch": 0.17018452143900173, + "grad_norm": 0.39400178355254273, + "learning_rate": 4.9928683980864634e-06, + "loss": 0.6622, + "step": 641 + }, + { + "epoch": 0.1704500199123855, + "grad_norm": 0.42784826089298833, + "learning_rate": 4.992842021802697e-06, + "loss": 0.6255, + "step": 642 + }, + { + "epoch": 0.1707155183857693, + "grad_norm": 0.33397660471822366, + "learning_rate": 4.992815596902407e-06, + "loss": 0.6612, + "step": 643 + }, + { + "epoch": 0.17098101685915307, + "grad_norm": 0.3660443714679061, + "learning_rate": 4.99278912338611e-06, + "loss": 0.66, + "step": 644 + }, + { + "epoch": 0.17124651533253685, + "grad_norm": 0.3506234120068022, + "learning_rate": 4.992762601254322e-06, + "loss": 0.6927, + "step": 645 + }, + { + "epoch": 0.17151201380592063, + "grad_norm": 0.4752779091611214, + "learning_rate": 4.992736030507559e-06, + "loss": 0.6296, + "step": 646 + }, + { + "epoch": 0.1717775122793044, + "grad_norm": 0.34570403940780386, + "learning_rate": 4.99270941114634e-06, + "loss": 0.6307, + "step": 647 + }, + { + "epoch": 0.17204301075268819, + "grad_norm": 0.33898176223347, + "learning_rate": 4.992682743171185e-06, + "loss": 0.6862, + "step": 648 + }, + { + "epoch": 0.17230850922607194, + "grad_norm": 0.3527315562887804, + "learning_rate": 4.992656026582613e-06, + "loss": 0.6435, + "step": 649 + }, + { + "epoch": 0.17257400769945572, + "grad_norm": 0.5591109976004349, + "learning_rate": 4.992629261381146e-06, + "loss": 0.6455, + "step": 650 + }, + { + "epoch": 0.1728395061728395, + "grad_norm": 0.3511782949125343, + "learning_rate": 4.992602447567304e-06, + "loss": 0.6937, + "step": 651 + }, + { + "epoch": 0.17310500464622328, + "grad_norm": 0.3660381170727164, + "learning_rate": 4.992575585141612e-06, + "loss": 0.6492, + "step": 652 + }, + { + "epoch": 0.17337050311960706, + "grad_norm": 0.3770987884631617, + "learning_rate": 4.992548674104594e-06, + "loss": 0.6823, + "step": 653 + }, + { + "epoch": 0.17363600159299084, + "grad_norm": 0.3761289256131638, + "learning_rate": 4.992521714456774e-06, + "loss": 0.6396, + "step": 654 + }, + { + "epoch": 0.17390150006637461, + "grad_norm": 0.3475518003285641, + "learning_rate": 4.992494706198678e-06, + "loss": 0.6428, + "step": 655 + }, + { + "epoch": 0.1741669985397584, + "grad_norm": 0.338760464616784, + "learning_rate": 4.992467649330831e-06, + "loss": 0.6868, + "step": 656 + }, + { + "epoch": 0.17443249701314217, + "grad_norm": 0.329468348433597, + "learning_rate": 4.992440543853763e-06, + "loss": 0.6339, + "step": 657 + }, + { + "epoch": 0.17469799548652595, + "grad_norm": 0.3520208363642314, + "learning_rate": 4.992413389768001e-06, + "loss": 0.6196, + "step": 658 + }, + { + "epoch": 0.17496349395990973, + "grad_norm": 0.3526400854331627, + "learning_rate": 4.992386187074078e-06, + "loss": 0.6451, + "step": 659 + }, + { + "epoch": 0.1752289924332935, + "grad_norm": 0.35797450081758125, + "learning_rate": 4.992358935772519e-06, + "loss": 0.6789, + "step": 660 + }, + { + "epoch": 0.1754944909066773, + "grad_norm": 0.3350384684829453, + "learning_rate": 4.99233163586386e-06, + "loss": 0.6483, + "step": 661 + }, + { + "epoch": 0.17575998938006107, + "grad_norm": 0.33898553335758785, + "learning_rate": 4.992304287348632e-06, + "loss": 0.6805, + "step": 662 + }, + { + "epoch": 0.17602548785344485, + "grad_norm": 0.33291920836012545, + "learning_rate": 4.992276890227368e-06, + "loss": 0.6401, + "step": 663 + }, + { + "epoch": 0.17629098632682863, + "grad_norm": 0.3507502389049936, + "learning_rate": 4.992249444500601e-06, + "loss": 0.6369, + "step": 664 + }, + { + "epoch": 0.1765564848002124, + "grad_norm": 0.3528312083787476, + "learning_rate": 4.992221950168869e-06, + "loss": 0.6696, + "step": 665 + }, + { + "epoch": 0.1768219832735962, + "grad_norm": 0.33847661618514874, + "learning_rate": 4.992194407232708e-06, + "loss": 0.6583, + "step": 666 + }, + { + "epoch": 0.17708748174697994, + "grad_norm": 0.33546638130976375, + "learning_rate": 4.9921668156926526e-06, + "loss": 0.6779, + "step": 667 + }, + { + "epoch": 0.17735298022036372, + "grad_norm": 0.34310485759623466, + "learning_rate": 4.992139175549243e-06, + "loss": 0.678, + "step": 668 + }, + { + "epoch": 0.1776184786937475, + "grad_norm": 0.35026130026948477, + "learning_rate": 4.992111486803017e-06, + "loss": 0.6064, + "step": 669 + }, + { + "epoch": 0.17788397716713128, + "grad_norm": 0.3408774225154651, + "learning_rate": 4.992083749454516e-06, + "loss": 0.6478, + "step": 670 + }, + { + "epoch": 0.17814947564051506, + "grad_norm": 0.34073396016254826, + "learning_rate": 4.99205596350428e-06, + "loss": 0.6463, + "step": 671 + }, + { + "epoch": 0.17841497411389884, + "grad_norm": 0.35756027313020583, + "learning_rate": 4.992028128952851e-06, + "loss": 0.6413, + "step": 672 + }, + { + "epoch": 0.17868047258728262, + "grad_norm": 0.3336875362140239, + "learning_rate": 4.992000245800772e-06, + "loss": 0.6461, + "step": 673 + }, + { + "epoch": 0.1789459710606664, + "grad_norm": 0.3445936259159989, + "learning_rate": 4.991972314048587e-06, + "loss": 0.6645, + "step": 674 + }, + { + "epoch": 0.17921146953405018, + "grad_norm": 0.33654721153260087, + "learning_rate": 4.99194433369684e-06, + "loss": 0.6289, + "step": 675 + }, + { + "epoch": 0.17947696800743396, + "grad_norm": 0.34037008807256686, + "learning_rate": 4.991916304746077e-06, + "loss": 0.6733, + "step": 676 + }, + { + "epoch": 0.17974246648081774, + "grad_norm": 0.34539272142794913, + "learning_rate": 4.991888227196846e-06, + "loss": 0.6486, + "step": 677 + }, + { + "epoch": 0.18000796495420152, + "grad_norm": 0.3306497540608407, + "learning_rate": 4.9918601010496925e-06, + "loss": 0.6784, + "step": 678 + }, + { + "epoch": 0.1802734634275853, + "grad_norm": 0.3453371982415876, + "learning_rate": 4.991831926305166e-06, + "loss": 0.6757, + "step": 679 + }, + { + "epoch": 0.18053896190096907, + "grad_norm": 0.3506783353640043, + "learning_rate": 4.991803702963815e-06, + "loss": 0.6471, + "step": 680 + }, + { + "epoch": 0.18080446037435285, + "grad_norm": 0.3460571746922703, + "learning_rate": 4.9917754310261925e-06, + "loss": 0.6468, + "step": 681 + }, + { + "epoch": 0.18106995884773663, + "grad_norm": 0.3455399902951912, + "learning_rate": 4.991747110492847e-06, + "loss": 0.667, + "step": 682 + }, + { + "epoch": 0.1813354573211204, + "grad_norm": 0.3335499569300848, + "learning_rate": 4.991718741364333e-06, + "loss": 0.6478, + "step": 683 + }, + { + "epoch": 0.1816009557945042, + "grad_norm": 0.3570524927092075, + "learning_rate": 4.991690323641202e-06, + "loss": 0.6727, + "step": 684 + }, + { + "epoch": 0.18186645426788797, + "grad_norm": 0.3401655496067992, + "learning_rate": 4.991661857324009e-06, + "loss": 0.6437, + "step": 685 + }, + { + "epoch": 0.18213195274127172, + "grad_norm": 0.34351058102025905, + "learning_rate": 4.991633342413309e-06, + "loss": 0.6719, + "step": 686 + }, + { + "epoch": 0.1823974512146555, + "grad_norm": 0.33841071957533675, + "learning_rate": 4.991604778909659e-06, + "loss": 0.6371, + "step": 687 + }, + { + "epoch": 0.18266294968803928, + "grad_norm": 0.3420778911786694, + "learning_rate": 4.991576166813614e-06, + "loss": 0.6825, + "step": 688 + }, + { + "epoch": 0.18292844816142306, + "grad_norm": 0.3654088014459768, + "learning_rate": 4.991547506125735e-06, + "loss": 0.705, + "step": 689 + }, + { + "epoch": 0.18319394663480684, + "grad_norm": 0.34959108681412265, + "learning_rate": 4.991518796846578e-06, + "loss": 0.6296, + "step": 690 + }, + { + "epoch": 0.18345944510819062, + "grad_norm": 0.33902705185959686, + "learning_rate": 4.9914900389767044e-06, + "loss": 0.6555, + "step": 691 + }, + { + "epoch": 0.1837249435815744, + "grad_norm": 0.35297183061504295, + "learning_rate": 4.991461232516675e-06, + "loss": 0.6221, + "step": 692 + }, + { + "epoch": 0.18399044205495818, + "grad_norm": 0.3469568147576771, + "learning_rate": 4.991432377467052e-06, + "loss": 0.6346, + "step": 693 + }, + { + "epoch": 0.18425594052834196, + "grad_norm": 0.3386616592406038, + "learning_rate": 4.991403473828397e-06, + "loss": 0.6138, + "step": 694 + }, + { + "epoch": 0.18452143900172574, + "grad_norm": 0.3407264654457628, + "learning_rate": 4.9913745216012735e-06, + "loss": 0.6777, + "step": 695 + }, + { + "epoch": 0.18478693747510952, + "grad_norm": 0.345545417846684, + "learning_rate": 4.991345520786248e-06, + "loss": 0.6874, + "step": 696 + }, + { + "epoch": 0.1850524359484933, + "grad_norm": 0.34674174814963427, + "learning_rate": 4.991316471383885e-06, + "loss": 0.6467, + "step": 697 + }, + { + "epoch": 0.18531793442187708, + "grad_norm": 0.3454970518855923, + "learning_rate": 4.99128737339475e-06, + "loss": 0.6797, + "step": 698 + }, + { + "epoch": 0.18558343289526086, + "grad_norm": 0.33639062026906313, + "learning_rate": 4.991258226819412e-06, + "loss": 0.6562, + "step": 699 + }, + { + "epoch": 0.18584893136864464, + "grad_norm": 0.35528926257682564, + "learning_rate": 4.991229031658439e-06, + "loss": 0.5918, + "step": 700 + }, + { + "epoch": 0.18611442984202842, + "grad_norm": 0.361808065424919, + "learning_rate": 4.9911997879124e-06, + "loss": 0.675, + "step": 701 + }, + { + "epoch": 0.1863799283154122, + "grad_norm": 0.3459245861361517, + "learning_rate": 4.991170495581866e-06, + "loss": 0.682, + "step": 702 + }, + { + "epoch": 0.18664542678879598, + "grad_norm": 0.3462120423392343, + "learning_rate": 4.991141154667408e-06, + "loss": 0.6423, + "step": 703 + }, + { + "epoch": 0.18691092526217976, + "grad_norm": 0.3436493823715349, + "learning_rate": 4.991111765169599e-06, + "loss": 0.6518, + "step": 704 + }, + { + "epoch": 0.1871764237355635, + "grad_norm": 0.35167741744699027, + "learning_rate": 4.99108232708901e-06, + "loss": 0.6606, + "step": 705 + }, + { + "epoch": 0.1874419222089473, + "grad_norm": 0.34327635583975075, + "learning_rate": 4.991052840426216e-06, + "loss": 0.639, + "step": 706 + }, + { + "epoch": 0.18770742068233107, + "grad_norm": 0.3453640625141677, + "learning_rate": 4.991023305181793e-06, + "loss": 0.6354, + "step": 707 + }, + { + "epoch": 0.18797291915571485, + "grad_norm": 0.3567185085773874, + "learning_rate": 4.990993721356317e-06, + "loss": 0.6372, + "step": 708 + }, + { + "epoch": 0.18823841762909863, + "grad_norm": 0.364478705086117, + "learning_rate": 4.990964088950364e-06, + "loss": 0.6461, + "step": 709 + }, + { + "epoch": 0.1885039161024824, + "grad_norm": 0.3431707251325505, + "learning_rate": 4.990934407964511e-06, + "loss": 0.6308, + "step": 710 + }, + { + "epoch": 0.18876941457586618, + "grad_norm": 0.3661209655624326, + "learning_rate": 4.990904678399341e-06, + "loss": 0.6622, + "step": 711 + }, + { + "epoch": 0.18903491304924996, + "grad_norm": 0.3431422937762494, + "learning_rate": 4.9908749002554295e-06, + "loss": 0.6791, + "step": 712 + }, + { + "epoch": 0.18930041152263374, + "grad_norm": 0.35905228630684316, + "learning_rate": 4.990845073533358e-06, + "loss": 0.68, + "step": 713 + }, + { + "epoch": 0.18956590999601752, + "grad_norm": 0.3461910167092669, + "learning_rate": 4.99081519823371e-06, + "loss": 0.6636, + "step": 714 + }, + { + "epoch": 0.1898314084694013, + "grad_norm": 0.3386006082998523, + "learning_rate": 4.990785274357067e-06, + "loss": 0.6434, + "step": 715 + }, + { + "epoch": 0.19009690694278508, + "grad_norm": 0.3453548364134816, + "learning_rate": 4.990755301904012e-06, + "loss": 0.6684, + "step": 716 + }, + { + "epoch": 0.19036240541616886, + "grad_norm": 0.35014637101935864, + "learning_rate": 4.9907252808751304e-06, + "loss": 0.6771, + "step": 717 + }, + { + "epoch": 0.19062790388955264, + "grad_norm": 0.35004893115165964, + "learning_rate": 4.990695211271008e-06, + "loss": 0.6489, + "step": 718 + }, + { + "epoch": 0.19089340236293642, + "grad_norm": 0.342933936524258, + "learning_rate": 4.990665093092231e-06, + "loss": 0.6549, + "step": 719 + }, + { + "epoch": 0.1911589008363202, + "grad_norm": 0.3637709812659784, + "learning_rate": 4.990634926339385e-06, + "loss": 0.6753, + "step": 720 + }, + { + "epoch": 0.19142439930970398, + "grad_norm": 0.34613321478661163, + "learning_rate": 4.99060471101306e-06, + "loss": 0.644, + "step": 721 + }, + { + "epoch": 0.19168989778308776, + "grad_norm": 0.36246048423430655, + "learning_rate": 4.990574447113847e-06, + "loss": 0.6373, + "step": 722 + }, + { + "epoch": 0.19195539625647154, + "grad_norm": 0.3430804737712154, + "learning_rate": 4.990544134642333e-06, + "loss": 0.6776, + "step": 723 + }, + { + "epoch": 0.1922208947298553, + "grad_norm": 0.3704339499448267, + "learning_rate": 4.9905137735991104e-06, + "loss": 0.6225, + "step": 724 + }, + { + "epoch": 0.19248639320323907, + "grad_norm": 0.3585600683814105, + "learning_rate": 4.990483363984771e-06, + "loss": 0.6703, + "step": 725 + }, + { + "epoch": 0.19275189167662285, + "grad_norm": 0.35002134355805975, + "learning_rate": 4.990452905799909e-06, + "loss": 0.6273, + "step": 726 + }, + { + "epoch": 0.19301739015000663, + "grad_norm": 0.358388032460886, + "learning_rate": 4.990422399045117e-06, + "loss": 0.6375, + "step": 727 + }, + { + "epoch": 0.1932828886233904, + "grad_norm": 0.34611033141067893, + "learning_rate": 4.990391843720992e-06, + "loss": 0.6412, + "step": 728 + }, + { + "epoch": 0.1935483870967742, + "grad_norm": 0.3553189733335606, + "learning_rate": 4.990361239828127e-06, + "loss": 0.6648, + "step": 729 + }, + { + "epoch": 0.19381388557015797, + "grad_norm": 0.340952722084266, + "learning_rate": 4.990330587367121e-06, + "loss": 0.6608, + "step": 730 + }, + { + "epoch": 0.19407938404354175, + "grad_norm": 0.43374978272153053, + "learning_rate": 4.990299886338571e-06, + "loss": 0.6073, + "step": 731 + }, + { + "epoch": 0.19434488251692553, + "grad_norm": 0.33795144738861543, + "learning_rate": 4.990269136743076e-06, + "loss": 0.6459, + "step": 732 + }, + { + "epoch": 0.1946103809903093, + "grad_norm": 0.37392102483203854, + "learning_rate": 4.990238338581236e-06, + "loss": 0.699, + "step": 733 + }, + { + "epoch": 0.19487587946369309, + "grad_norm": 0.3760566762842307, + "learning_rate": 4.99020749185365e-06, + "loss": 0.662, + "step": 734 + }, + { + "epoch": 0.19514137793707687, + "grad_norm": 0.34528152011058866, + "learning_rate": 4.9901765965609225e-06, + "loss": 0.6681, + "step": 735 + }, + { + "epoch": 0.19540687641046064, + "grad_norm": 0.344179669061022, + "learning_rate": 4.990145652703653e-06, + "loss": 0.6497, + "step": 736 + }, + { + "epoch": 0.19567237488384442, + "grad_norm": 0.34965044920948035, + "learning_rate": 4.990114660282447e-06, + "loss": 0.6755, + "step": 737 + }, + { + "epoch": 0.1959378733572282, + "grad_norm": 0.3535454904841323, + "learning_rate": 4.990083619297908e-06, + "loss": 0.6609, + "step": 738 + }, + { + "epoch": 0.19620337183061198, + "grad_norm": 0.34515881193454195, + "learning_rate": 4.990052529750642e-06, + "loss": 0.6535, + "step": 739 + }, + { + "epoch": 0.19646887030399576, + "grad_norm": 0.35290041799419597, + "learning_rate": 4.990021391641255e-06, + "loss": 0.6688, + "step": 740 + }, + { + "epoch": 0.19673436877737954, + "grad_norm": 0.35948043134213553, + "learning_rate": 4.989990204970354e-06, + "loss": 0.662, + "step": 741 + }, + { + "epoch": 0.19699986725076332, + "grad_norm": 0.34688437244572223, + "learning_rate": 4.989958969738548e-06, + "loss": 0.6561, + "step": 742 + }, + { + "epoch": 0.19726536572414707, + "grad_norm": 0.3423806942407409, + "learning_rate": 4.989927685946444e-06, + "loss": 0.6564, + "step": 743 + }, + { + "epoch": 0.19753086419753085, + "grad_norm": 0.3671254216720733, + "learning_rate": 4.989896353594655e-06, + "loss": 0.6922, + "step": 744 + }, + { + "epoch": 0.19779636267091463, + "grad_norm": 0.3426208067836131, + "learning_rate": 4.989864972683791e-06, + "loss": 0.6478, + "step": 745 + }, + { + "epoch": 0.1980618611442984, + "grad_norm": 0.37198162066372836, + "learning_rate": 4.989833543214463e-06, + "loss": 0.6948, + "step": 746 + }, + { + "epoch": 0.1983273596176822, + "grad_norm": 0.34879592548399513, + "learning_rate": 4.989802065187285e-06, + "loss": 0.664, + "step": 747 + }, + { + "epoch": 0.19859285809106597, + "grad_norm": 0.34837159432268605, + "learning_rate": 4.989770538602871e-06, + "loss": 0.6731, + "step": 748 + }, + { + "epoch": 0.19885835656444975, + "grad_norm": 0.365626296978299, + "learning_rate": 4.989738963461835e-06, + "loss": 0.66, + "step": 749 + }, + { + "epoch": 0.19912385503783353, + "grad_norm": 0.3601140620745634, + "learning_rate": 4.989707339764794e-06, + "loss": 0.6238, + "step": 750 + }, + { + "epoch": 0.1993893535112173, + "grad_norm": 0.34474706919274484, + "learning_rate": 4.989675667512363e-06, + "loss": 0.6743, + "step": 751 + }, + { + "epoch": 0.1996548519846011, + "grad_norm": 0.3540948265947992, + "learning_rate": 4.989643946705161e-06, + "loss": 0.6437, + "step": 752 + }, + { + "epoch": 0.19992035045798487, + "grad_norm": 0.3569867242462055, + "learning_rate": 4.989612177343807e-06, + "loss": 0.6532, + "step": 753 + }, + { + "epoch": 0.20018584893136865, + "grad_norm": 0.3641200112332836, + "learning_rate": 4.98958035942892e-06, + "loss": 0.66, + "step": 754 + }, + { + "epoch": 0.20045134740475243, + "grad_norm": 0.3441067788826454, + "learning_rate": 4.98954849296112e-06, + "loss": 0.6535, + "step": 755 + }, + { + "epoch": 0.2007168458781362, + "grad_norm": 0.3451350430287015, + "learning_rate": 4.989516577941029e-06, + "loss": 0.6891, + "step": 756 + }, + { + "epoch": 0.20098234435152, + "grad_norm": 0.35665774669038647, + "learning_rate": 4.989484614369269e-06, + "loss": 0.6535, + "step": 757 + }, + { + "epoch": 0.20124784282490377, + "grad_norm": 0.3735976922013897, + "learning_rate": 4.989452602246464e-06, + "loss": 0.6074, + "step": 758 + }, + { + "epoch": 0.20151334129828755, + "grad_norm": 0.35283173086183617, + "learning_rate": 4.989420541573238e-06, + "loss": 0.6821, + "step": 759 + }, + { + "epoch": 0.20177883977167133, + "grad_norm": 0.34824192261422704, + "learning_rate": 4.989388432350216e-06, + "loss": 0.6458, + "step": 760 + }, + { + "epoch": 0.20204433824505508, + "grad_norm": 0.355889090178465, + "learning_rate": 4.989356274578025e-06, + "loss": 0.6517, + "step": 761 + }, + { + "epoch": 0.20230983671843886, + "grad_norm": 0.3392175251337885, + "learning_rate": 4.989324068257292e-06, + "loss": 0.6519, + "step": 762 + }, + { + "epoch": 0.20257533519182264, + "grad_norm": 0.3407751555251443, + "learning_rate": 4.989291813388644e-06, + "loss": 0.6643, + "step": 763 + }, + { + "epoch": 0.20284083366520642, + "grad_norm": 0.3504841174357293, + "learning_rate": 4.98925950997271e-06, + "loss": 0.6825, + "step": 764 + }, + { + "epoch": 0.2031063321385902, + "grad_norm": 0.35525779440856015, + "learning_rate": 4.989227158010123e-06, + "loss": 0.6477, + "step": 765 + }, + { + "epoch": 0.20337183061197397, + "grad_norm": 0.34858526803908774, + "learning_rate": 4.9891947575015095e-06, + "loss": 0.6434, + "step": 766 + }, + { + "epoch": 0.20363732908535775, + "grad_norm": 0.33648804934110765, + "learning_rate": 4.989162308447505e-06, + "loss": 0.6159, + "step": 767 + }, + { + "epoch": 0.20390282755874153, + "grad_norm": 0.3568903495343007, + "learning_rate": 4.98912981084874e-06, + "loss": 0.6558, + "step": 768 + }, + { + "epoch": 0.2041683260321253, + "grad_norm": 0.36416058886348546, + "learning_rate": 4.98909726470585e-06, + "loss": 0.6623, + "step": 769 + }, + { + "epoch": 0.2044338245055091, + "grad_norm": 0.35405775315654897, + "learning_rate": 4.989064670019469e-06, + "loss": 0.6569, + "step": 770 + }, + { + "epoch": 0.20469932297889287, + "grad_norm": 0.35255045795860585, + "learning_rate": 4.9890320267902325e-06, + "loss": 0.6332, + "step": 771 + }, + { + "epoch": 0.20496482145227665, + "grad_norm": 0.3606279396471135, + "learning_rate": 4.988999335018777e-06, + "loss": 0.6518, + "step": 772 + }, + { + "epoch": 0.20523031992566043, + "grad_norm": 0.3476308603367875, + "learning_rate": 4.988966594705741e-06, + "loss": 0.6659, + "step": 773 + }, + { + "epoch": 0.2054958183990442, + "grad_norm": 0.3425688513013799, + "learning_rate": 4.988933805851762e-06, + "loss": 0.6932, + "step": 774 + }, + { + "epoch": 0.205761316872428, + "grad_norm": 0.3768492089354647, + "learning_rate": 4.98890096845748e-06, + "loss": 0.6548, + "step": 775 + }, + { + "epoch": 0.20602681534581177, + "grad_norm": 0.34867139824243465, + "learning_rate": 4.988868082523535e-06, + "loss": 0.6545, + "step": 776 + }, + { + "epoch": 0.20629231381919555, + "grad_norm": 0.3618989818520878, + "learning_rate": 4.988835148050568e-06, + "loss": 0.6257, + "step": 777 + }, + { + "epoch": 0.20655781229257933, + "grad_norm": 0.35329490061661384, + "learning_rate": 4.988802165039222e-06, + "loss": 0.6504, + "step": 778 + }, + { + "epoch": 0.2068233107659631, + "grad_norm": 0.34555989750148103, + "learning_rate": 4.988769133490141e-06, + "loss": 0.6103, + "step": 779 + }, + { + "epoch": 0.20708880923934686, + "grad_norm": 0.3431675375187069, + "learning_rate": 4.988736053403968e-06, + "loss": 0.6251, + "step": 780 + }, + { + "epoch": 0.20735430771273064, + "grad_norm": 0.3555657917148854, + "learning_rate": 4.988702924781349e-06, + "loss": 0.6636, + "step": 781 + }, + { + "epoch": 0.20761980618611442, + "grad_norm": 0.36124070993240076, + "learning_rate": 4.988669747622929e-06, + "loss": 0.6601, + "step": 782 + }, + { + "epoch": 0.2078853046594982, + "grad_norm": 0.3477640473044727, + "learning_rate": 4.988636521929355e-06, + "loss": 0.6304, + "step": 783 + }, + { + "epoch": 0.20815080313288198, + "grad_norm": 0.35789720292635246, + "learning_rate": 4.988603247701277e-06, + "loss": 0.6639, + "step": 784 + }, + { + "epoch": 0.20841630160626576, + "grad_norm": 0.35820990851418305, + "learning_rate": 4.9885699249393415e-06, + "loss": 0.631, + "step": 785 + }, + { + "epoch": 0.20868180007964954, + "grad_norm": 0.3823713400187527, + "learning_rate": 4.9885365536442e-06, + "loss": 0.6304, + "step": 786 + }, + { + "epoch": 0.20894729855303332, + "grad_norm": 0.3506296950107694, + "learning_rate": 4.9885031338165025e-06, + "loss": 0.6921, + "step": 787 + }, + { + "epoch": 0.2092127970264171, + "grad_norm": 0.3870833129338183, + "learning_rate": 4.988469665456901e-06, + "loss": 0.6486, + "step": 788 + }, + { + "epoch": 0.20947829549980088, + "grad_norm": 0.347488569930697, + "learning_rate": 4.988436148566047e-06, + "loss": 0.691, + "step": 789 + }, + { + "epoch": 0.20974379397318466, + "grad_norm": 0.358713841867121, + "learning_rate": 4.988402583144597e-06, + "loss": 0.668, + "step": 790 + }, + { + "epoch": 0.21000929244656844, + "grad_norm": 0.35299913733330485, + "learning_rate": 4.988368969193204e-06, + "loss": 0.64, + "step": 791 + }, + { + "epoch": 0.21027479091995221, + "grad_norm": 0.34859776950733684, + "learning_rate": 4.9883353067125225e-06, + "loss": 0.6553, + "step": 792 + }, + { + "epoch": 0.210540289393336, + "grad_norm": 0.34686055149021544, + "learning_rate": 4.98830159570321e-06, + "loss": 0.6258, + "step": 793 + }, + { + "epoch": 0.21080578786671977, + "grad_norm": 0.35949850530490796, + "learning_rate": 4.988267836165924e-06, + "loss": 0.6669, + "step": 794 + }, + { + "epoch": 0.21107128634010355, + "grad_norm": 0.35366808769806707, + "learning_rate": 4.988234028101324e-06, + "loss": 0.6593, + "step": 795 + }, + { + "epoch": 0.21133678481348733, + "grad_norm": 0.3547574223059748, + "learning_rate": 4.988200171510067e-06, + "loss": 0.6633, + "step": 796 + }, + { + "epoch": 0.2116022832868711, + "grad_norm": 0.34981892699610107, + "learning_rate": 4.988166266392815e-06, + "loss": 0.6665, + "step": 797 + }, + { + "epoch": 0.2118677817602549, + "grad_norm": 0.34759057078754807, + "learning_rate": 4.988132312750229e-06, + "loss": 0.6492, + "step": 798 + }, + { + "epoch": 0.21213328023363864, + "grad_norm": 0.3526274083157686, + "learning_rate": 4.98809831058297e-06, + "loss": 0.6691, + "step": 799 + }, + { + "epoch": 0.21239877870702242, + "grad_norm": 0.36962844118627614, + "learning_rate": 4.988064259891703e-06, + "loss": 0.6573, + "step": 800 + }, + { + "epoch": 0.2126642771804062, + "grad_norm": 0.3495562841744903, + "learning_rate": 4.988030160677091e-06, + "loss": 0.6276, + "step": 801 + }, + { + "epoch": 0.21292977565378998, + "grad_norm": 0.35906869205672887, + "learning_rate": 4.987996012939799e-06, + "loss": 0.6419, + "step": 802 + }, + { + "epoch": 0.21319527412717376, + "grad_norm": 0.3545749367955905, + "learning_rate": 4.987961816680493e-06, + "loss": 0.6626, + "step": 803 + }, + { + "epoch": 0.21346077260055754, + "grad_norm": 0.35503241613586267, + "learning_rate": 4.98792757189984e-06, + "loss": 0.6307, + "step": 804 + }, + { + "epoch": 0.21372627107394132, + "grad_norm": 0.3352386894697989, + "learning_rate": 4.987893278598507e-06, + "loss": 0.6316, + "step": 805 + }, + { + "epoch": 0.2139917695473251, + "grad_norm": 0.35190578272262724, + "learning_rate": 4.987858936777165e-06, + "loss": 0.6156, + "step": 806 + }, + { + "epoch": 0.21425726802070888, + "grad_norm": 0.3513033240075226, + "learning_rate": 4.987824546436482e-06, + "loss": 0.6549, + "step": 807 + }, + { + "epoch": 0.21452276649409266, + "grad_norm": 0.34204140216555323, + "learning_rate": 4.987790107577128e-06, + "loss": 0.641, + "step": 808 + }, + { + "epoch": 0.21478826496747644, + "grad_norm": 0.3490871460559712, + "learning_rate": 4.987755620199776e-06, + "loss": 0.6796, + "step": 809 + }, + { + "epoch": 0.21505376344086022, + "grad_norm": 0.35347875353423974, + "learning_rate": 4.987721084305101e-06, + "loss": 0.5716, + "step": 810 + }, + { + "epoch": 0.215319261914244, + "grad_norm": 0.3483940521143135, + "learning_rate": 4.987686499893771e-06, + "loss": 0.6487, + "step": 811 + }, + { + "epoch": 0.21558476038762778, + "grad_norm": 0.37216731718694335, + "learning_rate": 4.987651866966464e-06, + "loss": 0.649, + "step": 812 + }, + { + "epoch": 0.21585025886101156, + "grad_norm": 0.3581584548987224, + "learning_rate": 4.9876171855238554e-06, + "loss": 0.6365, + "step": 813 + }, + { + "epoch": 0.21611575733439534, + "grad_norm": 0.3596825816868976, + "learning_rate": 4.98758245556662e-06, + "loss": 0.648, + "step": 814 + }, + { + "epoch": 0.21638125580777912, + "grad_norm": 0.3480720817879569, + "learning_rate": 4.987547677095437e-06, + "loss": 0.6254, + "step": 815 + }, + { + "epoch": 0.2166467542811629, + "grad_norm": 0.3693355122012089, + "learning_rate": 4.987512850110983e-06, + "loss": 0.6331, + "step": 816 + }, + { + "epoch": 0.21691225275454667, + "grad_norm": 0.3603395444162195, + "learning_rate": 4.987477974613939e-06, + "loss": 0.6283, + "step": 817 + }, + { + "epoch": 0.21717775122793043, + "grad_norm": 0.3737800526817902, + "learning_rate": 4.987443050604983e-06, + "loss": 0.6747, + "step": 818 + }, + { + "epoch": 0.2174432497013142, + "grad_norm": 0.3440344614094484, + "learning_rate": 4.987408078084797e-06, + "loss": 0.6304, + "step": 819 + }, + { + "epoch": 0.21770874817469799, + "grad_norm": 0.3690454107199134, + "learning_rate": 4.987373057054064e-06, + "loss": 0.6687, + "step": 820 + }, + { + "epoch": 0.21797424664808177, + "grad_norm": 0.3678878059485673, + "learning_rate": 4.987337987513467e-06, + "loss": 0.605, + "step": 821 + }, + { + "epoch": 0.21823974512146554, + "grad_norm": 0.34335471426275843, + "learning_rate": 4.987302869463687e-06, + "loss": 0.6352, + "step": 822 + }, + { + "epoch": 0.21850524359484932, + "grad_norm": 0.37952789084664285, + "learning_rate": 4.987267702905413e-06, + "loss": 0.6591, + "step": 823 + }, + { + "epoch": 0.2187707420682331, + "grad_norm": 0.340934874626806, + "learning_rate": 4.987232487839328e-06, + "loss": 0.6322, + "step": 824 + }, + { + "epoch": 0.21903624054161688, + "grad_norm": 0.3592082283497444, + "learning_rate": 4.9871972242661195e-06, + "loss": 0.6554, + "step": 825 + }, + { + "epoch": 0.21930173901500066, + "grad_norm": 0.35312258794824203, + "learning_rate": 4.987161912186476e-06, + "loss": 0.6844, + "step": 826 + }, + { + "epoch": 0.21956723748838444, + "grad_norm": 0.39253561740637266, + "learning_rate": 4.9871265516010854e-06, + "loss": 0.6342, + "step": 827 + }, + { + "epoch": 0.21983273596176822, + "grad_norm": 0.37013801640971983, + "learning_rate": 4.9870911425106375e-06, + "loss": 0.6448, + "step": 828 + }, + { + "epoch": 0.220098234435152, + "grad_norm": 0.37332415971943467, + "learning_rate": 4.987055684915824e-06, + "loss": 0.6408, + "step": 829 + }, + { + "epoch": 0.22036373290853578, + "grad_norm": 0.3391307278832495, + "learning_rate": 4.9870201788173334e-06, + "loss": 0.6455, + "step": 830 + }, + { + "epoch": 0.22062923138191956, + "grad_norm": 0.3462273975496499, + "learning_rate": 4.986984624215861e-06, + "loss": 0.6425, + "step": 831 + }, + { + "epoch": 0.22089472985530334, + "grad_norm": 0.3546835803282198, + "learning_rate": 4.986949021112098e-06, + "loss": 0.6414, + "step": 832 + }, + { + "epoch": 0.22116022832868712, + "grad_norm": 0.35926255487376796, + "learning_rate": 4.986913369506742e-06, + "loss": 0.6453, + "step": 833 + }, + { + "epoch": 0.2214257268020709, + "grad_norm": 0.346809525634212, + "learning_rate": 4.986877669400485e-06, + "loss": 0.6378, + "step": 834 + }, + { + "epoch": 0.22169122527545468, + "grad_norm": 0.3623921896573435, + "learning_rate": 4.9868419207940255e-06, + "loss": 0.7046, + "step": 835 + }, + { + "epoch": 0.22195672374883843, + "grad_norm": 0.3633208444851114, + "learning_rate": 4.986806123688059e-06, + "loss": 0.6769, + "step": 836 + }, + { + "epoch": 0.2222222222222222, + "grad_norm": 0.35061516901527795, + "learning_rate": 4.986770278083284e-06, + "loss": 0.6449, + "step": 837 + }, + { + "epoch": 0.222487720695606, + "grad_norm": 0.3571100916897149, + "learning_rate": 4.986734383980401e-06, + "loss": 0.6548, + "step": 838 + }, + { + "epoch": 0.22275321916898977, + "grad_norm": 0.3632731411894179, + "learning_rate": 4.9866984413801075e-06, + "loss": 0.6793, + "step": 839 + }, + { + "epoch": 0.22301871764237355, + "grad_norm": 0.3657427335552278, + "learning_rate": 4.986662450283107e-06, + "loss": 0.6692, + "step": 840 + }, + { + "epoch": 0.22328421611575733, + "grad_norm": 0.34989572876436803, + "learning_rate": 4.9866264106900995e-06, + "loss": 0.6341, + "step": 841 + }, + { + "epoch": 0.2235497145891411, + "grad_norm": 0.35678405687180387, + "learning_rate": 4.98659032260179e-06, + "loss": 0.6096, + "step": 842 + }, + { + "epoch": 0.2238152130625249, + "grad_norm": 0.34616644977846645, + "learning_rate": 4.986554186018879e-06, + "loss": 0.6152, + "step": 843 + }, + { + "epoch": 0.22408071153590867, + "grad_norm": 0.3472065104355087, + "learning_rate": 4.986518000942075e-06, + "loss": 0.6326, + "step": 844 + }, + { + "epoch": 0.22434621000929245, + "grad_norm": 0.3429874846006786, + "learning_rate": 4.986481767372081e-06, + "loss": 0.6527, + "step": 845 + }, + { + "epoch": 0.22461170848267623, + "grad_norm": 0.3397592737642725, + "learning_rate": 4.986445485309606e-06, + "loss": 0.6397, + "step": 846 + }, + { + "epoch": 0.22487720695606, + "grad_norm": 0.35275167602090624, + "learning_rate": 4.986409154755354e-06, + "loss": 0.6615, + "step": 847 + }, + { + "epoch": 0.22514270542944378, + "grad_norm": 0.3661359137649159, + "learning_rate": 4.9863727757100375e-06, + "loss": 0.6733, + "step": 848 + }, + { + "epoch": 0.22540820390282756, + "grad_norm": 0.3590313219117273, + "learning_rate": 4.986336348174364e-06, + "loss": 0.6185, + "step": 849 + }, + { + "epoch": 0.22567370237621134, + "grad_norm": 0.37040577861478885, + "learning_rate": 4.986299872149043e-06, + "loss": 0.6885, + "step": 850 + }, + { + "epoch": 0.22593920084959512, + "grad_norm": 0.3522247074086537, + "learning_rate": 4.986263347634787e-06, + "loss": 0.6409, + "step": 851 + }, + { + "epoch": 0.2262046993229789, + "grad_norm": 0.3626158025539011, + "learning_rate": 4.986226774632309e-06, + "loss": 0.6617, + "step": 852 + }, + { + "epoch": 0.22647019779636268, + "grad_norm": 0.3619975206918967, + "learning_rate": 4.986190153142321e-06, + "loss": 0.6815, + "step": 853 + }, + { + "epoch": 0.22673569626974646, + "grad_norm": 0.36106204110325646, + "learning_rate": 4.986153483165539e-06, + "loss": 0.7225, + "step": 854 + }, + { + "epoch": 0.2270011947431302, + "grad_norm": 0.35954909376182703, + "learning_rate": 4.9861167647026755e-06, + "loss": 0.6373, + "step": 855 + }, + { + "epoch": 0.227266693216514, + "grad_norm": 0.3654694196099264, + "learning_rate": 4.986079997754449e-06, + "loss": 0.6383, + "step": 856 + }, + { + "epoch": 0.22753219168989777, + "grad_norm": 0.36021768240263363, + "learning_rate": 4.986043182321575e-06, + "loss": 0.6642, + "step": 857 + }, + { + "epoch": 0.22779769016328155, + "grad_norm": 0.36383655510780455, + "learning_rate": 4.986006318404772e-06, + "loss": 0.6248, + "step": 858 + }, + { + "epoch": 0.22806318863666533, + "grad_norm": 0.36525947002795617, + "learning_rate": 4.985969406004759e-06, + "loss": 0.6456, + "step": 859 + }, + { + "epoch": 0.2283286871100491, + "grad_norm": 0.3847121537010074, + "learning_rate": 4.985932445122257e-06, + "loss": 0.6528, + "step": 860 + }, + { + "epoch": 0.2285941855834329, + "grad_norm": 0.37182492589245225, + "learning_rate": 4.985895435757985e-06, + "loss": 0.6892, + "step": 861 + }, + { + "epoch": 0.22885968405681667, + "grad_norm": 0.3712810265763539, + "learning_rate": 4.9858583779126645e-06, + "loss": 0.6591, + "step": 862 + }, + { + "epoch": 0.22912518253020045, + "grad_norm": 0.36232619618674444, + "learning_rate": 4.9858212715870205e-06, + "loss": 0.6492, + "step": 863 + }, + { + "epoch": 0.22939068100358423, + "grad_norm": 0.3725415819789804, + "learning_rate": 4.985784116781774e-06, + "loss": 0.6659, + "step": 864 + }, + { + "epoch": 0.229656179476968, + "grad_norm": 0.3413921760507849, + "learning_rate": 4.985746913497652e-06, + "loss": 0.6554, + "step": 865 + }, + { + "epoch": 0.2299216779503518, + "grad_norm": 0.3836353768850874, + "learning_rate": 4.985709661735379e-06, + "loss": 0.6287, + "step": 866 + }, + { + "epoch": 0.23018717642373557, + "grad_norm": 0.36842816359691, + "learning_rate": 4.9856723614956815e-06, + "loss": 0.6662, + "step": 867 + }, + { + "epoch": 0.23045267489711935, + "grad_norm": 0.4069655588649439, + "learning_rate": 4.985635012779288e-06, + "loss": 0.6567, + "step": 868 + }, + { + "epoch": 0.23071817337050313, + "grad_norm": 0.3586906177489775, + "learning_rate": 4.985597615586924e-06, + "loss": 0.6457, + "step": 869 + }, + { + "epoch": 0.2309836718438869, + "grad_norm": 0.3690248569199568, + "learning_rate": 4.985560169919321e-06, + "loss": 0.6836, + "step": 870 + }, + { + "epoch": 0.23124917031727069, + "grad_norm": 0.3618169009342208, + "learning_rate": 4.9855226757772105e-06, + "loss": 0.7102, + "step": 871 + }, + { + "epoch": 0.23151466879065447, + "grad_norm": 0.45226740809634364, + "learning_rate": 4.985485133161321e-06, + "loss": 0.6265, + "step": 872 + }, + { + "epoch": 0.23178016726403824, + "grad_norm": 0.38149227214737613, + "learning_rate": 4.985447542072386e-06, + "loss": 0.6584, + "step": 873 + }, + { + "epoch": 0.232045665737422, + "grad_norm": 0.42137441504054346, + "learning_rate": 4.985409902511138e-06, + "loss": 0.6359, + "step": 874 + }, + { + "epoch": 0.23231116421080578, + "grad_norm": 0.3629050435358434, + "learning_rate": 4.985372214478313e-06, + "loss": 0.6712, + "step": 875 + }, + { + "epoch": 0.23257666268418956, + "grad_norm": 0.3797761774400396, + "learning_rate": 4.985334477974643e-06, + "loss": 0.6328, + "step": 876 + }, + { + "epoch": 0.23284216115757334, + "grad_norm": 0.3683308015560075, + "learning_rate": 4.985296693000866e-06, + "loss": 0.6919, + "step": 877 + }, + { + "epoch": 0.23310765963095711, + "grad_norm": 0.37672600814985063, + "learning_rate": 4.985258859557719e-06, + "loss": 0.6541, + "step": 878 + }, + { + "epoch": 0.2333731581043409, + "grad_norm": 0.3635422343734699, + "learning_rate": 4.985220977645939e-06, + "loss": 0.6449, + "step": 879 + }, + { + "epoch": 0.23363865657772467, + "grad_norm": 0.340922674674232, + "learning_rate": 4.985183047266265e-06, + "loss": 0.6319, + "step": 880 + }, + { + "epoch": 0.23390415505110845, + "grad_norm": 0.3490076534717787, + "learning_rate": 4.985145068419437e-06, + "loss": 0.6418, + "step": 881 + }, + { + "epoch": 0.23416965352449223, + "grad_norm": 0.3774554872420336, + "learning_rate": 4.9851070411061954e-06, + "loss": 0.6166, + "step": 882 + }, + { + "epoch": 0.234435151997876, + "grad_norm": 0.3570349216253811, + "learning_rate": 4.985068965327281e-06, + "loss": 0.6441, + "step": 883 + }, + { + "epoch": 0.2347006504712598, + "grad_norm": 0.36709361727630363, + "learning_rate": 4.985030841083439e-06, + "loss": 0.6676, + "step": 884 + }, + { + "epoch": 0.23496614894464357, + "grad_norm": 0.36607196029775896, + "learning_rate": 4.98499266837541e-06, + "loss": 0.6355, + "step": 885 + }, + { + "epoch": 0.23523164741802735, + "grad_norm": 0.373504831992073, + "learning_rate": 4.9849544472039394e-06, + "loss": 0.6617, + "step": 886 + }, + { + "epoch": 0.23549714589141113, + "grad_norm": 0.3573539490018993, + "learning_rate": 4.984916177569774e-06, + "loss": 0.6118, + "step": 887 + }, + { + "epoch": 0.2357626443647949, + "grad_norm": 0.35686848155853435, + "learning_rate": 4.984877859473658e-06, + "loss": 0.6522, + "step": 888 + }, + { + "epoch": 0.2360281428381787, + "grad_norm": 0.35608679872998955, + "learning_rate": 4.98483949291634e-06, + "loss": 0.6628, + "step": 889 + }, + { + "epoch": 0.23629364131156247, + "grad_norm": 0.3637007806136096, + "learning_rate": 4.984801077898568e-06, + "loss": 0.6506, + "step": 890 + }, + { + "epoch": 0.23655913978494625, + "grad_norm": 0.3545252456352333, + "learning_rate": 4.984762614421092e-06, + "loss": 0.6983, + "step": 891 + }, + { + "epoch": 0.23682463825833003, + "grad_norm": 0.36288115661550346, + "learning_rate": 4.98472410248466e-06, + "loss": 0.6717, + "step": 892 + }, + { + "epoch": 0.23709013673171378, + "grad_norm": 0.3626678375509858, + "learning_rate": 4.9846855420900245e-06, + "loss": 0.6132, + "step": 893 + }, + { + "epoch": 0.23735563520509756, + "grad_norm": 0.3484005456555979, + "learning_rate": 4.984646933237938e-06, + "loss": 0.6565, + "step": 894 + }, + { + "epoch": 0.23762113367848134, + "grad_norm": 0.34605002196912016, + "learning_rate": 4.984608275929153e-06, + "loss": 0.6166, + "step": 895 + }, + { + "epoch": 0.23788663215186512, + "grad_norm": 0.34730696193486404, + "learning_rate": 4.984569570164423e-06, + "loss": 0.6599, + "step": 896 + }, + { + "epoch": 0.2381521306252489, + "grad_norm": 0.353993737368934, + "learning_rate": 4.984530815944502e-06, + "loss": 0.6451, + "step": 897 + }, + { + "epoch": 0.23841762909863268, + "grad_norm": 0.3515130354430692, + "learning_rate": 4.984492013270148e-06, + "loss": 0.6616, + "step": 898 + }, + { + "epoch": 0.23868312757201646, + "grad_norm": 0.36635953389940396, + "learning_rate": 4.9844531621421165e-06, + "loss": 0.6187, + "step": 899 + }, + { + "epoch": 0.23894862604540024, + "grad_norm": 0.34987838664873055, + "learning_rate": 4.984414262561166e-06, + "loss": 0.6582, + "step": 900 + }, + { + "epoch": 0.23921412451878402, + "grad_norm": 0.36813857307007014, + "learning_rate": 4.984375314528052e-06, + "loss": 0.6537, + "step": 901 + }, + { + "epoch": 0.2394796229921678, + "grad_norm": 0.36019520717900866, + "learning_rate": 4.984336318043539e-06, + "loss": 0.6505, + "step": 902 + }, + { + "epoch": 0.23974512146555157, + "grad_norm": 0.35914257245516107, + "learning_rate": 4.984297273108385e-06, + "loss": 0.6441, + "step": 903 + }, + { + "epoch": 0.24001061993893535, + "grad_norm": 0.36124637472865423, + "learning_rate": 4.984258179723351e-06, + "loss": 0.6503, + "step": 904 + }, + { + "epoch": 0.24027611841231913, + "grad_norm": 0.37851229317535584, + "learning_rate": 4.9842190378892e-06, + "loss": 0.6443, + "step": 905 + }, + { + "epoch": 0.2405416168857029, + "grad_norm": 0.3473299876317927, + "learning_rate": 4.984179847606695e-06, + "loss": 0.6199, + "step": 906 + }, + { + "epoch": 0.2408071153590867, + "grad_norm": 0.3730397722899552, + "learning_rate": 4.9841406088766005e-06, + "loss": 0.6886, + "step": 907 + }, + { + "epoch": 0.24107261383247047, + "grad_norm": 0.3672165214242251, + "learning_rate": 4.9841013216996825e-06, + "loss": 0.6966, + "step": 908 + }, + { + "epoch": 0.24133811230585425, + "grad_norm": 0.3816250965958184, + "learning_rate": 4.984061986076707e-06, + "loss": 0.6443, + "step": 909 + }, + { + "epoch": 0.24160361077923803, + "grad_norm": 0.3638965681825285, + "learning_rate": 4.98402260200844e-06, + "loss": 0.6707, + "step": 910 + }, + { + "epoch": 0.2418691092526218, + "grad_norm": 0.3865422142358914, + "learning_rate": 4.983983169495651e-06, + "loss": 0.6047, + "step": 911 + }, + { + "epoch": 0.24213460772600556, + "grad_norm": 0.348430739502864, + "learning_rate": 4.983943688539108e-06, + "loss": 0.6376, + "step": 912 + }, + { + "epoch": 0.24240010619938934, + "grad_norm": 0.3440800363082338, + "learning_rate": 4.983904159139581e-06, + "loss": 0.6056, + "step": 913 + }, + { + "epoch": 0.24266560467277312, + "grad_norm": 0.3602285592115701, + "learning_rate": 4.983864581297841e-06, + "loss": 0.6562, + "step": 914 + }, + { + "epoch": 0.2429311031461569, + "grad_norm": 0.36508533233465384, + "learning_rate": 4.983824955014661e-06, + "loss": 0.7068, + "step": 915 + }, + { + "epoch": 0.24319660161954068, + "grad_norm": 0.37488541240290535, + "learning_rate": 4.983785280290812e-06, + "loss": 0.6669, + "step": 916 + }, + { + "epoch": 0.24346210009292446, + "grad_norm": 0.36745750796129384, + "learning_rate": 4.9837455571270696e-06, + "loss": 0.6554, + "step": 917 + }, + { + "epoch": 0.24372759856630824, + "grad_norm": 0.347919831926423, + "learning_rate": 4.983705785524207e-06, + "loss": 0.6433, + "step": 918 + }, + { + "epoch": 0.24399309703969202, + "grad_norm": 0.3645222567372667, + "learning_rate": 4.983665965483001e-06, + "loss": 0.623, + "step": 919 + }, + { + "epoch": 0.2442585955130758, + "grad_norm": 0.36734959172373594, + "learning_rate": 4.983626097004227e-06, + "loss": 0.677, + "step": 920 + }, + { + "epoch": 0.24452409398645958, + "grad_norm": 0.3597583035018811, + "learning_rate": 4.9835861800886635e-06, + "loss": 0.6362, + "step": 921 + }, + { + "epoch": 0.24478959245984336, + "grad_norm": 0.36723122963508614, + "learning_rate": 4.983546214737088e-06, + "loss": 0.6605, + "step": 922 + }, + { + "epoch": 0.24505509093322714, + "grad_norm": 0.3575193735159897, + "learning_rate": 4.9835062009502815e-06, + "loss": 0.6969, + "step": 923 + }, + { + "epoch": 0.24532058940661092, + "grad_norm": 0.3639065644092349, + "learning_rate": 4.983466138729023e-06, + "loss": 0.655, + "step": 924 + }, + { + "epoch": 0.2455860878799947, + "grad_norm": 0.3523416143981236, + "learning_rate": 4.983426028074095e-06, + "loss": 0.643, + "step": 925 + }, + { + "epoch": 0.24585158635337848, + "grad_norm": 0.36972737936054123, + "learning_rate": 4.983385868986277e-06, + "loss": 0.6543, + "step": 926 + }, + { + "epoch": 0.24611708482676226, + "grad_norm": 0.3628422006374738, + "learning_rate": 4.983345661466356e-06, + "loss": 0.6751, + "step": 927 + }, + { + "epoch": 0.24638258330014604, + "grad_norm": 0.378487498998904, + "learning_rate": 4.983305405515113e-06, + "loss": 0.6578, + "step": 928 + }, + { + "epoch": 0.24664808177352981, + "grad_norm": 0.35773346585319693, + "learning_rate": 4.983265101133335e-06, + "loss": 0.6425, + "step": 929 + }, + { + "epoch": 0.24691358024691357, + "grad_norm": 0.3567053830313769, + "learning_rate": 4.983224748321808e-06, + "loss": 0.6457, + "step": 930 + }, + { + "epoch": 0.24717907872029735, + "grad_norm": 0.35787311418723283, + "learning_rate": 4.983184347081318e-06, + "loss": 0.5949, + "step": 931 + }, + { + "epoch": 0.24744457719368113, + "grad_norm": 0.34942920908399994, + "learning_rate": 4.9831438974126525e-06, + "loss": 0.6474, + "step": 932 + }, + { + "epoch": 0.2477100756670649, + "grad_norm": 0.3726541359601728, + "learning_rate": 4.983103399316602e-06, + "loss": 0.6706, + "step": 933 + }, + { + "epoch": 0.24797557414044868, + "grad_norm": 0.4419698443655577, + "learning_rate": 4.983062852793956e-06, + "loss": 0.6362, + "step": 934 + }, + { + "epoch": 0.24824107261383246, + "grad_norm": 0.3730662683997837, + "learning_rate": 4.983022257845504e-06, + "loss": 0.6489, + "step": 935 + }, + { + "epoch": 0.24850657108721624, + "grad_norm": 0.39203459093004006, + "learning_rate": 4.982981614472039e-06, + "loss": 0.6303, + "step": 936 + }, + { + "epoch": 0.24877206956060002, + "grad_norm": 0.3746997194288708, + "learning_rate": 4.982940922674353e-06, + "loss": 0.6475, + "step": 937 + }, + { + "epoch": 0.2490375680339838, + "grad_norm": 0.3774628011618854, + "learning_rate": 4.98290018245324e-06, + "loss": 0.6794, + "step": 938 + }, + { + "epoch": 0.24930306650736758, + "grad_norm": 0.3851350044540439, + "learning_rate": 4.982859393809493e-06, + "loss": 0.6605, + "step": 939 + }, + { + "epoch": 0.24956856498075136, + "grad_norm": 0.36840943795916076, + "learning_rate": 4.98281855674391e-06, + "loss": 0.6229, + "step": 940 + }, + { + "epoch": 0.24983406345413514, + "grad_norm": 0.3552009871115495, + "learning_rate": 4.982777671257285e-06, + "loss": 0.5982, + "step": 941 + }, + { + "epoch": 0.2500995619275189, + "grad_norm": 0.3707382212266125, + "learning_rate": 4.982736737350418e-06, + "loss": 0.6646, + "step": 942 + }, + { + "epoch": 0.2503650604009027, + "grad_norm": 0.3543486718509332, + "learning_rate": 4.982695755024105e-06, + "loss": 0.6414, + "step": 943 + }, + { + "epoch": 0.2506305588742865, + "grad_norm": 0.38513378319201824, + "learning_rate": 4.982654724279146e-06, + "loss": 0.6485, + "step": 944 + }, + { + "epoch": 0.25089605734767023, + "grad_norm": 0.3636189188787609, + "learning_rate": 4.982613645116342e-06, + "loss": 0.6496, + "step": 945 + }, + { + "epoch": 0.25116155582105404, + "grad_norm": 0.36571575378837284, + "learning_rate": 4.9825725175364926e-06, + "loss": 0.6856, + "step": 946 + }, + { + "epoch": 0.2514270542944378, + "grad_norm": 0.3807450704812266, + "learning_rate": 4.9825313415404005e-06, + "loss": 0.6756, + "step": 947 + }, + { + "epoch": 0.2516925527678216, + "grad_norm": 0.36818288389234677, + "learning_rate": 4.98249011712887e-06, + "loss": 0.6881, + "step": 948 + }, + { + "epoch": 0.25195805124120535, + "grad_norm": 0.3549349496432932, + "learning_rate": 4.982448844302704e-06, + "loss": 0.6183, + "step": 949 + }, + { + "epoch": 0.25222354971458916, + "grad_norm": 0.36306646432366935, + "learning_rate": 4.982407523062707e-06, + "loss": 0.6808, + "step": 950 + }, + { + "epoch": 0.2524890481879729, + "grad_norm": 0.3394703612759071, + "learning_rate": 4.982366153409685e-06, + "loss": 0.6189, + "step": 951 + }, + { + "epoch": 0.2527545466613567, + "grad_norm": 0.3465033805123829, + "learning_rate": 4.982324735344446e-06, + "loss": 0.6086, + "step": 952 + }, + { + "epoch": 0.25302004513474047, + "grad_norm": 0.3614620012877914, + "learning_rate": 4.982283268867796e-06, + "loss": 0.6588, + "step": 953 + }, + { + "epoch": 0.2532855436081243, + "grad_norm": 0.3609814306439208, + "learning_rate": 4.9822417539805445e-06, + "loss": 0.6627, + "step": 954 + }, + { + "epoch": 0.253551042081508, + "grad_norm": 0.3664856599554348, + "learning_rate": 4.982200190683502e-06, + "loss": 0.6118, + "step": 955 + }, + { + "epoch": 0.25381654055489183, + "grad_norm": 0.3551510979475306, + "learning_rate": 4.9821585789774775e-06, + "loss": 0.6709, + "step": 956 + }, + { + "epoch": 0.2540820390282756, + "grad_norm": 0.3559223188494908, + "learning_rate": 4.982116918863285e-06, + "loss": 0.6481, + "step": 957 + }, + { + "epoch": 0.2543475375016594, + "grad_norm": 0.37500353329046965, + "learning_rate": 4.9820752103417335e-06, + "loss": 0.6362, + "step": 958 + }, + { + "epoch": 0.25461303597504314, + "grad_norm": 0.3468948169563887, + "learning_rate": 4.982033453413638e-06, + "loss": 0.6088, + "step": 959 + }, + { + "epoch": 0.2548785344484269, + "grad_norm": 0.3539323668269743, + "learning_rate": 4.981991648079814e-06, + "loss": 0.6213, + "step": 960 + }, + { + "epoch": 0.2551440329218107, + "grad_norm": 0.391904199579984, + "learning_rate": 4.981949794341077e-06, + "loss": 0.6606, + "step": 961 + }, + { + "epoch": 0.25540953139519446, + "grad_norm": 0.37079725181686624, + "learning_rate": 4.98190789219824e-06, + "loss": 0.6235, + "step": 962 + }, + { + "epoch": 0.25567502986857826, + "grad_norm": 0.3498087643698131, + "learning_rate": 4.981865941652124e-06, + "loss": 0.6164, + "step": 963 + }, + { + "epoch": 0.255940528341962, + "grad_norm": 0.37674344852061253, + "learning_rate": 4.981823942703544e-06, + "loss": 0.6377, + "step": 964 + }, + { + "epoch": 0.2562060268153458, + "grad_norm": 0.3659909389340991, + "learning_rate": 4.9817818953533215e-06, + "loss": 0.6428, + "step": 965 + }, + { + "epoch": 0.2564715252887296, + "grad_norm": 0.3686889799009494, + "learning_rate": 4.981739799602275e-06, + "loss": 0.6735, + "step": 966 + }, + { + "epoch": 0.2567370237621134, + "grad_norm": 0.3754154691933635, + "learning_rate": 4.981697655451228e-06, + "loss": 0.658, + "step": 967 + }, + { + "epoch": 0.25700252223549713, + "grad_norm": 0.3494085335346896, + "learning_rate": 4.981655462900998e-06, + "loss": 0.6506, + "step": 968 + }, + { + "epoch": 0.25726802070888094, + "grad_norm": 0.3617112473409996, + "learning_rate": 4.981613221952412e-06, + "loss": 0.6292, + "step": 969 + }, + { + "epoch": 0.2575335191822647, + "grad_norm": 0.35222431431476064, + "learning_rate": 4.981570932606291e-06, + "loss": 0.6287, + "step": 970 + }, + { + "epoch": 0.2577990176556485, + "grad_norm": 0.358091030551002, + "learning_rate": 4.981528594863461e-06, + "loss": 0.6463, + "step": 971 + }, + { + "epoch": 0.25806451612903225, + "grad_norm": 0.35783571511857615, + "learning_rate": 4.981486208724748e-06, + "loss": 0.6451, + "step": 972 + }, + { + "epoch": 0.25833001460241606, + "grad_norm": 0.3376080650157559, + "learning_rate": 4.981443774190978e-06, + "loss": 0.6119, + "step": 973 + }, + { + "epoch": 0.2585955130757998, + "grad_norm": 0.37354453677979915, + "learning_rate": 4.981401291262979e-06, + "loss": 0.6466, + "step": 974 + }, + { + "epoch": 0.2588610115491836, + "grad_norm": 0.3585901221168928, + "learning_rate": 4.981358759941579e-06, + "loss": 0.625, + "step": 975 + }, + { + "epoch": 0.25912651002256737, + "grad_norm": 0.3606103589636798, + "learning_rate": 4.981316180227609e-06, + "loss": 0.6434, + "step": 976 + }, + { + "epoch": 0.2593920084959511, + "grad_norm": 0.3568169059932706, + "learning_rate": 4.981273552121896e-06, + "loss": 0.6422, + "step": 977 + }, + { + "epoch": 0.25965750696933493, + "grad_norm": 0.3612424596771187, + "learning_rate": 4.981230875625275e-06, + "loss": 0.6385, + "step": 978 + }, + { + "epoch": 0.2599230054427187, + "grad_norm": 0.3654440661113346, + "learning_rate": 4.9811881507385766e-06, + "loss": 0.6624, + "step": 979 + }, + { + "epoch": 0.2601885039161025, + "grad_norm": 0.3744325849965256, + "learning_rate": 4.9811453774626335e-06, + "loss": 0.6301, + "step": 980 + }, + { + "epoch": 0.26045400238948624, + "grad_norm": 0.36838728225673273, + "learning_rate": 4.981102555798282e-06, + "loss": 0.6492, + "step": 981 + }, + { + "epoch": 0.26071950086287005, + "grad_norm": 0.4095904456767099, + "learning_rate": 4.981059685746355e-06, + "loss": 0.6332, + "step": 982 + }, + { + "epoch": 0.2609849993362538, + "grad_norm": 0.3657123204519917, + "learning_rate": 4.9810167673076896e-06, + "loss": 0.6447, + "step": 983 + }, + { + "epoch": 0.2612504978096376, + "grad_norm": 0.37768044353818037, + "learning_rate": 4.980973800483123e-06, + "loss": 0.649, + "step": 984 + }, + { + "epoch": 0.26151599628302136, + "grad_norm": 0.4065192180685813, + "learning_rate": 4.980930785273492e-06, + "loss": 0.6574, + "step": 985 + }, + { + "epoch": 0.26178149475640516, + "grad_norm": 0.3681415226963581, + "learning_rate": 4.980887721679637e-06, + "loss": 0.6555, + "step": 986 + }, + { + "epoch": 0.2620469932297889, + "grad_norm": 0.36216962817587833, + "learning_rate": 4.980844609702397e-06, + "loss": 0.6256, + "step": 987 + }, + { + "epoch": 0.2623124917031727, + "grad_norm": 0.3752251781540997, + "learning_rate": 4.9808014493426124e-06, + "loss": 0.6248, + "step": 988 + }, + { + "epoch": 0.2625779901765565, + "grad_norm": 0.36627485508812824, + "learning_rate": 4.9807582406011265e-06, + "loss": 0.6692, + "step": 989 + }, + { + "epoch": 0.2628434886499403, + "grad_norm": 0.3638792629169123, + "learning_rate": 4.9807149834787806e-06, + "loss": 0.6736, + "step": 990 + }, + { + "epoch": 0.26310898712332403, + "grad_norm": 0.37848625346615206, + "learning_rate": 4.980671677976419e-06, + "loss": 0.6286, + "step": 991 + }, + { + "epoch": 0.26337448559670784, + "grad_norm": 0.3748855594569195, + "learning_rate": 4.980628324094885e-06, + "loss": 0.6538, + "step": 992 + }, + { + "epoch": 0.2636399840700916, + "grad_norm": 0.35586075229113756, + "learning_rate": 4.9805849218350255e-06, + "loss": 0.6697, + "step": 993 + }, + { + "epoch": 0.2639054825434754, + "grad_norm": 0.39699081477204445, + "learning_rate": 4.980541471197687e-06, + "loss": 0.6472, + "step": 994 + }, + { + "epoch": 0.26417098101685915, + "grad_norm": 0.366446635023391, + "learning_rate": 4.980497972183716e-06, + "loss": 0.6594, + "step": 995 + }, + { + "epoch": 0.2644364794902429, + "grad_norm": 0.3796150359783703, + "learning_rate": 4.980454424793961e-06, + "loss": 0.6424, + "step": 996 + }, + { + "epoch": 0.2647019779636267, + "grad_norm": 0.3723896820854948, + "learning_rate": 4.980410829029272e-06, + "loss": 0.628, + "step": 997 + }, + { + "epoch": 0.26496747643701046, + "grad_norm": 0.35886436852751274, + "learning_rate": 4.980367184890499e-06, + "loss": 0.6445, + "step": 998 + }, + { + "epoch": 0.26523297491039427, + "grad_norm": 0.35835065043728165, + "learning_rate": 4.980323492378491e-06, + "loss": 0.6441, + "step": 999 + }, + { + "epoch": 0.265498473383778, + "grad_norm": 0.3591313042147608, + "learning_rate": 4.980279751494104e-06, + "loss": 0.6866, + "step": 1000 + }, + { + "epoch": 0.26576397185716183, + "grad_norm": 0.3656836899949334, + "learning_rate": 4.980235962238188e-06, + "loss": 0.6926, + "step": 1001 + }, + { + "epoch": 0.2660294703305456, + "grad_norm": 0.3607855738162133, + "learning_rate": 4.980192124611599e-06, + "loss": 0.6272, + "step": 1002 + }, + { + "epoch": 0.2662949688039294, + "grad_norm": 0.36855282842246273, + "learning_rate": 4.9801482386151905e-06, + "loss": 0.6606, + "step": 1003 + }, + { + "epoch": 0.26656046727731314, + "grad_norm": 0.3699974943566117, + "learning_rate": 4.980104304249819e-06, + "loss": 0.6967, + "step": 1004 + }, + { + "epoch": 0.26682596575069695, + "grad_norm": 0.35427505302748685, + "learning_rate": 4.9800603215163415e-06, + "loss": 0.613, + "step": 1005 + }, + { + "epoch": 0.2670914642240807, + "grad_norm": 0.3620278170370366, + "learning_rate": 4.9800162904156155e-06, + "loss": 0.6374, + "step": 1006 + }, + { + "epoch": 0.2673569626974645, + "grad_norm": 0.3569586726428625, + "learning_rate": 4.979972210948499e-06, + "loss": 0.6423, + "step": 1007 + }, + { + "epoch": 0.26762246117084826, + "grad_norm": 0.35097849073088916, + "learning_rate": 4.979928083115853e-06, + "loss": 0.6223, + "step": 1008 + }, + { + "epoch": 0.26788795964423207, + "grad_norm": 0.3558841188227152, + "learning_rate": 4.979883906918536e-06, + "loss": 0.6142, + "step": 1009 + }, + { + "epoch": 0.2681534581176158, + "grad_norm": 0.36359631099266737, + "learning_rate": 4.979839682357414e-06, + "loss": 0.6596, + "step": 1010 + }, + { + "epoch": 0.2684189565909996, + "grad_norm": 0.3636149284226135, + "learning_rate": 4.979795409433344e-06, + "loss": 0.6702, + "step": 1011 + }, + { + "epoch": 0.2686844550643834, + "grad_norm": 0.3722364173082872, + "learning_rate": 4.979751088147192e-06, + "loss": 0.6375, + "step": 1012 + }, + { + "epoch": 0.2689499535377672, + "grad_norm": 0.38001659256946785, + "learning_rate": 4.979706718499824e-06, + "loss": 0.6302, + "step": 1013 + }, + { + "epoch": 0.26921545201115094, + "grad_norm": 0.37155343314022443, + "learning_rate": 4.9796623004921016e-06, + "loss": 0.6795, + "step": 1014 + }, + { + "epoch": 0.2694809504845347, + "grad_norm": 0.36957715696091586, + "learning_rate": 4.979617834124895e-06, + "loss": 0.6315, + "step": 1015 + }, + { + "epoch": 0.2697464489579185, + "grad_norm": 0.36786882803706994, + "learning_rate": 4.979573319399068e-06, + "loss": 0.6483, + "step": 1016 + }, + { + "epoch": 0.27001194743130225, + "grad_norm": 0.36288275679902826, + "learning_rate": 4.979528756315491e-06, + "loss": 0.6397, + "step": 1017 + }, + { + "epoch": 0.27027744590468605, + "grad_norm": 0.3549232309789082, + "learning_rate": 4.979484144875032e-06, + "loss": 0.6611, + "step": 1018 + }, + { + "epoch": 0.2705429443780698, + "grad_norm": 0.3576446391815027, + "learning_rate": 4.9794394850785616e-06, + "loss": 0.6114, + "step": 1019 + }, + { + "epoch": 0.2708084428514536, + "grad_norm": 0.36090274675872946, + "learning_rate": 4.97939477692695e-06, + "loss": 0.6024, + "step": 1020 + }, + { + "epoch": 0.27107394132483736, + "grad_norm": 0.3572624736441068, + "learning_rate": 4.979350020421071e-06, + "loss": 0.6561, + "step": 1021 + }, + { + "epoch": 0.27133943979822117, + "grad_norm": 0.3715741116524624, + "learning_rate": 4.979305215561795e-06, + "loss": 0.661, + "step": 1022 + }, + { + "epoch": 0.2716049382716049, + "grad_norm": 0.36784880588918306, + "learning_rate": 4.9792603623499965e-06, + "loss": 0.6703, + "step": 1023 + }, + { + "epoch": 0.27187043674498873, + "grad_norm": 0.40290401087707023, + "learning_rate": 4.979215460786552e-06, + "loss": 0.6533, + "step": 1024 + }, + { + "epoch": 0.2721359352183725, + "grad_norm": 0.35855405719324956, + "learning_rate": 4.979170510872334e-06, + "loss": 0.6329, + "step": 1025 + }, + { + "epoch": 0.2724014336917563, + "grad_norm": 0.3647908558652933, + "learning_rate": 4.979125512608223e-06, + "loss": 0.6547, + "step": 1026 + }, + { + "epoch": 0.27266693216514004, + "grad_norm": 0.36947883787403, + "learning_rate": 4.979080465995093e-06, + "loss": 0.6467, + "step": 1027 + }, + { + "epoch": 0.27293243063852385, + "grad_norm": 0.3846719010338243, + "learning_rate": 4.979035371033824e-06, + "loss": 0.6712, + "step": 1028 + }, + { + "epoch": 0.2731979291119076, + "grad_norm": 0.37234047371660695, + "learning_rate": 4.978990227725297e-06, + "loss": 0.6811, + "step": 1029 + }, + { + "epoch": 0.2734634275852914, + "grad_norm": 0.3682858925006891, + "learning_rate": 4.97894503607039e-06, + "loss": 0.6696, + "step": 1030 + }, + { + "epoch": 0.27372892605867516, + "grad_norm": 0.3553102914397915, + "learning_rate": 4.978899796069986e-06, + "loss": 0.6348, + "step": 1031 + }, + { + "epoch": 0.27399442453205897, + "grad_norm": 0.38445335797009916, + "learning_rate": 4.9788545077249655e-06, + "loss": 0.6416, + "step": 1032 + }, + { + "epoch": 0.2742599230054427, + "grad_norm": 0.3540641688007243, + "learning_rate": 4.978809171036213e-06, + "loss": 0.6436, + "step": 1033 + }, + { + "epoch": 0.27452542147882647, + "grad_norm": 0.35701320811905085, + "learning_rate": 4.978763786004613e-06, + "loss": 0.6522, + "step": 1034 + }, + { + "epoch": 0.2747909199522103, + "grad_norm": 0.3816791247486726, + "learning_rate": 4.97871835263105e-06, + "loss": 0.6384, + "step": 1035 + }, + { + "epoch": 0.27505641842559403, + "grad_norm": 0.3729395285778984, + "learning_rate": 4.97867287091641e-06, + "loss": 0.6667, + "step": 1036 + }, + { + "epoch": 0.27532191689897784, + "grad_norm": 0.3533855518978844, + "learning_rate": 4.9786273408615805e-06, + "loss": 0.6406, + "step": 1037 + }, + { + "epoch": 0.2755874153723616, + "grad_norm": 0.3622612832585471, + "learning_rate": 4.978581762467449e-06, + "loss": 0.6295, + "step": 1038 + }, + { + "epoch": 0.2758529138457454, + "grad_norm": 0.37672182638995366, + "learning_rate": 4.978536135734905e-06, + "loss": 0.6302, + "step": 1039 + }, + { + "epoch": 0.27611841231912915, + "grad_norm": 0.3657574771076464, + "learning_rate": 4.9784904606648366e-06, + "loss": 0.6105, + "step": 1040 + }, + { + "epoch": 0.27638391079251295, + "grad_norm": 0.42636879444707215, + "learning_rate": 4.978444737258137e-06, + "loss": 0.6249, + "step": 1041 + }, + { + "epoch": 0.2766494092658967, + "grad_norm": 0.3674160896616571, + "learning_rate": 4.9783989655156965e-06, + "loss": 0.6643, + "step": 1042 + }, + { + "epoch": 0.2769149077392805, + "grad_norm": 0.3682225329604301, + "learning_rate": 4.978353145438407e-06, + "loss": 0.6556, + "step": 1043 + }, + { + "epoch": 0.27718040621266427, + "grad_norm": 0.39590198616690214, + "learning_rate": 4.978307277027165e-06, + "loss": 0.6404, + "step": 1044 + }, + { + "epoch": 0.2774459046860481, + "grad_norm": 0.4081112800566932, + "learning_rate": 4.9782613602828614e-06, + "loss": 0.6114, + "step": 1045 + }, + { + "epoch": 0.2777114031594318, + "grad_norm": 0.3838676655816886, + "learning_rate": 4.978215395206394e-06, + "loss": 0.6332, + "step": 1046 + }, + { + "epoch": 0.27797690163281563, + "grad_norm": 0.3815210844192773, + "learning_rate": 4.978169381798659e-06, + "loss": 0.6487, + "step": 1047 + }, + { + "epoch": 0.2782424001061994, + "grad_norm": 0.3605995293936653, + "learning_rate": 4.978123320060552e-06, + "loss": 0.6422, + "step": 1048 + }, + { + "epoch": 0.2785078985795832, + "grad_norm": 0.35616745318355486, + "learning_rate": 4.9780772099929755e-06, + "loss": 0.5991, + "step": 1049 + }, + { + "epoch": 0.27877339705296694, + "grad_norm": 0.3736046260647422, + "learning_rate": 4.978031051596824e-06, + "loss": 0.6171, + "step": 1050 + }, + { + "epoch": 0.27903889552635075, + "grad_norm": 0.3576515022274845, + "learning_rate": 4.977984844873e-06, + "loss": 0.6189, + "step": 1051 + }, + { + "epoch": 0.2793043939997345, + "grad_norm": 0.37415264077762256, + "learning_rate": 4.977938589822405e-06, + "loss": 0.6481, + "step": 1052 + }, + { + "epoch": 0.27956989247311825, + "grad_norm": 0.38740232408265945, + "learning_rate": 4.97789228644594e-06, + "loss": 0.6465, + "step": 1053 + }, + { + "epoch": 0.27983539094650206, + "grad_norm": 0.36488961523079405, + "learning_rate": 4.977845934744509e-06, + "loss": 0.6551, + "step": 1054 + }, + { + "epoch": 0.2801008894198858, + "grad_norm": 0.36671510752598824, + "learning_rate": 4.977799534719014e-06, + "loss": 0.6578, + "step": 1055 + }, + { + "epoch": 0.2803663878932696, + "grad_norm": 0.36863121575333246, + "learning_rate": 4.977753086370363e-06, + "loss": 0.6543, + "step": 1056 + }, + { + "epoch": 0.28063188636665337, + "grad_norm": 0.3673835879709623, + "learning_rate": 4.977706589699459e-06, + "loss": 0.647, + "step": 1057 + }, + { + "epoch": 0.2808973848400372, + "grad_norm": 0.3805719174915159, + "learning_rate": 4.977660044707211e-06, + "loss": 0.6461, + "step": 1058 + }, + { + "epoch": 0.28116288331342093, + "grad_norm": 0.3738617837889614, + "learning_rate": 4.977613451394525e-06, + "loss": 0.6605, + "step": 1059 + }, + { + "epoch": 0.28142838178680474, + "grad_norm": 0.36895272556611924, + "learning_rate": 4.977566809762312e-06, + "loss": 0.6855, + "step": 1060 + }, + { + "epoch": 0.2816938802601885, + "grad_norm": 0.3663668110974325, + "learning_rate": 4.977520119811479e-06, + "loss": 0.6584, + "step": 1061 + }, + { + "epoch": 0.2819593787335723, + "grad_norm": 0.37707507476680496, + "learning_rate": 4.977473381542936e-06, + "loss": 0.6175, + "step": 1062 + }, + { + "epoch": 0.28222487720695605, + "grad_norm": 0.3734079207074507, + "learning_rate": 4.977426594957598e-06, + "loss": 0.6599, + "step": 1063 + }, + { + "epoch": 0.28249037568033986, + "grad_norm": 0.3861721946751422, + "learning_rate": 4.977379760056376e-06, + "loss": 0.6004, + "step": 1064 + }, + { + "epoch": 0.2827558741537236, + "grad_norm": 0.3547710138333417, + "learning_rate": 4.977332876840182e-06, + "loss": 0.6164, + "step": 1065 + }, + { + "epoch": 0.2830213726271074, + "grad_norm": 0.3849148616938619, + "learning_rate": 4.977285945309932e-06, + "loss": 0.6784, + "step": 1066 + }, + { + "epoch": 0.28328687110049117, + "grad_norm": 0.3799132968612521, + "learning_rate": 4.97723896546654e-06, + "loss": 0.5987, + "step": 1067 + }, + { + "epoch": 0.283552369573875, + "grad_norm": 0.36704655517993257, + "learning_rate": 4.977191937310923e-06, + "loss": 0.6373, + "step": 1068 + }, + { + "epoch": 0.2838178680472587, + "grad_norm": 0.36616281253825506, + "learning_rate": 4.977144860843999e-06, + "loss": 0.6402, + "step": 1069 + }, + { + "epoch": 0.28408336652064253, + "grad_norm": 0.43383765197262214, + "learning_rate": 4.977097736066684e-06, + "loss": 0.6709, + "step": 1070 + }, + { + "epoch": 0.2843488649940263, + "grad_norm": 0.3747998695242347, + "learning_rate": 4.977050562979899e-06, + "loss": 0.6496, + "step": 1071 + }, + { + "epoch": 0.28461436346741004, + "grad_norm": 0.43774417732764065, + "learning_rate": 4.977003341584562e-06, + "loss": 0.5989, + "step": 1072 + }, + { + "epoch": 0.28487986194079384, + "grad_norm": 0.3694306445203735, + "learning_rate": 4.976956071881596e-06, + "loss": 0.6353, + "step": 1073 + }, + { + "epoch": 0.2851453604141776, + "grad_norm": 0.3748486580016225, + "learning_rate": 4.976908753871922e-06, + "loss": 0.6374, + "step": 1074 + }, + { + "epoch": 0.2854108588875614, + "grad_norm": 0.40236012775431534, + "learning_rate": 4.976861387556462e-06, + "loss": 0.6497, + "step": 1075 + }, + { + "epoch": 0.28567635736094515, + "grad_norm": 0.3619853947522983, + "learning_rate": 4.976813972936141e-06, + "loss": 0.669, + "step": 1076 + }, + { + "epoch": 0.28594185583432896, + "grad_norm": 0.3849930432274775, + "learning_rate": 4.976766510011883e-06, + "loss": 0.5857, + "step": 1077 + }, + { + "epoch": 0.2862073543077127, + "grad_norm": 0.40420789974038696, + "learning_rate": 4.976718998784614e-06, + "loss": 0.6152, + "step": 1078 + }, + { + "epoch": 0.2864728527810965, + "grad_norm": 0.4047890114990248, + "learning_rate": 4.976671439255261e-06, + "loss": 0.6075, + "step": 1079 + }, + { + "epoch": 0.2867383512544803, + "grad_norm": 0.36417790362112606, + "learning_rate": 4.97662383142475e-06, + "loss": 0.6725, + "step": 1080 + }, + { + "epoch": 0.2870038497278641, + "grad_norm": 0.37279288917560943, + "learning_rate": 4.976576175294012e-06, + "loss": 0.6688, + "step": 1081 + }, + { + "epoch": 0.28726934820124783, + "grad_norm": 0.3769711148414339, + "learning_rate": 4.9765284708639735e-06, + "loss": 0.651, + "step": 1082 + }, + { + "epoch": 0.28753484667463164, + "grad_norm": 0.36201838478819254, + "learning_rate": 4.976480718135567e-06, + "loss": 0.6295, + "step": 1083 + }, + { + "epoch": 0.2878003451480154, + "grad_norm": 0.36618405010112043, + "learning_rate": 4.976432917109722e-06, + "loss": 0.6507, + "step": 1084 + }, + { + "epoch": 0.2880658436213992, + "grad_norm": 0.36136083902163096, + "learning_rate": 4.9763850677873735e-06, + "loss": 0.6319, + "step": 1085 + }, + { + "epoch": 0.28833134209478295, + "grad_norm": 0.38406693990346413, + "learning_rate": 4.9763371701694515e-06, + "loss": 0.6262, + "step": 1086 + }, + { + "epoch": 0.28859684056816676, + "grad_norm": 0.3888302360216497, + "learning_rate": 4.976289224256892e-06, + "loss": 0.6442, + "step": 1087 + }, + { + "epoch": 0.2888623390415505, + "grad_norm": 0.3611850453000929, + "learning_rate": 4.97624123005063e-06, + "loss": 0.6431, + "step": 1088 + }, + { + "epoch": 0.2891278375149343, + "grad_norm": 0.3589076257694757, + "learning_rate": 4.9761931875516e-06, + "loss": 0.6237, + "step": 1089 + }, + { + "epoch": 0.28939333598831807, + "grad_norm": 0.3581585044999295, + "learning_rate": 4.976145096760741e-06, + "loss": 0.626, + "step": 1090 + }, + { + "epoch": 0.2896588344617018, + "grad_norm": 0.3666506277965257, + "learning_rate": 4.9760969576789905e-06, + "loss": 0.6325, + "step": 1091 + }, + { + "epoch": 0.2899243329350856, + "grad_norm": 0.3646360957648641, + "learning_rate": 4.976048770307286e-06, + "loss": 0.6253, + "step": 1092 + }, + { + "epoch": 0.2901898314084694, + "grad_norm": 0.3557008343438929, + "learning_rate": 4.976000534646568e-06, + "loss": 0.6068, + "step": 1093 + }, + { + "epoch": 0.2904553298818532, + "grad_norm": 0.38629588484800864, + "learning_rate": 4.975952250697778e-06, + "loss": 0.6755, + "step": 1094 + }, + { + "epoch": 0.29072082835523694, + "grad_norm": 0.36941376555216937, + "learning_rate": 4.975903918461856e-06, + "loss": 0.6559, + "step": 1095 + }, + { + "epoch": 0.29098632682862074, + "grad_norm": 0.36868040224675036, + "learning_rate": 4.975855537939746e-06, + "loss": 0.6524, + "step": 1096 + }, + { + "epoch": 0.2912518253020045, + "grad_norm": 0.3779000553077218, + "learning_rate": 4.9758071091323905e-06, + "loss": 0.607, + "step": 1097 + }, + { + "epoch": 0.2915173237753883, + "grad_norm": 0.41871886278418036, + "learning_rate": 4.975758632040734e-06, + "loss": 0.6527, + "step": 1098 + }, + { + "epoch": 0.29178282224877206, + "grad_norm": 0.37608653378257667, + "learning_rate": 4.975710106665724e-06, + "loss": 0.6361, + "step": 1099 + }, + { + "epoch": 0.29204832072215586, + "grad_norm": 0.39800491607222016, + "learning_rate": 4.975661533008305e-06, + "loss": 0.6666, + "step": 1100 + }, + { + "epoch": 0.2923138191955396, + "grad_norm": 0.3747362995726176, + "learning_rate": 4.975612911069425e-06, + "loss": 0.6554, + "step": 1101 + }, + { + "epoch": 0.2925793176689234, + "grad_norm": 0.3628819752775224, + "learning_rate": 4.975564240850032e-06, + "loss": 0.6571, + "step": 1102 + }, + { + "epoch": 0.2928448161423072, + "grad_norm": 0.3633263237254101, + "learning_rate": 4.975515522351073e-06, + "loss": 0.6166, + "step": 1103 + }, + { + "epoch": 0.293110314615691, + "grad_norm": 0.3714301141730258, + "learning_rate": 4.975466755573503e-06, + "loss": 0.6155, + "step": 1104 + }, + { + "epoch": 0.29337581308907473, + "grad_norm": 0.37697531633451764, + "learning_rate": 4.975417940518268e-06, + "loss": 0.6652, + "step": 1105 + }, + { + "epoch": 0.29364131156245854, + "grad_norm": 0.3665382088093708, + "learning_rate": 4.975369077186324e-06, + "loss": 0.6735, + "step": 1106 + }, + { + "epoch": 0.2939068100358423, + "grad_norm": 0.3881447317983678, + "learning_rate": 4.975320165578621e-06, + "loss": 0.6678, + "step": 1107 + }, + { + "epoch": 0.2941723085092261, + "grad_norm": 0.3681517437141483, + "learning_rate": 4.975271205696115e-06, + "loss": 0.6268, + "step": 1108 + }, + { + "epoch": 0.29443780698260985, + "grad_norm": 0.3717262403608333, + "learning_rate": 4.975222197539758e-06, + "loss": 0.6017, + "step": 1109 + }, + { + "epoch": 0.2947033054559936, + "grad_norm": 0.36193710046561917, + "learning_rate": 4.97517314111051e-06, + "loss": 0.6389, + "step": 1110 + }, + { + "epoch": 0.2949688039293774, + "grad_norm": 0.3711770934901428, + "learning_rate": 4.975124036409325e-06, + "loss": 0.6873, + "step": 1111 + }, + { + "epoch": 0.29523430240276116, + "grad_norm": 0.38119641481426275, + "learning_rate": 4.975074883437161e-06, + "loss": 0.6423, + "step": 1112 + }, + { + "epoch": 0.29549980087614497, + "grad_norm": 0.3710720262270734, + "learning_rate": 4.975025682194976e-06, + "loss": 0.6482, + "step": 1113 + }, + { + "epoch": 0.2957652993495287, + "grad_norm": 0.37143480055752387, + "learning_rate": 4.974976432683732e-06, + "loss": 0.667, + "step": 1114 + }, + { + "epoch": 0.29603079782291253, + "grad_norm": 0.3795667856621345, + "learning_rate": 4.974927134904386e-06, + "loss": 0.6367, + "step": 1115 + }, + { + "epoch": 0.2962962962962963, + "grad_norm": 0.36287774793276917, + "learning_rate": 4.974877788857903e-06, + "loss": 0.6384, + "step": 1116 + }, + { + "epoch": 0.2965617947696801, + "grad_norm": 0.35103145336870095, + "learning_rate": 4.9748283945452415e-06, + "loss": 0.5954, + "step": 1117 + }, + { + "epoch": 0.29682729324306384, + "grad_norm": 0.36495536794758493, + "learning_rate": 4.974778951967368e-06, + "loss": 0.6254, + "step": 1118 + }, + { + "epoch": 0.29709279171644765, + "grad_norm": 0.4147833945623723, + "learning_rate": 4.974729461125246e-06, + "loss": 0.6221, + "step": 1119 + }, + { + "epoch": 0.2973582901898314, + "grad_norm": 0.3687917710658798, + "learning_rate": 4.974679922019839e-06, + "loss": 0.667, + "step": 1120 + }, + { + "epoch": 0.2976237886632152, + "grad_norm": 0.368437127448733, + "learning_rate": 4.974630334652116e-06, + "loss": 0.6405, + "step": 1121 + }, + { + "epoch": 0.29788928713659896, + "grad_norm": 0.3765791219071444, + "learning_rate": 4.974580699023042e-06, + "loss": 0.6424, + "step": 1122 + }, + { + "epoch": 0.29815478560998276, + "grad_norm": 0.39218884689709, + "learning_rate": 4.9745310151335855e-06, + "loss": 0.6235, + "step": 1123 + }, + { + "epoch": 0.2984202840833665, + "grad_norm": 0.38378639476074716, + "learning_rate": 4.974481282984716e-06, + "loss": 0.6474, + "step": 1124 + }, + { + "epoch": 0.2986857825567503, + "grad_norm": 0.42629351755264794, + "learning_rate": 4.9744315025774015e-06, + "loss": 0.6358, + "step": 1125 + }, + { + "epoch": 0.2989512810301341, + "grad_norm": 0.3764995616646047, + "learning_rate": 4.9743816739126146e-06, + "loss": 0.6435, + "step": 1126 + }, + { + "epoch": 0.2992167795035179, + "grad_norm": 0.3699827961388043, + "learning_rate": 4.974331796991328e-06, + "loss": 0.6279, + "step": 1127 + }, + { + "epoch": 0.29948227797690163, + "grad_norm": 0.3829003820104978, + "learning_rate": 4.974281871814513e-06, + "loss": 0.6758, + "step": 1128 + }, + { + "epoch": 0.2997477764502854, + "grad_norm": 0.37089225016504546, + "learning_rate": 4.974231898383142e-06, + "loss": 0.6474, + "step": 1129 + }, + { + "epoch": 0.3000132749236692, + "grad_norm": 0.3521625524575173, + "learning_rate": 4.974181876698192e-06, + "loss": 0.592, + "step": 1130 + }, + { + "epoch": 0.30027877339705294, + "grad_norm": 0.4093245197064176, + "learning_rate": 4.974131806760637e-06, + "loss": 0.6153, + "step": 1131 + }, + { + "epoch": 0.30054427187043675, + "grad_norm": 0.3747038691130968, + "learning_rate": 4.974081688571455e-06, + "loss": 0.6516, + "step": 1132 + }, + { + "epoch": 0.3008097703438205, + "grad_norm": 0.3728344093015197, + "learning_rate": 4.974031522131621e-06, + "loss": 0.637, + "step": 1133 + }, + { + "epoch": 0.3010752688172043, + "grad_norm": 0.36801062969007814, + "learning_rate": 4.973981307442116e-06, + "loss": 0.6617, + "step": 1134 + }, + { + "epoch": 0.30134076729058806, + "grad_norm": 0.37359690290288783, + "learning_rate": 4.973931044503918e-06, + "loss": 0.6944, + "step": 1135 + }, + { + "epoch": 0.30160626576397187, + "grad_norm": 0.3616515553973182, + "learning_rate": 4.973880733318007e-06, + "loss": 0.6712, + "step": 1136 + }, + { + "epoch": 0.3018717642373556, + "grad_norm": 0.35759397663643866, + "learning_rate": 4.973830373885365e-06, + "loss": 0.5899, + "step": 1137 + }, + { + "epoch": 0.30213726271073943, + "grad_norm": 0.3649548838358172, + "learning_rate": 4.973779966206974e-06, + "loss": 0.6367, + "step": 1138 + }, + { + "epoch": 0.3024027611841232, + "grad_norm": 0.35978835326881975, + "learning_rate": 4.973729510283816e-06, + "loss": 0.6402, + "step": 1139 + }, + { + "epoch": 0.302668259657507, + "grad_norm": 0.3574868347892176, + "learning_rate": 4.973679006116876e-06, + "loss": 0.6389, + "step": 1140 + }, + { + "epoch": 0.30293375813089074, + "grad_norm": 0.3592349428526638, + "learning_rate": 4.973628453707139e-06, + "loss": 0.6339, + "step": 1141 + }, + { + "epoch": 0.30319925660427455, + "grad_norm": 0.36513790765039866, + "learning_rate": 4.973577853055591e-06, + "loss": 0.6356, + "step": 1142 + }, + { + "epoch": 0.3034647550776583, + "grad_norm": 0.35776671262425036, + "learning_rate": 4.973527204163218e-06, + "loss": 0.6463, + "step": 1143 + }, + { + "epoch": 0.3037302535510421, + "grad_norm": 0.39145078042075204, + "learning_rate": 4.9734765070310085e-06, + "loss": 0.6121, + "step": 1144 + }, + { + "epoch": 0.30399575202442586, + "grad_norm": 0.38454986651587336, + "learning_rate": 4.973425761659952e-06, + "loss": 0.6419, + "step": 1145 + }, + { + "epoch": 0.3042612504978096, + "grad_norm": 0.3651720822252698, + "learning_rate": 4.973374968051035e-06, + "loss": 0.6789, + "step": 1146 + }, + { + "epoch": 0.3045267489711934, + "grad_norm": 0.37266762022796446, + "learning_rate": 4.973324126205251e-06, + "loss": 0.6573, + "step": 1147 + }, + { + "epoch": 0.30479224744457717, + "grad_norm": 0.36583017222514574, + "learning_rate": 4.973273236123592e-06, + "loss": 0.6436, + "step": 1148 + }, + { + "epoch": 0.305057745917961, + "grad_norm": 0.36984308792531545, + "learning_rate": 4.973222297807048e-06, + "loss": 0.654, + "step": 1149 + }, + { + "epoch": 0.30532324439134473, + "grad_norm": 0.36296298997708737, + "learning_rate": 4.9731713112566144e-06, + "loss": 0.6213, + "step": 1150 + }, + { + "epoch": 0.30558874286472854, + "grad_norm": 0.3969882923530544, + "learning_rate": 4.973120276473284e-06, + "loss": 0.6303, + "step": 1151 + }, + { + "epoch": 0.3058542413381123, + "grad_norm": 0.3709599941947518, + "learning_rate": 4.9730691934580545e-06, + "loss": 0.6151, + "step": 1152 + }, + { + "epoch": 0.3061197398114961, + "grad_norm": 0.3712133967684035, + "learning_rate": 4.973018062211919e-06, + "loss": 0.6542, + "step": 1153 + }, + { + "epoch": 0.30638523828487985, + "grad_norm": 0.3686728439426148, + "learning_rate": 4.972966882735876e-06, + "loss": 0.6572, + "step": 1154 + }, + { + "epoch": 0.30665073675826365, + "grad_norm": 0.36155246211334735, + "learning_rate": 4.972915655030926e-06, + "loss": 0.633, + "step": 1155 + }, + { + "epoch": 0.3069162352316474, + "grad_norm": 0.3647944810341329, + "learning_rate": 4.972864379098064e-06, + "loss": 0.6331, + "step": 1156 + }, + { + "epoch": 0.3071817337050312, + "grad_norm": 0.3969342707637075, + "learning_rate": 4.972813054938293e-06, + "loss": 0.6688, + "step": 1157 + }, + { + "epoch": 0.30744723217841496, + "grad_norm": 0.37933894707453275, + "learning_rate": 4.972761682552613e-06, + "loss": 0.6509, + "step": 1158 + }, + { + "epoch": 0.30771273065179877, + "grad_norm": 0.3952402592545023, + "learning_rate": 4.9727102619420265e-06, + "loss": 0.6603, + "step": 1159 + }, + { + "epoch": 0.3079782291251825, + "grad_norm": 0.43888487689848305, + "learning_rate": 4.972658793107534e-06, + "loss": 0.6456, + "step": 1160 + }, + { + "epoch": 0.30824372759856633, + "grad_norm": 0.38915812338524325, + "learning_rate": 4.972607276050142e-06, + "loss": 0.6755, + "step": 1161 + }, + { + "epoch": 0.3085092260719501, + "grad_norm": 0.4565404388718119, + "learning_rate": 4.972555710770854e-06, + "loss": 0.5976, + "step": 1162 + }, + { + "epoch": 0.3087747245453339, + "grad_norm": 0.38584294050783047, + "learning_rate": 4.9725040972706755e-06, + "loss": 0.6396, + "step": 1163 + }, + { + "epoch": 0.30904022301871764, + "grad_norm": 0.3904123684938697, + "learning_rate": 4.9724524355506135e-06, + "loss": 0.6219, + "step": 1164 + }, + { + "epoch": 0.3093057214921014, + "grad_norm": 0.39446221328058956, + "learning_rate": 4.9724007256116755e-06, + "loss": 0.6141, + "step": 1165 + }, + { + "epoch": 0.3095712199654852, + "grad_norm": 0.34872890334357637, + "learning_rate": 4.9723489674548695e-06, + "loss": 0.6082, + "step": 1166 + }, + { + "epoch": 0.30983671843886895, + "grad_norm": 0.3779781026484708, + "learning_rate": 4.972297161081205e-06, + "loss": 0.6445, + "step": 1167 + }, + { + "epoch": 0.31010221691225276, + "grad_norm": 0.4612015309967254, + "learning_rate": 4.972245306491694e-06, + "loss": 0.6357, + "step": 1168 + }, + { + "epoch": 0.3103677153856365, + "grad_norm": 0.376288731974902, + "learning_rate": 4.9721934036873455e-06, + "loss": 0.6445, + "step": 1169 + }, + { + "epoch": 0.3106332138590203, + "grad_norm": 0.39029455733963414, + "learning_rate": 4.972141452669172e-06, + "loss": 0.6836, + "step": 1170 + }, + { + "epoch": 0.31089871233240407, + "grad_norm": 0.41205089113636817, + "learning_rate": 4.972089453438189e-06, + "loss": 0.6622, + "step": 1171 + }, + { + "epoch": 0.3111642108057879, + "grad_norm": 0.3806204577440277, + "learning_rate": 4.972037405995408e-06, + "loss": 0.6608, + "step": 1172 + }, + { + "epoch": 0.31142970927917163, + "grad_norm": 0.37423120088296086, + "learning_rate": 4.971985310341845e-06, + "loss": 0.6389, + "step": 1173 + }, + { + "epoch": 0.31169520775255544, + "grad_norm": 0.38961077240672787, + "learning_rate": 4.971933166478516e-06, + "loss": 0.6384, + "step": 1174 + }, + { + "epoch": 0.3119607062259392, + "grad_norm": 0.39326611404176093, + "learning_rate": 4.971880974406438e-06, + "loss": 0.6493, + "step": 1175 + }, + { + "epoch": 0.312226204699323, + "grad_norm": 0.3773472668067406, + "learning_rate": 4.971828734126628e-06, + "loss": 0.6402, + "step": 1176 + }, + { + "epoch": 0.31249170317270675, + "grad_norm": 0.37634222257849803, + "learning_rate": 4.971776445640107e-06, + "loss": 0.6436, + "step": 1177 + }, + { + "epoch": 0.31275720164609055, + "grad_norm": 0.3971968746908418, + "learning_rate": 4.9717241089478925e-06, + "loss": 0.6312, + "step": 1178 + }, + { + "epoch": 0.3130227001194743, + "grad_norm": 0.36898188875095717, + "learning_rate": 4.971671724051006e-06, + "loss": 0.6319, + "step": 1179 + }, + { + "epoch": 0.3132881985928581, + "grad_norm": 0.3735814886195571, + "learning_rate": 4.9716192909504695e-06, + "loss": 0.6995, + "step": 1180 + }, + { + "epoch": 0.31355369706624187, + "grad_norm": 0.43807928720477224, + "learning_rate": 4.971566809647305e-06, + "loss": 0.64, + "step": 1181 + }, + { + "epoch": 0.3138191955396257, + "grad_norm": 0.3554185719143483, + "learning_rate": 4.971514280142536e-06, + "loss": 0.653, + "step": 1182 + }, + { + "epoch": 0.3140846940130094, + "grad_norm": 0.3600671399514037, + "learning_rate": 4.971461702437188e-06, + "loss": 0.6499, + "step": 1183 + }, + { + "epoch": 0.3143501924863932, + "grad_norm": 0.42884570095607055, + "learning_rate": 4.9714090765322855e-06, + "loss": 0.5977, + "step": 1184 + }, + { + "epoch": 0.314615690959777, + "grad_norm": 0.3771822640580587, + "learning_rate": 4.971356402428855e-06, + "loss": 0.6146, + "step": 1185 + }, + { + "epoch": 0.31488118943316074, + "grad_norm": 0.36555143380876054, + "learning_rate": 4.971303680127923e-06, + "loss": 0.6494, + "step": 1186 + }, + { + "epoch": 0.31514668790654454, + "grad_norm": 0.35469507178908677, + "learning_rate": 4.971250909630519e-06, + "loss": 0.6228, + "step": 1187 + }, + { + "epoch": 0.3154121863799283, + "grad_norm": 0.37383246878433135, + "learning_rate": 4.971198090937671e-06, + "loss": 0.6685, + "step": 1188 + }, + { + "epoch": 0.3156776848533121, + "grad_norm": 0.38633289261138815, + "learning_rate": 4.971145224050411e-06, + "loss": 0.6502, + "step": 1189 + }, + { + "epoch": 0.31594318332669585, + "grad_norm": 0.37760264930069487, + "learning_rate": 4.971092308969767e-06, + "loss": 0.6211, + "step": 1190 + }, + { + "epoch": 0.31620868180007966, + "grad_norm": 0.3923110144964888, + "learning_rate": 4.971039345696775e-06, + "loss": 0.6457, + "step": 1191 + }, + { + "epoch": 0.3164741802734634, + "grad_norm": 0.41798910756637, + "learning_rate": 4.970986334232464e-06, + "loss": 0.6165, + "step": 1192 + }, + { + "epoch": 0.3167396787468472, + "grad_norm": 0.37896907857521517, + "learning_rate": 4.970933274577869e-06, + "loss": 0.6131, + "step": 1193 + }, + { + "epoch": 0.31700517722023097, + "grad_norm": 0.3929800177031569, + "learning_rate": 4.970880166734026e-06, + "loss": 0.6109, + "step": 1194 + }, + { + "epoch": 0.3172706756936148, + "grad_norm": 0.38625252598325116, + "learning_rate": 4.9708270107019695e-06, + "loss": 0.6512, + "step": 1195 + }, + { + "epoch": 0.31753617416699853, + "grad_norm": 0.396773031617555, + "learning_rate": 4.970773806482738e-06, + "loss": 0.6369, + "step": 1196 + }, + { + "epoch": 0.31780167264038234, + "grad_norm": 0.3928328348883509, + "learning_rate": 4.970720554077366e-06, + "loss": 0.658, + "step": 1197 + }, + { + "epoch": 0.3180671711137661, + "grad_norm": 0.36965240472489613, + "learning_rate": 4.970667253486895e-06, + "loss": 0.6504, + "step": 1198 + }, + { + "epoch": 0.3183326695871499, + "grad_norm": 0.3882045898977093, + "learning_rate": 4.970613904712363e-06, + "loss": 0.6731, + "step": 1199 + }, + { + "epoch": 0.31859816806053365, + "grad_norm": 0.37814287715143236, + "learning_rate": 4.970560507754811e-06, + "loss": 0.6302, + "step": 1200 + }, + { + "epoch": 0.31886366653391746, + "grad_norm": 0.40819492057896173, + "learning_rate": 4.970507062615279e-06, + "loss": 0.626, + "step": 1201 + }, + { + "epoch": 0.3191291650073012, + "grad_norm": 0.37598809384418796, + "learning_rate": 4.970453569294812e-06, + "loss": 0.67, + "step": 1202 + }, + { + "epoch": 0.31939466348068496, + "grad_norm": 0.3820873853598865, + "learning_rate": 4.970400027794451e-06, + "loss": 0.6703, + "step": 1203 + }, + { + "epoch": 0.31966016195406877, + "grad_norm": 0.3840650953228521, + "learning_rate": 4.970346438115241e-06, + "loss": 0.7007, + "step": 1204 + }, + { + "epoch": 0.3199256604274525, + "grad_norm": 0.3833331826383565, + "learning_rate": 4.970292800258227e-06, + "loss": 0.6881, + "step": 1205 + }, + { + "epoch": 0.3201911589008363, + "grad_norm": 0.3883360600368711, + "learning_rate": 4.970239114224455e-06, + "loss": 0.6386, + "step": 1206 + }, + { + "epoch": 0.3204566573742201, + "grad_norm": 0.38546498480970454, + "learning_rate": 4.9701853800149715e-06, + "loss": 0.6408, + "step": 1207 + }, + { + "epoch": 0.3207221558476039, + "grad_norm": 0.39736338249726827, + "learning_rate": 4.970131597630826e-06, + "loss": 0.6383, + "step": 1208 + }, + { + "epoch": 0.32098765432098764, + "grad_norm": 0.3750662089816997, + "learning_rate": 4.970077767073066e-06, + "loss": 0.6497, + "step": 1209 + }, + { + "epoch": 0.32125315279437144, + "grad_norm": 0.37614692491962076, + "learning_rate": 4.970023888342742e-06, + "loss": 0.6481, + "step": 1210 + }, + { + "epoch": 0.3215186512677552, + "grad_norm": 0.39398809434318954, + "learning_rate": 4.969969961440904e-06, + "loss": 0.6603, + "step": 1211 + }, + { + "epoch": 0.321784149741139, + "grad_norm": 0.37903198746645655, + "learning_rate": 4.969915986368604e-06, + "loss": 0.6394, + "step": 1212 + }, + { + "epoch": 0.32204964821452275, + "grad_norm": 0.3806943211152248, + "learning_rate": 4.969861963126896e-06, + "loss": 0.6852, + "step": 1213 + }, + { + "epoch": 0.32231514668790656, + "grad_norm": 0.378411613698888, + "learning_rate": 4.9698078917168315e-06, + "loss": 0.6596, + "step": 1214 + }, + { + "epoch": 0.3225806451612903, + "grad_norm": 0.37293848988846345, + "learning_rate": 4.969753772139466e-06, + "loss": 0.6292, + "step": 1215 + }, + { + "epoch": 0.3228461436346741, + "grad_norm": 0.3871294240192617, + "learning_rate": 4.969699604395856e-06, + "loss": 0.6813, + "step": 1216 + }, + { + "epoch": 0.3231116421080579, + "grad_norm": 0.38562512359951995, + "learning_rate": 4.969645388487056e-06, + "loss": 0.6322, + "step": 1217 + }, + { + "epoch": 0.3233771405814417, + "grad_norm": 0.37858135306273044, + "learning_rate": 4.969591124414124e-06, + "loss": 0.6487, + "step": 1218 + }, + { + "epoch": 0.32364263905482543, + "grad_norm": 0.38742082721166776, + "learning_rate": 4.969536812178119e-06, + "loss": 0.6275, + "step": 1219 + }, + { + "epoch": 0.32390813752820924, + "grad_norm": 0.3949888770742023, + "learning_rate": 4.969482451780099e-06, + "loss": 0.6512, + "step": 1220 + }, + { + "epoch": 0.324173636001593, + "grad_norm": 0.37362786748858295, + "learning_rate": 4.969428043221126e-06, + "loss": 0.6462, + "step": 1221 + }, + { + "epoch": 0.32443913447497674, + "grad_norm": 0.3854571710583223, + "learning_rate": 4.969373586502258e-06, + "loss": 0.6899, + "step": 1222 + }, + { + "epoch": 0.32470463294836055, + "grad_norm": 0.37609556159438146, + "learning_rate": 4.9693190816245615e-06, + "loss": 0.6941, + "step": 1223 + }, + { + "epoch": 0.3249701314217443, + "grad_norm": 0.376576559340508, + "learning_rate": 4.969264528589095e-06, + "loss": 0.6507, + "step": 1224 + }, + { + "epoch": 0.3252356298951281, + "grad_norm": 0.3759310587412502, + "learning_rate": 4.9692099273969255e-06, + "loss": 0.6347, + "step": 1225 + }, + { + "epoch": 0.32550112836851186, + "grad_norm": 0.3677685461171089, + "learning_rate": 4.9691552780491165e-06, + "loss": 0.642, + "step": 1226 + }, + { + "epoch": 0.32576662684189567, + "grad_norm": 0.43429630087021026, + "learning_rate": 4.969100580546734e-06, + "loss": 0.655, + "step": 1227 + }, + { + "epoch": 0.3260321253152794, + "grad_norm": 0.3972913905485784, + "learning_rate": 4.969045834890844e-06, + "loss": 0.6222, + "step": 1228 + }, + { + "epoch": 0.3262976237886632, + "grad_norm": 0.39411628890650474, + "learning_rate": 4.968991041082516e-06, + "loss": 0.6367, + "step": 1229 + }, + { + "epoch": 0.326563122262047, + "grad_norm": 0.4121478949385552, + "learning_rate": 4.968936199122818e-06, + "loss": 0.6576, + "step": 1230 + }, + { + "epoch": 0.3268286207354308, + "grad_norm": 0.39351689900550935, + "learning_rate": 4.968881309012817e-06, + "loss": 0.671, + "step": 1231 + }, + { + "epoch": 0.32709411920881454, + "grad_norm": 0.40431089792915353, + "learning_rate": 4.968826370753588e-06, + "loss": 0.6297, + "step": 1232 + }, + { + "epoch": 0.32735961768219835, + "grad_norm": 0.4155017267907915, + "learning_rate": 4.968771384346199e-06, + "loss": 0.6343, + "step": 1233 + }, + { + "epoch": 0.3276251161555821, + "grad_norm": 0.39505094820926934, + "learning_rate": 4.968716349791723e-06, + "loss": 0.6508, + "step": 1234 + }, + { + "epoch": 0.3278906146289659, + "grad_norm": 0.3900439205266144, + "learning_rate": 4.968661267091234e-06, + "loss": 0.6651, + "step": 1235 + }, + { + "epoch": 0.32815611310234966, + "grad_norm": 0.3838567287234332, + "learning_rate": 4.968606136245806e-06, + "loss": 0.6784, + "step": 1236 + }, + { + "epoch": 0.32842161157573346, + "grad_norm": 0.3692972630077756, + "learning_rate": 4.968550957256515e-06, + "loss": 0.6597, + "step": 1237 + }, + { + "epoch": 0.3286871100491172, + "grad_norm": 0.3787752976099921, + "learning_rate": 4.968495730124435e-06, + "loss": 0.5861, + "step": 1238 + }, + { + "epoch": 0.328952608522501, + "grad_norm": 0.385960339456465, + "learning_rate": 4.968440454850645e-06, + "loss": 0.6413, + "step": 1239 + }, + { + "epoch": 0.3292181069958848, + "grad_norm": 0.38108364058089694, + "learning_rate": 4.968385131436222e-06, + "loss": 0.6373, + "step": 1240 + }, + { + "epoch": 0.3294836054692685, + "grad_norm": 0.37361168543067974, + "learning_rate": 4.968329759882245e-06, + "loss": 0.63, + "step": 1241 + }, + { + "epoch": 0.32974910394265233, + "grad_norm": 0.38063184677826034, + "learning_rate": 4.968274340189795e-06, + "loss": 0.6595, + "step": 1242 + }, + { + "epoch": 0.3300146024160361, + "grad_norm": 0.38266301382281803, + "learning_rate": 4.968218872359952e-06, + "loss": 0.6829, + "step": 1243 + }, + { + "epoch": 0.3302801008894199, + "grad_norm": 0.3743457762433329, + "learning_rate": 4.968163356393797e-06, + "loss": 0.642, + "step": 1244 + }, + { + "epoch": 0.33054559936280364, + "grad_norm": 0.37673658524483494, + "learning_rate": 4.968107792292414e-06, + "loss": 0.6515, + "step": 1245 + }, + { + "epoch": 0.33081109783618745, + "grad_norm": 0.399175358291409, + "learning_rate": 4.9680521800568856e-06, + "loss": 0.6282, + "step": 1246 + }, + { + "epoch": 0.3310765963095712, + "grad_norm": 0.3575619463721458, + "learning_rate": 4.967996519688298e-06, + "loss": 0.6574, + "step": 1247 + }, + { + "epoch": 0.331342094782955, + "grad_norm": 0.374835351902063, + "learning_rate": 4.967940811187733e-06, + "loss": 0.6934, + "step": 1248 + }, + { + "epoch": 0.33160759325633876, + "grad_norm": 0.3718438462752692, + "learning_rate": 4.967885054556283e-06, + "loss": 0.6369, + "step": 1249 + }, + { + "epoch": 0.33187309172972257, + "grad_norm": 0.3915516002224567, + "learning_rate": 4.96782924979503e-06, + "loss": 0.6553, + "step": 1250 + }, + { + "epoch": 0.3321385902031063, + "grad_norm": 0.3931536697264864, + "learning_rate": 4.967773396905065e-06, + "loss": 0.6474, + "step": 1251 + }, + { + "epoch": 0.33240408867649013, + "grad_norm": 0.35748610097975514, + "learning_rate": 4.967717495887476e-06, + "loss": 0.6158, + "step": 1252 + }, + { + "epoch": 0.3326695871498739, + "grad_norm": 0.37916638694158805, + "learning_rate": 4.967661546743354e-06, + "loss": 0.6333, + "step": 1253 + }, + { + "epoch": 0.3329350856232577, + "grad_norm": 0.3634201334581195, + "learning_rate": 4.96760554947379e-06, + "loss": 0.6352, + "step": 1254 + }, + { + "epoch": 0.33320058409664144, + "grad_norm": 0.3775606423853063, + "learning_rate": 4.967549504079876e-06, + "loss": 0.6559, + "step": 1255 + }, + { + "epoch": 0.33346608257002525, + "grad_norm": 0.3823992935295063, + "learning_rate": 4.967493410562705e-06, + "loss": 0.638, + "step": 1256 + }, + { + "epoch": 0.333731581043409, + "grad_norm": 0.3752163493940841, + "learning_rate": 4.967437268923372e-06, + "loss": 0.6747, + "step": 1257 + }, + { + "epoch": 0.3339970795167928, + "grad_norm": 0.3737054649051383, + "learning_rate": 4.96738107916297e-06, + "loss": 0.5964, + "step": 1258 + }, + { + "epoch": 0.33426257799017656, + "grad_norm": 0.38471152194156755, + "learning_rate": 4.967324841282596e-06, + "loss": 0.613, + "step": 1259 + }, + { + "epoch": 0.3345280764635603, + "grad_norm": 0.37944769648714166, + "learning_rate": 4.967268555283346e-06, + "loss": 0.6418, + "step": 1260 + }, + { + "epoch": 0.3347935749369441, + "grad_norm": 0.3651988063579846, + "learning_rate": 4.967212221166319e-06, + "loss": 0.6454, + "step": 1261 + }, + { + "epoch": 0.33505907341032787, + "grad_norm": 0.36961299995464575, + "learning_rate": 4.967155838932613e-06, + "loss": 0.6338, + "step": 1262 + }, + { + "epoch": 0.3353245718837117, + "grad_norm": 0.377312244983752, + "learning_rate": 4.967099408583327e-06, + "loss": 0.6377, + "step": 1263 + }, + { + "epoch": 0.3355900703570954, + "grad_norm": 0.3877479803620617, + "learning_rate": 4.967042930119562e-06, + "loss": 0.6924, + "step": 1264 + }, + { + "epoch": 0.33585556883047923, + "grad_norm": 0.3722360404758459, + "learning_rate": 4.9669864035424195e-06, + "loss": 0.6402, + "step": 1265 + }, + { + "epoch": 0.336121067303863, + "grad_norm": 0.3861384724525087, + "learning_rate": 4.966929828853002e-06, + "loss": 0.6144, + "step": 1266 + }, + { + "epoch": 0.3363865657772468, + "grad_norm": 0.38293062568547903, + "learning_rate": 4.966873206052413e-06, + "loss": 0.6273, + "step": 1267 + }, + { + "epoch": 0.33665206425063054, + "grad_norm": 0.37043997405393947, + "learning_rate": 4.966816535141756e-06, + "loss": 0.6153, + "step": 1268 + }, + { + "epoch": 0.33691756272401435, + "grad_norm": 0.3865129056209508, + "learning_rate": 4.9667598161221374e-06, + "loss": 0.6197, + "step": 1269 + }, + { + "epoch": 0.3371830611973981, + "grad_norm": 0.3913776831297648, + "learning_rate": 4.966703048994662e-06, + "loss": 0.628, + "step": 1270 + }, + { + "epoch": 0.3374485596707819, + "grad_norm": 0.3926460415222075, + "learning_rate": 4.9666462337604386e-06, + "loss": 0.6403, + "step": 1271 + }, + { + "epoch": 0.33771405814416566, + "grad_norm": 0.3821552611169409, + "learning_rate": 4.966589370420573e-06, + "loss": 0.6668, + "step": 1272 + }, + { + "epoch": 0.33797955661754947, + "grad_norm": 0.3695640890804788, + "learning_rate": 4.966532458976176e-06, + "loss": 0.6226, + "step": 1273 + }, + { + "epoch": 0.3382450550909332, + "grad_norm": 0.34457327673863775, + "learning_rate": 4.966475499428357e-06, + "loss": 0.6027, + "step": 1274 + }, + { + "epoch": 0.33851055356431703, + "grad_norm": 0.37633149095083684, + "learning_rate": 4.9664184917782275e-06, + "loss": 0.6922, + "step": 1275 + }, + { + "epoch": 0.3387760520377008, + "grad_norm": 0.35822937825563533, + "learning_rate": 4.966361436026898e-06, + "loss": 0.6536, + "step": 1276 + }, + { + "epoch": 0.3390415505110846, + "grad_norm": 0.36824771473329787, + "learning_rate": 4.966304332175482e-06, + "loss": 0.6236, + "step": 1277 + }, + { + "epoch": 0.33930704898446834, + "grad_norm": 0.4191537794706296, + "learning_rate": 4.966247180225092e-06, + "loss": 0.6761, + "step": 1278 + }, + { + "epoch": 0.3395725474578521, + "grad_norm": 0.36977002305745066, + "learning_rate": 4.966189980176844e-06, + "loss": 0.6244, + "step": 1279 + }, + { + "epoch": 0.3398380459312359, + "grad_norm": 0.3697939815838604, + "learning_rate": 4.966132732031854e-06, + "loss": 0.6382, + "step": 1280 + }, + { + "epoch": 0.34010354440461965, + "grad_norm": 0.3647216931300155, + "learning_rate": 4.966075435791237e-06, + "loss": 0.6665, + "step": 1281 + }, + { + "epoch": 0.34036904287800346, + "grad_norm": 0.3877494990176763, + "learning_rate": 4.96601809145611e-06, + "loss": 0.6025, + "step": 1282 + }, + { + "epoch": 0.3406345413513872, + "grad_norm": 0.37051861749202386, + "learning_rate": 4.965960699027594e-06, + "loss": 0.65, + "step": 1283 + }, + { + "epoch": 0.340900039824771, + "grad_norm": 0.40310083557088616, + "learning_rate": 4.965903258506806e-06, + "loss": 0.611, + "step": 1284 + }, + { + "epoch": 0.34116553829815477, + "grad_norm": 0.3805140330371282, + "learning_rate": 4.965845769894867e-06, + "loss": 0.6752, + "step": 1285 + }, + { + "epoch": 0.3414310367715386, + "grad_norm": 0.3686700457380233, + "learning_rate": 4.965788233192898e-06, + "loss": 0.6036, + "step": 1286 + }, + { + "epoch": 0.34169653524492233, + "grad_norm": 0.38530850242444353, + "learning_rate": 4.965730648402021e-06, + "loss": 0.6322, + "step": 1287 + }, + { + "epoch": 0.34196203371830614, + "grad_norm": 0.39369770405184756, + "learning_rate": 4.965673015523359e-06, + "loss": 0.6518, + "step": 1288 + }, + { + "epoch": 0.3422275321916899, + "grad_norm": 0.37569563153017255, + "learning_rate": 4.9656153345580375e-06, + "loss": 0.6552, + "step": 1289 + }, + { + "epoch": 0.3424930306650737, + "grad_norm": 0.3853621467619779, + "learning_rate": 4.9655576055071786e-06, + "loss": 0.6294, + "step": 1290 + }, + { + "epoch": 0.34275852913845745, + "grad_norm": 0.38634565588761144, + "learning_rate": 4.96549982837191e-06, + "loss": 0.626, + "step": 1291 + }, + { + "epoch": 0.34302402761184125, + "grad_norm": 0.36609973965410436, + "learning_rate": 4.965442003153359e-06, + "loss": 0.6097, + "step": 1292 + }, + { + "epoch": 0.343289526085225, + "grad_norm": 0.38447990687036715, + "learning_rate": 4.965384129852652e-06, + "loss": 0.6676, + "step": 1293 + }, + { + "epoch": 0.3435550245586088, + "grad_norm": 0.3835162570357552, + "learning_rate": 4.965326208470917e-06, + "loss": 0.5892, + "step": 1294 + }, + { + "epoch": 0.34382052303199256, + "grad_norm": 0.3680716196904, + "learning_rate": 4.965268239009286e-06, + "loss": 0.6001, + "step": 1295 + }, + { + "epoch": 0.34408602150537637, + "grad_norm": 0.38077278109458834, + "learning_rate": 4.965210221468888e-06, + "loss": 0.694, + "step": 1296 + }, + { + "epoch": 0.3443515199787601, + "grad_norm": 0.3874965431285721, + "learning_rate": 4.965152155850855e-06, + "loss": 0.6675, + "step": 1297 + }, + { + "epoch": 0.3446170184521439, + "grad_norm": 0.3739594745659148, + "learning_rate": 4.965094042156319e-06, + "loss": 0.6409, + "step": 1298 + }, + { + "epoch": 0.3448825169255277, + "grad_norm": 0.3797731927777758, + "learning_rate": 4.965035880386414e-06, + "loss": 0.6611, + "step": 1299 + }, + { + "epoch": 0.34514801539891143, + "grad_norm": 0.37648059503370057, + "learning_rate": 4.9649776705422735e-06, + "loss": 0.6424, + "step": 1300 + }, + { + "epoch": 0.34541351387229524, + "grad_norm": 0.3926410453862713, + "learning_rate": 4.9649194126250334e-06, + "loss": 0.6594, + "step": 1301 + }, + { + "epoch": 0.345679012345679, + "grad_norm": 0.36104628330663807, + "learning_rate": 4.964861106635829e-06, + "loss": 0.6184, + "step": 1302 + }, + { + "epoch": 0.3459445108190628, + "grad_norm": 0.3823712939991956, + "learning_rate": 4.964802752575799e-06, + "loss": 0.6558, + "step": 1303 + }, + { + "epoch": 0.34621000929244655, + "grad_norm": 0.3854073436797071, + "learning_rate": 4.96474435044608e-06, + "loss": 0.6405, + "step": 1304 + }, + { + "epoch": 0.34647550776583036, + "grad_norm": 0.36896677281817614, + "learning_rate": 4.964685900247811e-06, + "loss": 0.6548, + "step": 1305 + }, + { + "epoch": 0.3467410062392141, + "grad_norm": 0.37470596844469567, + "learning_rate": 4.964627401982133e-06, + "loss": 0.6731, + "step": 1306 + }, + { + "epoch": 0.3470065047125979, + "grad_norm": 0.3701356096650525, + "learning_rate": 4.964568855650186e-06, + "loss": 0.6402, + "step": 1307 + }, + { + "epoch": 0.34727200318598167, + "grad_norm": 0.376241418411062, + "learning_rate": 4.9645102612531116e-06, + "loss": 0.6328, + "step": 1308 + }, + { + "epoch": 0.3475375016593655, + "grad_norm": 0.3661483028009873, + "learning_rate": 4.964451618792053e-06, + "loss": 0.62, + "step": 1309 + }, + { + "epoch": 0.34780300013274923, + "grad_norm": 0.3624725041571129, + "learning_rate": 4.964392928268154e-06, + "loss": 0.625, + "step": 1310 + }, + { + "epoch": 0.34806849860613304, + "grad_norm": 0.3870982359232708, + "learning_rate": 4.964334189682559e-06, + "loss": 0.6771, + "step": 1311 + }, + { + "epoch": 0.3483339970795168, + "grad_norm": 0.37174773107257353, + "learning_rate": 4.964275403036414e-06, + "loss": 0.6319, + "step": 1312 + }, + { + "epoch": 0.3485994955529006, + "grad_norm": 0.36899297737022696, + "learning_rate": 4.964216568330864e-06, + "loss": 0.6098, + "step": 1313 + }, + { + "epoch": 0.34886499402628435, + "grad_norm": 0.3804981128632557, + "learning_rate": 4.9641576855670585e-06, + "loss": 0.6761, + "step": 1314 + }, + { + "epoch": 0.3491304924996681, + "grad_norm": 0.3804053549888035, + "learning_rate": 4.9640987547461445e-06, + "loss": 0.6596, + "step": 1315 + }, + { + "epoch": 0.3493959909730519, + "grad_norm": 0.39034962730016304, + "learning_rate": 4.964039775869271e-06, + "loss": 0.6487, + "step": 1316 + }, + { + "epoch": 0.34966148944643566, + "grad_norm": 0.3741099395770308, + "learning_rate": 4.96398074893759e-06, + "loss": 0.6605, + "step": 1317 + }, + { + "epoch": 0.34992698791981947, + "grad_norm": 0.36271656310405986, + "learning_rate": 4.963921673952251e-06, + "loss": 0.5872, + "step": 1318 + }, + { + "epoch": 0.3501924863932032, + "grad_norm": 0.3995062957591318, + "learning_rate": 4.963862550914407e-06, + "loss": 0.661, + "step": 1319 + }, + { + "epoch": 0.350457984866587, + "grad_norm": 0.37305610009600587, + "learning_rate": 4.96380337982521e-06, + "loss": 0.6496, + "step": 1320 + }, + { + "epoch": 0.3507234833399708, + "grad_norm": 0.3801367848683364, + "learning_rate": 4.963744160685815e-06, + "loss": 0.6137, + "step": 1321 + }, + { + "epoch": 0.3509889818133546, + "grad_norm": 0.3955362303938955, + "learning_rate": 4.963684893497377e-06, + "loss": 0.6405, + "step": 1322 + }, + { + "epoch": 0.35125448028673834, + "grad_norm": 0.37893790310795683, + "learning_rate": 4.963625578261051e-06, + "loss": 0.6393, + "step": 1323 + }, + { + "epoch": 0.35151997876012214, + "grad_norm": 0.3788019859897635, + "learning_rate": 4.963566214977994e-06, + "loss": 0.6382, + "step": 1324 + }, + { + "epoch": 0.3517854772335059, + "grad_norm": 0.38654949916511866, + "learning_rate": 4.963506803649364e-06, + "loss": 0.5996, + "step": 1325 + }, + { + "epoch": 0.3520509757068897, + "grad_norm": 0.40307788144519574, + "learning_rate": 4.9634473442763195e-06, + "loss": 0.6813, + "step": 1326 + }, + { + "epoch": 0.35231647418027345, + "grad_norm": 0.3704607316775162, + "learning_rate": 4.963387836860021e-06, + "loss": 0.6259, + "step": 1327 + }, + { + "epoch": 0.35258197265365726, + "grad_norm": 0.3735058113907317, + "learning_rate": 4.963328281401627e-06, + "loss": 0.6488, + "step": 1328 + }, + { + "epoch": 0.352847471127041, + "grad_norm": 0.3811243356459918, + "learning_rate": 4.963268677902302e-06, + "loss": 0.6577, + "step": 1329 + }, + { + "epoch": 0.3531129696004248, + "grad_norm": 0.38674222498284955, + "learning_rate": 4.963209026363206e-06, + "loss": 0.6521, + "step": 1330 + }, + { + "epoch": 0.35337846807380857, + "grad_norm": 0.361601234266494, + "learning_rate": 4.963149326785502e-06, + "loss": 0.6282, + "step": 1331 + }, + { + "epoch": 0.3536439665471924, + "grad_norm": 0.3979335219653062, + "learning_rate": 4.963089579170356e-06, + "loss": 0.6428, + "step": 1332 + }, + { + "epoch": 0.35390946502057613, + "grad_norm": 0.3678971039569373, + "learning_rate": 4.963029783518933e-06, + "loss": 0.6606, + "step": 1333 + }, + { + "epoch": 0.3541749634939599, + "grad_norm": 0.36543972095556243, + "learning_rate": 4.962969939832398e-06, + "loss": 0.6243, + "step": 1334 + }, + { + "epoch": 0.3544404619673437, + "grad_norm": 0.3927679539886193, + "learning_rate": 4.962910048111919e-06, + "loss": 0.6293, + "step": 1335 + }, + { + "epoch": 0.35470596044072744, + "grad_norm": 0.4277322929158258, + "learning_rate": 4.962850108358664e-06, + "loss": 0.6122, + "step": 1336 + }, + { + "epoch": 0.35497145891411125, + "grad_norm": 0.3697604390986625, + "learning_rate": 4.962790120573801e-06, + "loss": 0.6617, + "step": 1337 + }, + { + "epoch": 0.355236957387495, + "grad_norm": 0.4369151743487042, + "learning_rate": 4.962730084758501e-06, + "loss": 0.6256, + "step": 1338 + }, + { + "epoch": 0.3555024558608788, + "grad_norm": 0.39076793071676813, + "learning_rate": 4.962670000913935e-06, + "loss": 0.6485, + "step": 1339 + }, + { + "epoch": 0.35576795433426256, + "grad_norm": 0.37779970531006934, + "learning_rate": 4.962609869041275e-06, + "loss": 0.6773, + "step": 1340 + }, + { + "epoch": 0.35603345280764637, + "grad_norm": 0.3876902671823763, + "learning_rate": 4.962549689141692e-06, + "loss": 0.6102, + "step": 1341 + }, + { + "epoch": 0.3562989512810301, + "grad_norm": 0.3764840908133353, + "learning_rate": 4.962489461216361e-06, + "loss": 0.6278, + "step": 1342 + }, + { + "epoch": 0.3565644497544139, + "grad_norm": 0.38494621232872744, + "learning_rate": 4.962429185266457e-06, + "loss": 0.6462, + "step": 1343 + }, + { + "epoch": 0.3568299482277977, + "grad_norm": 0.37072142522297324, + "learning_rate": 4.962368861293153e-06, + "loss": 0.6499, + "step": 1344 + }, + { + "epoch": 0.3570954467011815, + "grad_norm": 0.3717158724599488, + "learning_rate": 4.962308489297628e-06, + "loss": 0.6571, + "step": 1345 + }, + { + "epoch": 0.35736094517456524, + "grad_norm": 0.40404331801254534, + "learning_rate": 4.962248069281059e-06, + "loss": 0.6072, + "step": 1346 + }, + { + "epoch": 0.35762644364794904, + "grad_norm": 0.36336814573957393, + "learning_rate": 4.962187601244625e-06, + "loss": 0.631, + "step": 1347 + }, + { + "epoch": 0.3578919421213328, + "grad_norm": 0.3749020997535401, + "learning_rate": 4.9621270851895035e-06, + "loss": 0.6523, + "step": 1348 + }, + { + "epoch": 0.3581574405947166, + "grad_norm": 0.380765478062254, + "learning_rate": 4.962066521116875e-06, + "loss": 0.6101, + "step": 1349 + }, + { + "epoch": 0.35842293906810035, + "grad_norm": 0.3858701462383024, + "learning_rate": 4.962005909027922e-06, + "loss": 0.6673, + "step": 1350 + }, + { + "epoch": 0.35868843754148416, + "grad_norm": 0.3842253412518075, + "learning_rate": 4.961945248923825e-06, + "loss": 0.6508, + "step": 1351 + }, + { + "epoch": 0.3589539360148679, + "grad_norm": 0.3682174188889406, + "learning_rate": 4.961884540805767e-06, + "loss": 0.5865, + "step": 1352 + }, + { + "epoch": 0.35921943448825167, + "grad_norm": 0.38771166101276655, + "learning_rate": 4.961823784674934e-06, + "loss": 0.6445, + "step": 1353 + }, + { + "epoch": 0.3594849329616355, + "grad_norm": 0.3707244021777986, + "learning_rate": 4.96176298053251e-06, + "loss": 0.587, + "step": 1354 + }, + { + "epoch": 0.3597504314350192, + "grad_norm": 0.3725300854127343, + "learning_rate": 4.9617021283796795e-06, + "loss": 0.6345, + "step": 1355 + }, + { + "epoch": 0.36001592990840303, + "grad_norm": 0.37645402418603974, + "learning_rate": 4.961641228217631e-06, + "loss": 0.6342, + "step": 1356 + }, + { + "epoch": 0.3602814283817868, + "grad_norm": 0.3734347605020842, + "learning_rate": 4.96158028004755e-06, + "loss": 0.6655, + "step": 1357 + }, + { + "epoch": 0.3605469268551706, + "grad_norm": 0.37275528379918504, + "learning_rate": 4.961519283870628e-06, + "loss": 0.6428, + "step": 1358 + }, + { + "epoch": 0.36081242532855434, + "grad_norm": 0.3695152405972768, + "learning_rate": 4.961458239688053e-06, + "loss": 0.6119, + "step": 1359 + }, + { + "epoch": 0.36107792380193815, + "grad_norm": 0.36621105763711564, + "learning_rate": 4.9613971475010145e-06, + "loss": 0.6152, + "step": 1360 + }, + { + "epoch": 0.3613434222753219, + "grad_norm": 0.3719351382979484, + "learning_rate": 4.961336007310706e-06, + "loss": 0.6226, + "step": 1361 + }, + { + "epoch": 0.3616089207487057, + "grad_norm": 0.3894427564540057, + "learning_rate": 4.961274819118319e-06, + "loss": 0.6001, + "step": 1362 + }, + { + "epoch": 0.36187441922208946, + "grad_norm": 0.37337631169965396, + "learning_rate": 4.961213582925046e-06, + "loss": 0.6212, + "step": 1363 + }, + { + "epoch": 0.36213991769547327, + "grad_norm": 0.3753722374202282, + "learning_rate": 4.961152298732083e-06, + "loss": 0.6508, + "step": 1364 + }, + { + "epoch": 0.362405416168857, + "grad_norm": 0.37578329555809076, + "learning_rate": 4.961090966540623e-06, + "loss": 0.6528, + "step": 1365 + }, + { + "epoch": 0.3626709146422408, + "grad_norm": 0.37205930681028027, + "learning_rate": 4.961029586351865e-06, + "loss": 0.6723, + "step": 1366 + }, + { + "epoch": 0.3629364131156246, + "grad_norm": 0.37545613857899063, + "learning_rate": 4.960968158167004e-06, + "loss": 0.6283, + "step": 1367 + }, + { + "epoch": 0.3632019115890084, + "grad_norm": 0.36725230843341256, + "learning_rate": 4.960906681987238e-06, + "loss": 0.6508, + "step": 1368 + }, + { + "epoch": 0.36346741006239214, + "grad_norm": 0.3638765124628343, + "learning_rate": 4.960845157813766e-06, + "loss": 0.6502, + "step": 1369 + }, + { + "epoch": 0.36373290853577595, + "grad_norm": 0.382484017705505, + "learning_rate": 4.96078358564779e-06, + "loss": 0.6503, + "step": 1370 + }, + { + "epoch": 0.3639984070091597, + "grad_norm": 0.38719996839925136, + "learning_rate": 4.960721965490507e-06, + "loss": 0.6506, + "step": 1371 + }, + { + "epoch": 0.36426390548254345, + "grad_norm": 0.36338076391017987, + "learning_rate": 4.960660297343122e-06, + "loss": 0.6311, + "step": 1372 + }, + { + "epoch": 0.36452940395592726, + "grad_norm": 0.35058788659765905, + "learning_rate": 4.9605985812068355e-06, + "loss": 0.6115, + "step": 1373 + }, + { + "epoch": 0.364794902429311, + "grad_norm": 0.39396344960920776, + "learning_rate": 4.960536817082853e-06, + "loss": 0.6346, + "step": 1374 + }, + { + "epoch": 0.3650604009026948, + "grad_norm": 0.37178981434123, + "learning_rate": 4.9604750049723775e-06, + "loss": 0.6965, + "step": 1375 + }, + { + "epoch": 0.36532589937607857, + "grad_norm": 0.3707160699657333, + "learning_rate": 4.960413144876615e-06, + "loss": 0.6204, + "step": 1376 + }, + { + "epoch": 0.3655913978494624, + "grad_norm": 0.36689060022358805, + "learning_rate": 4.960351236796772e-06, + "loss": 0.6772, + "step": 1377 + }, + { + "epoch": 0.3658568963228461, + "grad_norm": 0.3891078559027146, + "learning_rate": 4.960289280734056e-06, + "loss": 0.5861, + "step": 1378 + }, + { + "epoch": 0.36612239479622993, + "grad_norm": 0.36991824090341346, + "learning_rate": 4.960227276689676e-06, + "loss": 0.651, + "step": 1379 + }, + { + "epoch": 0.3663878932696137, + "grad_norm": 0.37552427922271137, + "learning_rate": 4.96016522466484e-06, + "loss": 0.661, + "step": 1380 + }, + { + "epoch": 0.3666533917429975, + "grad_norm": 0.37116303316060584, + "learning_rate": 4.960103124660759e-06, + "loss": 0.6068, + "step": 1381 + }, + { + "epoch": 0.36691889021638124, + "grad_norm": 0.3817038291614676, + "learning_rate": 4.960040976678643e-06, + "loss": 0.6144, + "step": 1382 + }, + { + "epoch": 0.36718438868976505, + "grad_norm": 0.3805664191708131, + "learning_rate": 4.959978780719704e-06, + "loss": 0.6423, + "step": 1383 + }, + { + "epoch": 0.3674498871631488, + "grad_norm": 0.37140788716933093, + "learning_rate": 4.959916536785157e-06, + "loss": 0.6314, + "step": 1384 + }, + { + "epoch": 0.3677153856365326, + "grad_norm": 0.4224130419842736, + "learning_rate": 4.959854244876214e-06, + "loss": 0.6218, + "step": 1385 + }, + { + "epoch": 0.36798088410991636, + "grad_norm": 0.37485670103515806, + "learning_rate": 4.959791904994091e-06, + "loss": 0.6236, + "step": 1386 + }, + { + "epoch": 0.36824638258330017, + "grad_norm": 0.4620162539370904, + "learning_rate": 4.959729517140003e-06, + "loss": 0.6082, + "step": 1387 + }, + { + "epoch": 0.3685118810566839, + "grad_norm": 0.3955744303491755, + "learning_rate": 4.959667081315167e-06, + "loss": 0.6317, + "step": 1388 + }, + { + "epoch": 0.36877737953006773, + "grad_norm": 0.3812914244875187, + "learning_rate": 4.9596045975208e-06, + "loss": 0.6167, + "step": 1389 + }, + { + "epoch": 0.3690428780034515, + "grad_norm": 0.4354104908718216, + "learning_rate": 4.959542065758123e-06, + "loss": 0.6583, + "step": 1390 + }, + { + "epoch": 0.36930837647683523, + "grad_norm": 0.39167267251467797, + "learning_rate": 4.959479486028351e-06, + "loss": 0.6386, + "step": 1391 + }, + { + "epoch": 0.36957387495021904, + "grad_norm": 0.3743109799078965, + "learning_rate": 4.95941685833271e-06, + "loss": 0.5948, + "step": 1392 + }, + { + "epoch": 0.3698393734236028, + "grad_norm": 0.40758134817294345, + "learning_rate": 4.959354182672417e-06, + "loss": 0.661, + "step": 1393 + }, + { + "epoch": 0.3701048718969866, + "grad_norm": 0.421962544737718, + "learning_rate": 4.959291459048696e-06, + "loss": 0.6468, + "step": 1394 + }, + { + "epoch": 0.37037037037037035, + "grad_norm": 0.37499074470520627, + "learning_rate": 4.95922868746277e-06, + "loss": 0.6458, + "step": 1395 + }, + { + "epoch": 0.37063586884375416, + "grad_norm": 0.382070034558638, + "learning_rate": 4.9591658679158635e-06, + "loss": 0.6779, + "step": 1396 + }, + { + "epoch": 0.3709013673171379, + "grad_norm": 0.4111615078041875, + "learning_rate": 4.959103000409202e-06, + "loss": 0.6291, + "step": 1397 + }, + { + "epoch": 0.3711668657905217, + "grad_norm": 0.38482984054466113, + "learning_rate": 4.95904008494401e-06, + "loss": 0.6357, + "step": 1398 + }, + { + "epoch": 0.37143236426390547, + "grad_norm": 0.3935396036411042, + "learning_rate": 4.958977121521516e-06, + "loss": 0.6506, + "step": 1399 + }, + { + "epoch": 0.3716978627372893, + "grad_norm": 0.4085701958297275, + "learning_rate": 4.9589141101429485e-06, + "loss": 0.6383, + "step": 1400 + }, + { + "epoch": 0.371963361210673, + "grad_norm": 0.3767915921767564, + "learning_rate": 4.958851050809534e-06, + "loss": 0.6143, + "step": 1401 + }, + { + "epoch": 0.37222885968405683, + "grad_norm": 0.3687676905981951, + "learning_rate": 4.958787943522505e-06, + "loss": 0.6469, + "step": 1402 + }, + { + "epoch": 0.3724943581574406, + "grad_norm": 0.4003614931457942, + "learning_rate": 4.9587247882830905e-06, + "loss": 0.6387, + "step": 1403 + }, + { + "epoch": 0.3727598566308244, + "grad_norm": 0.4035927947510916, + "learning_rate": 4.958661585092523e-06, + "loss": 0.6268, + "step": 1404 + }, + { + "epoch": 0.37302535510420815, + "grad_norm": 0.38131953814588676, + "learning_rate": 4.958598333952034e-06, + "loss": 0.6516, + "step": 1405 + }, + { + "epoch": 0.37329085357759195, + "grad_norm": 0.3764012535717099, + "learning_rate": 4.958535034862857e-06, + "loss": 0.6515, + "step": 1406 + }, + { + "epoch": 0.3735563520509757, + "grad_norm": 0.4153391344046764, + "learning_rate": 4.9584716878262285e-06, + "loss": 0.6385, + "step": 1407 + }, + { + "epoch": 0.3738218505243595, + "grad_norm": 0.3697704117982249, + "learning_rate": 4.958408292843383e-06, + "loss": 0.6251, + "step": 1408 + }, + { + "epoch": 0.37408734899774326, + "grad_norm": 0.38153125902827845, + "learning_rate": 4.958344849915556e-06, + "loss": 0.5958, + "step": 1409 + }, + { + "epoch": 0.374352847471127, + "grad_norm": 0.3764632124972295, + "learning_rate": 4.958281359043985e-06, + "loss": 0.6227, + "step": 1410 + }, + { + "epoch": 0.3746183459445108, + "grad_norm": 0.3699439558545008, + "learning_rate": 4.958217820229909e-06, + "loss": 0.6523, + "step": 1411 + }, + { + "epoch": 0.3748838444178946, + "grad_norm": 0.3787900759276024, + "learning_rate": 4.958154233474567e-06, + "loss": 0.6762, + "step": 1412 + }, + { + "epoch": 0.3751493428912784, + "grad_norm": 0.3850429812791596, + "learning_rate": 4.958090598779199e-06, + "loss": 0.6577, + "step": 1413 + }, + { + "epoch": 0.37541484136466213, + "grad_norm": 0.37732513930131806, + "learning_rate": 4.9580269161450455e-06, + "loss": 0.5914, + "step": 1414 + }, + { + "epoch": 0.37568033983804594, + "grad_norm": 0.3908931497346101, + "learning_rate": 4.957963185573349e-06, + "loss": 0.649, + "step": 1415 + }, + { + "epoch": 0.3759458383114297, + "grad_norm": 0.369642313827512, + "learning_rate": 4.957899407065352e-06, + "loss": 0.6427, + "step": 1416 + }, + { + "epoch": 0.3762113367848135, + "grad_norm": 0.3696632006440851, + "learning_rate": 4.957835580622298e-06, + "loss": 0.6002, + "step": 1417 + }, + { + "epoch": 0.37647683525819725, + "grad_norm": 0.3866127348878271, + "learning_rate": 4.9577717062454336e-06, + "loss": 0.6607, + "step": 1418 + }, + { + "epoch": 0.37674233373158106, + "grad_norm": 0.3780464404386548, + "learning_rate": 4.957707783936002e-06, + "loss": 0.6527, + "step": 1419 + }, + { + "epoch": 0.3770078322049648, + "grad_norm": 0.3912925028662332, + "learning_rate": 4.9576438136952525e-06, + "loss": 0.636, + "step": 1420 + }, + { + "epoch": 0.3772733306783486, + "grad_norm": 0.38704762134675663, + "learning_rate": 4.9575797955244306e-06, + "loss": 0.6182, + "step": 1421 + }, + { + "epoch": 0.37753882915173237, + "grad_norm": 0.38299667748331434, + "learning_rate": 4.9575157294247855e-06, + "loss": 0.6587, + "step": 1422 + }, + { + "epoch": 0.3778043276251162, + "grad_norm": 0.39125377495109814, + "learning_rate": 4.957451615397567e-06, + "loss": 0.6757, + "step": 1423 + }, + { + "epoch": 0.37806982609849993, + "grad_norm": 0.37718180543445884, + "learning_rate": 4.957387453444024e-06, + "loss": 0.6643, + "step": 1424 + }, + { + "epoch": 0.37833532457188374, + "grad_norm": 0.3735075398585431, + "learning_rate": 4.95732324356541e-06, + "loss": 0.6618, + "step": 1425 + }, + { + "epoch": 0.3786008230452675, + "grad_norm": 0.3714641315208511, + "learning_rate": 4.957258985762976e-06, + "loss": 0.6656, + "step": 1426 + }, + { + "epoch": 0.3788663215186513, + "grad_norm": 0.37303200700408046, + "learning_rate": 4.957194680037976e-06, + "loss": 0.6345, + "step": 1427 + }, + { + "epoch": 0.37913181999203505, + "grad_norm": 0.3769124858903932, + "learning_rate": 4.957130326391662e-06, + "loss": 0.6449, + "step": 1428 + }, + { + "epoch": 0.3793973184654188, + "grad_norm": 0.4037711755596157, + "learning_rate": 4.957065924825291e-06, + "loss": 0.6188, + "step": 1429 + }, + { + "epoch": 0.3796628169388026, + "grad_norm": 0.38385396358345264, + "learning_rate": 4.957001475340119e-06, + "loss": 0.6648, + "step": 1430 + }, + { + "epoch": 0.37992831541218636, + "grad_norm": 0.3855857786816034, + "learning_rate": 4.956936977937402e-06, + "loss": 0.632, + "step": 1431 + }, + { + "epoch": 0.38019381388557016, + "grad_norm": 0.37470720168500926, + "learning_rate": 4.956872432618399e-06, + "loss": 0.6476, + "step": 1432 + }, + { + "epoch": 0.3804593123589539, + "grad_norm": 0.3691855564432916, + "learning_rate": 4.956807839384367e-06, + "loss": 0.6117, + "step": 1433 + }, + { + "epoch": 0.3807248108323377, + "grad_norm": 0.3742803590304269, + "learning_rate": 4.9567431982365675e-06, + "loss": 0.6191, + "step": 1434 + }, + { + "epoch": 0.3809903093057215, + "grad_norm": 0.38440285333850405, + "learning_rate": 4.956678509176262e-06, + "loss": 0.6245, + "step": 1435 + }, + { + "epoch": 0.3812558077791053, + "grad_norm": 0.3766892786388694, + "learning_rate": 4.956613772204708e-06, + "loss": 0.6795, + "step": 1436 + }, + { + "epoch": 0.38152130625248903, + "grad_norm": 0.3912542692411445, + "learning_rate": 4.956548987323172e-06, + "loss": 0.6114, + "step": 1437 + }, + { + "epoch": 0.38178680472587284, + "grad_norm": 0.42336261428352623, + "learning_rate": 4.956484154532916e-06, + "loss": 0.6225, + "step": 1438 + }, + { + "epoch": 0.3820523031992566, + "grad_norm": 0.3935056723083644, + "learning_rate": 4.956419273835204e-06, + "loss": 0.6151, + "step": 1439 + }, + { + "epoch": 0.3823178016726404, + "grad_norm": 0.3802837261012137, + "learning_rate": 4.956354345231302e-06, + "loss": 0.6485, + "step": 1440 + }, + { + "epoch": 0.38258330014602415, + "grad_norm": 0.3835400112370889, + "learning_rate": 4.956289368722476e-06, + "loss": 0.6717, + "step": 1441 + }, + { + "epoch": 0.38284879861940796, + "grad_norm": 0.404962853402544, + "learning_rate": 4.956224344309993e-06, + "loss": 0.6466, + "step": 1442 + }, + { + "epoch": 0.3831142970927917, + "grad_norm": 0.38818383690924424, + "learning_rate": 4.9561592719951225e-06, + "loss": 0.6598, + "step": 1443 + }, + { + "epoch": 0.3833797955661755, + "grad_norm": 0.3898081719218938, + "learning_rate": 4.956094151779132e-06, + "loss": 0.6235, + "step": 1444 + }, + { + "epoch": 0.38364529403955927, + "grad_norm": 0.38436450299378305, + "learning_rate": 4.9560289836632914e-06, + "loss": 0.692, + "step": 1445 + }, + { + "epoch": 0.3839107925129431, + "grad_norm": 0.48203113579878776, + "learning_rate": 4.955963767648872e-06, + "loss": 0.6295, + "step": 1446 + }, + { + "epoch": 0.38417629098632683, + "grad_norm": 0.3761191864930641, + "learning_rate": 4.955898503737147e-06, + "loss": 0.6532, + "step": 1447 + }, + { + "epoch": 0.3844417894597106, + "grad_norm": 0.39325727838410857, + "learning_rate": 4.955833191929388e-06, + "loss": 0.6678, + "step": 1448 + }, + { + "epoch": 0.3847072879330944, + "grad_norm": 0.5168610566283248, + "learning_rate": 4.955767832226868e-06, + "loss": 0.6481, + "step": 1449 + }, + { + "epoch": 0.38497278640647814, + "grad_norm": 0.38494809132804086, + "learning_rate": 4.9557024246308635e-06, + "loss": 0.6514, + "step": 1450 + }, + { + "epoch": 0.38523828487986195, + "grad_norm": 0.3860619969425166, + "learning_rate": 4.955636969142649e-06, + "loss": 0.6581, + "step": 1451 + }, + { + "epoch": 0.3855037833532457, + "grad_norm": 0.4665927604403339, + "learning_rate": 4.9555714657635e-06, + "loss": 0.6122, + "step": 1452 + }, + { + "epoch": 0.3857692818266295, + "grad_norm": 0.523630248204926, + "learning_rate": 4.955505914494696e-06, + "loss": 0.6307, + "step": 1453 + }, + { + "epoch": 0.38603478030001326, + "grad_norm": 0.38482052933862343, + "learning_rate": 4.9554403153375145e-06, + "loss": 0.6662, + "step": 1454 + }, + { + "epoch": 0.38630027877339707, + "grad_norm": 0.4378683106307078, + "learning_rate": 4.955374668293234e-06, + "loss": 0.6424, + "step": 1455 + }, + { + "epoch": 0.3865657772467808, + "grad_norm": 0.3910977708505808, + "learning_rate": 4.9553089733631375e-06, + "loss": 0.65, + "step": 1456 + }, + { + "epoch": 0.3868312757201646, + "grad_norm": 0.43031474737653463, + "learning_rate": 4.955243230548503e-06, + "loss": 0.6245, + "step": 1457 + }, + { + "epoch": 0.3870967741935484, + "grad_norm": 0.3614439227720914, + "learning_rate": 4.955177439850615e-06, + "loss": 0.6025, + "step": 1458 + }, + { + "epoch": 0.3873622726669322, + "grad_norm": 0.39897427541295477, + "learning_rate": 4.955111601270754e-06, + "loss": 0.6153, + "step": 1459 + }, + { + "epoch": 0.38762777114031594, + "grad_norm": 0.41460765512030234, + "learning_rate": 4.955045714810207e-06, + "loss": 0.6834, + "step": 1460 + }, + { + "epoch": 0.38789326961369974, + "grad_norm": 0.387853719192773, + "learning_rate": 4.954979780470257e-06, + "loss": 0.6246, + "step": 1461 + }, + { + "epoch": 0.3881587680870835, + "grad_norm": 0.37773562675084454, + "learning_rate": 4.954913798252191e-06, + "loss": 0.6633, + "step": 1462 + }, + { + "epoch": 0.3884242665604673, + "grad_norm": 0.3997877485273931, + "learning_rate": 4.954847768157296e-06, + "loss": 0.5861, + "step": 1463 + }, + { + "epoch": 0.38868976503385105, + "grad_norm": 0.39331225458899316, + "learning_rate": 4.954781690186857e-06, + "loss": 0.6347, + "step": 1464 + }, + { + "epoch": 0.38895526350723486, + "grad_norm": 0.37548258213028635, + "learning_rate": 4.954715564342166e-06, + "loss": 0.67, + "step": 1465 + }, + { + "epoch": 0.3892207619806186, + "grad_norm": 0.3730080748435241, + "learning_rate": 4.95464939062451e-06, + "loss": 0.632, + "step": 1466 + }, + { + "epoch": 0.38948626045400236, + "grad_norm": 0.3979567934107656, + "learning_rate": 4.954583169035183e-06, + "loss": 0.5809, + "step": 1467 + }, + { + "epoch": 0.38975175892738617, + "grad_norm": 0.3845961397805626, + "learning_rate": 4.9545168995754735e-06, + "loss": 0.6291, + "step": 1468 + }, + { + "epoch": 0.3900172574007699, + "grad_norm": 0.36817228101345933, + "learning_rate": 4.954450582246674e-06, + "loss": 0.5853, + "step": 1469 + }, + { + "epoch": 0.39028275587415373, + "grad_norm": 0.38502100501420783, + "learning_rate": 4.954384217050079e-06, + "loss": 0.6273, + "step": 1470 + }, + { + "epoch": 0.3905482543475375, + "grad_norm": 0.38305800890396074, + "learning_rate": 4.954317803986983e-06, + "loss": 0.6529, + "step": 1471 + }, + { + "epoch": 0.3908137528209213, + "grad_norm": 0.3872047844134107, + "learning_rate": 4.95425134305868e-06, + "loss": 0.6275, + "step": 1472 + }, + { + "epoch": 0.39107925129430504, + "grad_norm": 0.39045498310880516, + "learning_rate": 4.954184834266467e-06, + "loss": 0.6491, + "step": 1473 + }, + { + "epoch": 0.39134474976768885, + "grad_norm": 0.38814459311763694, + "learning_rate": 4.954118277611641e-06, + "loss": 0.6307, + "step": 1474 + }, + { + "epoch": 0.3916102482410726, + "grad_norm": 0.3856182405209275, + "learning_rate": 4.954051673095499e-06, + "loss": 0.6636, + "step": 1475 + }, + { + "epoch": 0.3918757467144564, + "grad_norm": 0.36713649195996256, + "learning_rate": 4.953985020719342e-06, + "loss": 0.66, + "step": 1476 + }, + { + "epoch": 0.39214124518784016, + "grad_norm": 0.36360960011877747, + "learning_rate": 4.953918320484468e-06, + "loss": 0.6551, + "step": 1477 + }, + { + "epoch": 0.39240674366122397, + "grad_norm": 0.3693469414807645, + "learning_rate": 4.953851572392179e-06, + "loss": 0.6334, + "step": 1478 + }, + { + "epoch": 0.3926722421346077, + "grad_norm": 0.3750301406185971, + "learning_rate": 4.9537847764437755e-06, + "loss": 0.6606, + "step": 1479 + }, + { + "epoch": 0.3929377406079915, + "grad_norm": 0.379412127001905, + "learning_rate": 4.953717932640562e-06, + "loss": 0.6449, + "step": 1480 + }, + { + "epoch": 0.3932032390813753, + "grad_norm": 0.3709039163120122, + "learning_rate": 4.95365104098384e-06, + "loss": 0.6671, + "step": 1481 + }, + { + "epoch": 0.3934687375547591, + "grad_norm": 0.3712835013747682, + "learning_rate": 4.953584101474915e-06, + "loss": 0.6322, + "step": 1482 + }, + { + "epoch": 0.39373423602814284, + "grad_norm": 0.37516913432924626, + "learning_rate": 4.953517114115093e-06, + "loss": 0.6342, + "step": 1483 + }, + { + "epoch": 0.39399973450152664, + "grad_norm": 0.3832057761510323, + "learning_rate": 4.95345007890568e-06, + "loss": 0.6203, + "step": 1484 + }, + { + "epoch": 0.3942652329749104, + "grad_norm": 0.38388009158934117, + "learning_rate": 4.953382995847984e-06, + "loss": 0.6668, + "step": 1485 + }, + { + "epoch": 0.39453073144829415, + "grad_norm": 0.40003373474073817, + "learning_rate": 4.953315864943312e-06, + "loss": 0.6346, + "step": 1486 + }, + { + "epoch": 0.39479622992167795, + "grad_norm": 0.37682658361850974, + "learning_rate": 4.9532486861929745e-06, + "loss": 0.6524, + "step": 1487 + }, + { + "epoch": 0.3950617283950617, + "grad_norm": 0.3767846630718036, + "learning_rate": 4.953181459598281e-06, + "loss": 0.6459, + "step": 1488 + }, + { + "epoch": 0.3953272268684455, + "grad_norm": 0.3812495439451251, + "learning_rate": 4.9531141851605426e-06, + "loss": 0.6193, + "step": 1489 + }, + { + "epoch": 0.39559272534182927, + "grad_norm": 0.3846612978316415, + "learning_rate": 4.953046862881071e-06, + "loss": 0.6557, + "step": 1490 + }, + { + "epoch": 0.3958582238152131, + "grad_norm": 0.37315842183341713, + "learning_rate": 4.952979492761179e-06, + "loss": 0.6171, + "step": 1491 + }, + { + "epoch": 0.3961237222885968, + "grad_norm": 0.3956807307227741, + "learning_rate": 4.952912074802182e-06, + "loss": 0.6503, + "step": 1492 + }, + { + "epoch": 0.39638922076198063, + "grad_norm": 0.3757489994884287, + "learning_rate": 4.952844609005393e-06, + "loss": 0.643, + "step": 1493 + }, + { + "epoch": 0.3966547192353644, + "grad_norm": 0.3888396186642242, + "learning_rate": 4.9527770953721285e-06, + "loss": 0.657, + "step": 1494 + }, + { + "epoch": 0.3969202177087482, + "grad_norm": 0.3714727391950251, + "learning_rate": 4.952709533903706e-06, + "loss": 0.6379, + "step": 1495 + }, + { + "epoch": 0.39718571618213194, + "grad_norm": 0.3741917030539026, + "learning_rate": 4.9526419246014415e-06, + "loss": 0.6292, + "step": 1496 + }, + { + "epoch": 0.39745121465551575, + "grad_norm": 0.3694705283973955, + "learning_rate": 4.952574267466655e-06, + "loss": 0.5826, + "step": 1497 + }, + { + "epoch": 0.3977167131288995, + "grad_norm": 0.3706702336109769, + "learning_rate": 4.9525065625006645e-06, + "loss": 0.6458, + "step": 1498 + }, + { + "epoch": 0.3979822116022833, + "grad_norm": 0.3569730233874185, + "learning_rate": 4.952438809704791e-06, + "loss": 0.6174, + "step": 1499 + }, + { + "epoch": 0.39824771007566706, + "grad_norm": 0.37982738830536933, + "learning_rate": 4.9523710090803576e-06, + "loss": 0.6178, + "step": 1500 + }, + { + "epoch": 0.39851320854905087, + "grad_norm": 0.3743163991683717, + "learning_rate": 4.952303160628684e-06, + "loss": 0.6524, + "step": 1501 + }, + { + "epoch": 0.3987787070224346, + "grad_norm": 0.37351588481447573, + "learning_rate": 4.952235264351095e-06, + "loss": 0.6393, + "step": 1502 + }, + { + "epoch": 0.39904420549581837, + "grad_norm": 0.3965425276450105, + "learning_rate": 4.952167320248914e-06, + "loss": 0.6372, + "step": 1503 + }, + { + "epoch": 0.3993097039692022, + "grad_norm": 0.37300389314962357, + "learning_rate": 4.952099328323466e-06, + "loss": 0.6502, + "step": 1504 + }, + { + "epoch": 0.39957520244258593, + "grad_norm": 0.3730643040164551, + "learning_rate": 4.952031288576078e-06, + "loss": 0.619, + "step": 1505 + }, + { + "epoch": 0.39984070091596974, + "grad_norm": 0.37593313637519654, + "learning_rate": 4.9519632010080765e-06, + "loss": 0.558, + "step": 1506 + }, + { + "epoch": 0.4001061993893535, + "grad_norm": 0.40376294370648996, + "learning_rate": 4.951895065620789e-06, + "loss": 0.6297, + "step": 1507 + }, + { + "epoch": 0.4003716978627373, + "grad_norm": 0.3827676631681475, + "learning_rate": 4.951826882415544e-06, + "loss": 0.6064, + "step": 1508 + }, + { + "epoch": 0.40063719633612105, + "grad_norm": 0.3902324075672343, + "learning_rate": 4.951758651393672e-06, + "loss": 0.64, + "step": 1509 + }, + { + "epoch": 0.40090269480950486, + "grad_norm": 0.37491893902961715, + "learning_rate": 4.951690372556502e-06, + "loss": 0.633, + "step": 1510 + }, + { + "epoch": 0.4011681932828886, + "grad_norm": 0.37754628982436195, + "learning_rate": 4.9516220459053685e-06, + "loss": 0.6528, + "step": 1511 + }, + { + "epoch": 0.4014336917562724, + "grad_norm": 0.3725784806878786, + "learning_rate": 4.951553671441603e-06, + "loss": 0.6338, + "step": 1512 + }, + { + "epoch": 0.40169919022965617, + "grad_norm": 0.3866060610989434, + "learning_rate": 4.951485249166537e-06, + "loss": 0.6666, + "step": 1513 + }, + { + "epoch": 0.40196468870304, + "grad_norm": 0.37658539033153926, + "learning_rate": 4.951416779081507e-06, + "loss": 0.6554, + "step": 1514 + }, + { + "epoch": 0.4022301871764237, + "grad_norm": 0.38017348322180383, + "learning_rate": 4.951348261187847e-06, + "loss": 0.6725, + "step": 1515 + }, + { + "epoch": 0.40249568564980753, + "grad_norm": 0.3592714783102323, + "learning_rate": 4.951279695486895e-06, + "loss": 0.6006, + "step": 1516 + }, + { + "epoch": 0.4027611841231913, + "grad_norm": 0.38051905290563476, + "learning_rate": 4.951211081979987e-06, + "loss": 0.6384, + "step": 1517 + }, + { + "epoch": 0.4030266825965751, + "grad_norm": 0.363536852764823, + "learning_rate": 4.951142420668461e-06, + "loss": 0.6224, + "step": 1518 + }, + { + "epoch": 0.40329218106995884, + "grad_norm": 0.3890642636995573, + "learning_rate": 4.951073711553657e-06, + "loss": 0.6358, + "step": 1519 + }, + { + "epoch": 0.40355767954334265, + "grad_norm": 0.39445351972956, + "learning_rate": 4.951004954636913e-06, + "loss": 0.6595, + "step": 1520 + }, + { + "epoch": 0.4038231780167264, + "grad_norm": 0.3982213687881686, + "learning_rate": 4.950936149919572e-06, + "loss": 0.6027, + "step": 1521 + }, + { + "epoch": 0.40408867649011015, + "grad_norm": 0.3854249110118144, + "learning_rate": 4.950867297402976e-06, + "loss": 0.643, + "step": 1522 + }, + { + "epoch": 0.40435417496349396, + "grad_norm": 0.3581356017525054, + "learning_rate": 4.950798397088465e-06, + "loss": 0.6192, + "step": 1523 + }, + { + "epoch": 0.4046196734368777, + "grad_norm": 0.3773246357129126, + "learning_rate": 4.950729448977386e-06, + "loss": 0.6732, + "step": 1524 + }, + { + "epoch": 0.4048851719102615, + "grad_norm": 0.3815722803658837, + "learning_rate": 4.9506604530710825e-06, + "loss": 0.6361, + "step": 1525 + }, + { + "epoch": 0.4051506703836453, + "grad_norm": 0.3664922401679366, + "learning_rate": 4.9505914093709e-06, + "loss": 0.5954, + "step": 1526 + }, + { + "epoch": 0.4054161688570291, + "grad_norm": 0.37563904913174356, + "learning_rate": 4.950522317878184e-06, + "loss": 0.5986, + "step": 1527 + }, + { + "epoch": 0.40568166733041283, + "grad_norm": 0.3892051459536696, + "learning_rate": 4.950453178594283e-06, + "loss": 0.6038, + "step": 1528 + }, + { + "epoch": 0.40594716580379664, + "grad_norm": 0.39694643329554125, + "learning_rate": 4.950383991520547e-06, + "loss": 0.6585, + "step": 1529 + }, + { + "epoch": 0.4062126642771804, + "grad_norm": 0.4083925886587904, + "learning_rate": 4.950314756658321e-06, + "loss": 0.6524, + "step": 1530 + }, + { + "epoch": 0.4064781627505642, + "grad_norm": 0.3759360707159121, + "learning_rate": 4.95024547400896e-06, + "loss": 0.65, + "step": 1531 + }, + { + "epoch": 0.40674366122394795, + "grad_norm": 0.387354723117181, + "learning_rate": 4.9501761435738115e-06, + "loss": 0.6549, + "step": 1532 + }, + { + "epoch": 0.40700915969733176, + "grad_norm": 0.3726027294176786, + "learning_rate": 4.95010676535423e-06, + "loss": 0.6367, + "step": 1533 + }, + { + "epoch": 0.4072746581707155, + "grad_norm": 0.3741693104713276, + "learning_rate": 4.950037339351568e-06, + "loss": 0.6649, + "step": 1534 + }, + { + "epoch": 0.4075401566440993, + "grad_norm": 0.3760665944438138, + "learning_rate": 4.949967865567178e-06, + "loss": 0.6768, + "step": 1535 + }, + { + "epoch": 0.40780565511748307, + "grad_norm": 0.37782839437812066, + "learning_rate": 4.949898344002417e-06, + "loss": 0.6394, + "step": 1536 + }, + { + "epoch": 0.4080711535908669, + "grad_norm": 0.37257726173594624, + "learning_rate": 4.949828774658639e-06, + "loss": 0.6229, + "step": 1537 + }, + { + "epoch": 0.4083366520642506, + "grad_norm": 0.3770754472393843, + "learning_rate": 4.949759157537203e-06, + "loss": 0.6557, + "step": 1538 + }, + { + "epoch": 0.40860215053763443, + "grad_norm": 0.37313936514262486, + "learning_rate": 4.949689492639465e-06, + "loss": 0.6122, + "step": 1539 + }, + { + "epoch": 0.4088676490110182, + "grad_norm": 0.39078374874637534, + "learning_rate": 4.949619779966783e-06, + "loss": 0.6443, + "step": 1540 + }, + { + "epoch": 0.40913314748440194, + "grad_norm": 0.3888752356200754, + "learning_rate": 4.949550019520518e-06, + "loss": 0.6644, + "step": 1541 + }, + { + "epoch": 0.40939864595778575, + "grad_norm": 0.3973044570124634, + "learning_rate": 4.949480211302031e-06, + "loss": 0.6582, + "step": 1542 + }, + { + "epoch": 0.4096641444311695, + "grad_norm": 0.38694257415978767, + "learning_rate": 4.949410355312681e-06, + "loss": 0.6428, + "step": 1543 + }, + { + "epoch": 0.4099296429045533, + "grad_norm": 0.3749325764698057, + "learning_rate": 4.949340451553834e-06, + "loss": 0.6349, + "step": 1544 + }, + { + "epoch": 0.41019514137793706, + "grad_norm": 0.3744298373161061, + "learning_rate": 4.94927050002685e-06, + "loss": 0.6093, + "step": 1545 + }, + { + "epoch": 0.41046063985132086, + "grad_norm": 0.3936594556322945, + "learning_rate": 4.949200500733093e-06, + "loss": 0.6514, + "step": 1546 + }, + { + "epoch": 0.4107261383247046, + "grad_norm": 0.3760700440461478, + "learning_rate": 4.949130453673932e-06, + "loss": 0.6306, + "step": 1547 + }, + { + "epoch": 0.4109916367980884, + "grad_norm": 0.37535800885008996, + "learning_rate": 4.949060358850729e-06, + "loss": 0.6239, + "step": 1548 + }, + { + "epoch": 0.4112571352714722, + "grad_norm": 0.37688638991759393, + "learning_rate": 4.948990216264853e-06, + "loss": 0.6442, + "step": 1549 + }, + { + "epoch": 0.411522633744856, + "grad_norm": 0.369621358837807, + "learning_rate": 4.9489200259176715e-06, + "loss": 0.6354, + "step": 1550 + }, + { + "epoch": 0.41178813221823973, + "grad_norm": 0.4001845431668011, + "learning_rate": 4.948849787810554e-06, + "loss": 0.606, + "step": 1551 + }, + { + "epoch": 0.41205363069162354, + "grad_norm": 0.3801373776920019, + "learning_rate": 4.9487795019448695e-06, + "loss": 0.6328, + "step": 1552 + }, + { + "epoch": 0.4123191291650073, + "grad_norm": 0.38123105172776506, + "learning_rate": 4.948709168321989e-06, + "loss": 0.5982, + "step": 1553 + }, + { + "epoch": 0.4125846276383911, + "grad_norm": 0.37581282507433045, + "learning_rate": 4.948638786943285e-06, + "loss": 0.6146, + "step": 1554 + }, + { + "epoch": 0.41285012611177485, + "grad_norm": 0.38060785345567816, + "learning_rate": 4.948568357810128e-06, + "loss": 0.6446, + "step": 1555 + }, + { + "epoch": 0.41311562458515866, + "grad_norm": 0.37141398664019587, + "learning_rate": 4.9484978809238945e-06, + "loss": 0.6678, + "step": 1556 + }, + { + "epoch": 0.4133811230585424, + "grad_norm": 0.37935883805892023, + "learning_rate": 4.948427356285956e-06, + "loss": 0.6352, + "step": 1557 + }, + { + "epoch": 0.4136466215319262, + "grad_norm": 0.3816316230215305, + "learning_rate": 4.94835678389769e-06, + "loss": 0.6057, + "step": 1558 + }, + { + "epoch": 0.41391212000530997, + "grad_norm": 0.39538492076129317, + "learning_rate": 4.9482861637604725e-06, + "loss": 0.6067, + "step": 1559 + }, + { + "epoch": 0.4141776184786937, + "grad_norm": 0.36542160119434036, + "learning_rate": 4.94821549587568e-06, + "loss": 0.6211, + "step": 1560 + }, + { + "epoch": 0.41444311695207753, + "grad_norm": 0.39548725338996055, + "learning_rate": 4.94814478024469e-06, + "loss": 0.5854, + "step": 1561 + }, + { + "epoch": 0.4147086154254613, + "grad_norm": 0.3836340813617557, + "learning_rate": 4.948074016868884e-06, + "loss": 0.6707, + "step": 1562 + }, + { + "epoch": 0.4149741138988451, + "grad_norm": 0.40700202796436696, + "learning_rate": 4.94800320574964e-06, + "loss": 0.6146, + "step": 1563 + }, + { + "epoch": 0.41523961237222884, + "grad_norm": 0.3900921802915412, + "learning_rate": 4.94793234688834e-06, + "loss": 0.6699, + "step": 1564 + }, + { + "epoch": 0.41550511084561265, + "grad_norm": 0.38383660877028974, + "learning_rate": 4.947861440286365e-06, + "loss": 0.6066, + "step": 1565 + }, + { + "epoch": 0.4157706093189964, + "grad_norm": 0.3811570065982231, + "learning_rate": 4.9477904859450994e-06, + "loss": 0.6778, + "step": 1566 + }, + { + "epoch": 0.4160361077923802, + "grad_norm": 0.39264849903297744, + "learning_rate": 4.947719483865927e-06, + "loss": 0.6468, + "step": 1567 + }, + { + "epoch": 0.41630160626576396, + "grad_norm": 0.3753239643561131, + "learning_rate": 4.947648434050231e-06, + "loss": 0.626, + "step": 1568 + }, + { + "epoch": 0.41656710473914776, + "grad_norm": 0.4050929356879002, + "learning_rate": 4.947577336499396e-06, + "loss": 0.6342, + "step": 1569 + }, + { + "epoch": 0.4168326032125315, + "grad_norm": 0.39462571695589943, + "learning_rate": 4.947506191214812e-06, + "loss": 0.6337, + "step": 1570 + }, + { + "epoch": 0.4170981016859153, + "grad_norm": 0.38905594357205586, + "learning_rate": 4.947434998197864e-06, + "loss": 0.6384, + "step": 1571 + }, + { + "epoch": 0.4173636001592991, + "grad_norm": 0.3804850730806265, + "learning_rate": 4.947363757449941e-06, + "loss": 0.6482, + "step": 1572 + }, + { + "epoch": 0.4176290986326829, + "grad_norm": 0.3818377216918663, + "learning_rate": 4.947292468972433e-06, + "loss": 0.6422, + "step": 1573 + }, + { + "epoch": 0.41789459710606663, + "grad_norm": 0.39699057776833674, + "learning_rate": 4.94722113276673e-06, + "loss": 0.6565, + "step": 1574 + }, + { + "epoch": 0.41816009557945044, + "grad_norm": 0.42960285906165224, + "learning_rate": 4.947149748834223e-06, + "loss": 0.6426, + "step": 1575 + }, + { + "epoch": 0.4184255940528342, + "grad_norm": 0.37287793567883765, + "learning_rate": 4.947078317176304e-06, + "loss": 0.6169, + "step": 1576 + }, + { + "epoch": 0.418691092526218, + "grad_norm": 0.3733878120090327, + "learning_rate": 4.9470068377943665e-06, + "loss": 0.6249, + "step": 1577 + }, + { + "epoch": 0.41895659099960175, + "grad_norm": 0.40996754506660293, + "learning_rate": 4.9469353106898046e-06, + "loss": 0.6497, + "step": 1578 + }, + { + "epoch": 0.4192220894729855, + "grad_norm": 0.3808394070889554, + "learning_rate": 4.946863735864013e-06, + "loss": 0.6373, + "step": 1579 + }, + { + "epoch": 0.4194875879463693, + "grad_norm": 0.4072587890017732, + "learning_rate": 4.9467921133183864e-06, + "loss": 0.605, + "step": 1580 + }, + { + "epoch": 0.41975308641975306, + "grad_norm": 0.3963611100653441, + "learning_rate": 4.946720443054324e-06, + "loss": 0.6004, + "step": 1581 + }, + { + "epoch": 0.42001858489313687, + "grad_norm": 0.37052249405576043, + "learning_rate": 4.9466487250732224e-06, + "loss": 0.6336, + "step": 1582 + }, + { + "epoch": 0.4202840833665206, + "grad_norm": 0.38606938014253195, + "learning_rate": 4.946576959376479e-06, + "loss": 0.6435, + "step": 1583 + }, + { + "epoch": 0.42054958183990443, + "grad_norm": 0.4154998029038964, + "learning_rate": 4.946505145965496e-06, + "loss": 0.6658, + "step": 1584 + }, + { + "epoch": 0.4208150803132882, + "grad_norm": 0.38187431149946094, + "learning_rate": 4.946433284841672e-06, + "loss": 0.6683, + "step": 1585 + }, + { + "epoch": 0.421080578786672, + "grad_norm": 0.38575420325722015, + "learning_rate": 4.946361376006409e-06, + "loss": 0.6506, + "step": 1586 + }, + { + "epoch": 0.42134607726005574, + "grad_norm": 0.3689311821011402, + "learning_rate": 4.94628941946111e-06, + "loss": 0.6713, + "step": 1587 + }, + { + "epoch": 0.42161157573343955, + "grad_norm": 0.3651451466414451, + "learning_rate": 4.946217415207177e-06, + "loss": 0.6379, + "step": 1588 + }, + { + "epoch": 0.4218770742068233, + "grad_norm": 0.39532985642846613, + "learning_rate": 4.946145363246015e-06, + "loss": 0.6169, + "step": 1589 + }, + { + "epoch": 0.4221425726802071, + "grad_norm": 0.36428085953941064, + "learning_rate": 4.94607326357903e-06, + "loss": 0.5861, + "step": 1590 + }, + { + "epoch": 0.42240807115359086, + "grad_norm": 0.3899354145866775, + "learning_rate": 4.946001116207627e-06, + "loss": 0.5937, + "step": 1591 + }, + { + "epoch": 0.42267356962697467, + "grad_norm": 0.38357067102826686, + "learning_rate": 4.9459289211332134e-06, + "loss": 0.6555, + "step": 1592 + }, + { + "epoch": 0.4229390681003584, + "grad_norm": 0.3804977641869738, + "learning_rate": 4.945856678357197e-06, + "loss": 0.5971, + "step": 1593 + }, + { + "epoch": 0.4232045665737422, + "grad_norm": 0.38452688135530994, + "learning_rate": 4.9457843878809875e-06, + "loss": 0.6161, + "step": 1594 + }, + { + "epoch": 0.423470065047126, + "grad_norm": 0.37575089193213984, + "learning_rate": 4.9457120497059926e-06, + "loss": 0.6672, + "step": 1595 + }, + { + "epoch": 0.4237355635205098, + "grad_norm": 0.39086886708302315, + "learning_rate": 4.945639663833626e-06, + "loss": 0.6511, + "step": 1596 + }, + { + "epoch": 0.42400106199389354, + "grad_norm": 0.3942861296902854, + "learning_rate": 4.945567230265298e-06, + "loss": 0.7053, + "step": 1597 + }, + { + "epoch": 0.4242665604672773, + "grad_norm": 0.3867998815391152, + "learning_rate": 4.945494749002421e-06, + "loss": 0.6077, + "step": 1598 + }, + { + "epoch": 0.4245320589406611, + "grad_norm": 0.3766080561335471, + "learning_rate": 4.945422220046409e-06, + "loss": 0.5895, + "step": 1599 + }, + { + "epoch": 0.42479755741404485, + "grad_norm": 0.3835422444876443, + "learning_rate": 4.945349643398675e-06, + "loss": 0.6336, + "step": 1600 + }, + { + "epoch": 0.42506305588742865, + "grad_norm": 0.4052010590526513, + "learning_rate": 4.945277019060637e-06, + "loss": 0.6608, + "step": 1601 + }, + { + "epoch": 0.4253285543608124, + "grad_norm": 0.4151652082319791, + "learning_rate": 4.94520434703371e-06, + "loss": 0.6407, + "step": 1602 + }, + { + "epoch": 0.4255940528341962, + "grad_norm": 0.3783049756753611, + "learning_rate": 4.945131627319311e-06, + "loss": 0.6689, + "step": 1603 + }, + { + "epoch": 0.42585955130757996, + "grad_norm": 0.3964485947247106, + "learning_rate": 4.945058859918858e-06, + "loss": 0.6417, + "step": 1604 + }, + { + "epoch": 0.42612504978096377, + "grad_norm": 0.3893080871437303, + "learning_rate": 4.944986044833772e-06, + "loss": 0.654, + "step": 1605 + }, + { + "epoch": 0.4263905482543475, + "grad_norm": 0.3930411477685514, + "learning_rate": 4.94491318206547e-06, + "loss": 0.6768, + "step": 1606 + }, + { + "epoch": 0.42665604672773133, + "grad_norm": 0.38547954004412693, + "learning_rate": 4.944840271615376e-06, + "loss": 0.6231, + "step": 1607 + }, + { + "epoch": 0.4269215452011151, + "grad_norm": 0.3743913916645727, + "learning_rate": 4.94476731348491e-06, + "loss": 0.6513, + "step": 1608 + }, + { + "epoch": 0.4271870436744989, + "grad_norm": 0.3869653809582928, + "learning_rate": 4.944694307675497e-06, + "loss": 0.6269, + "step": 1609 + }, + { + "epoch": 0.42745254214788264, + "grad_norm": 0.37519303574952695, + "learning_rate": 4.944621254188558e-06, + "loss": 0.6372, + "step": 1610 + }, + { + "epoch": 0.42771804062126645, + "grad_norm": 0.3921145753836486, + "learning_rate": 4.94454815302552e-06, + "loss": 0.594, + "step": 1611 + }, + { + "epoch": 0.4279835390946502, + "grad_norm": 0.39545167671228976, + "learning_rate": 4.944475004187806e-06, + "loss": 0.6411, + "step": 1612 + }, + { + "epoch": 0.428249037568034, + "grad_norm": 0.37658542065485245, + "learning_rate": 4.944401807676846e-06, + "loss": 0.6153, + "step": 1613 + }, + { + "epoch": 0.42851453604141776, + "grad_norm": 0.3690547653104833, + "learning_rate": 4.944328563494066e-06, + "loss": 0.641, + "step": 1614 + }, + { + "epoch": 0.42878003451480157, + "grad_norm": 0.3816290597461041, + "learning_rate": 4.944255271640893e-06, + "loss": 0.6428, + "step": 1615 + }, + { + "epoch": 0.4290455329881853, + "grad_norm": 0.3846315553712388, + "learning_rate": 4.944181932118759e-06, + "loss": 0.6326, + "step": 1616 + }, + { + "epoch": 0.42931103146156907, + "grad_norm": 0.3822047578943089, + "learning_rate": 4.944108544929091e-06, + "loss": 0.6101, + "step": 1617 + }, + { + "epoch": 0.4295765299349529, + "grad_norm": 0.3772806855454925, + "learning_rate": 4.944035110073324e-06, + "loss": 0.5933, + "step": 1618 + }, + { + "epoch": 0.42984202840833663, + "grad_norm": 0.3813315082184482, + "learning_rate": 4.943961627552888e-06, + "loss": 0.6484, + "step": 1619 + }, + { + "epoch": 0.43010752688172044, + "grad_norm": 0.39679355084632945, + "learning_rate": 4.943888097369216e-06, + "loss": 0.6393, + "step": 1620 + }, + { + "epoch": 0.4303730253551042, + "grad_norm": 0.38731039460318645, + "learning_rate": 4.943814519523742e-06, + "loss": 0.6418, + "step": 1621 + }, + { + "epoch": 0.430638523828488, + "grad_norm": 0.38140765610596006, + "learning_rate": 4.943740894017902e-06, + "loss": 0.6612, + "step": 1622 + }, + { + "epoch": 0.43090402230187175, + "grad_norm": 0.37436354173900305, + "learning_rate": 4.9436672208531315e-06, + "loss": 0.6436, + "step": 1623 + }, + { + "epoch": 0.43116952077525555, + "grad_norm": 0.4107710848666306, + "learning_rate": 4.943593500030867e-06, + "loss": 0.6232, + "step": 1624 + }, + { + "epoch": 0.4314350192486393, + "grad_norm": 0.3930193351242004, + "learning_rate": 4.943519731552546e-06, + "loss": 0.6473, + "step": 1625 + }, + { + "epoch": 0.4317005177220231, + "grad_norm": 0.3925443383035921, + "learning_rate": 4.943445915419607e-06, + "loss": 0.6219, + "step": 1626 + }, + { + "epoch": 0.43196601619540687, + "grad_norm": 0.4028359292563561, + "learning_rate": 4.94337205163349e-06, + "loss": 0.5811, + "step": 1627 + }, + { + "epoch": 0.4322315146687907, + "grad_norm": 0.3753775744625516, + "learning_rate": 4.943298140195637e-06, + "loss": 0.6316, + "step": 1628 + }, + { + "epoch": 0.4324970131421744, + "grad_norm": 0.3714009589504834, + "learning_rate": 4.9432241811074875e-06, + "loss": 0.6593, + "step": 1629 + }, + { + "epoch": 0.43276251161555823, + "grad_norm": 0.39135741376656336, + "learning_rate": 4.943150174370483e-06, + "loss": 0.6096, + "step": 1630 + }, + { + "epoch": 0.433028010088942, + "grad_norm": 0.3656532922497972, + "learning_rate": 4.94307611998607e-06, + "loss": 0.659, + "step": 1631 + }, + { + "epoch": 0.4332935085623258, + "grad_norm": 0.38751350689028435, + "learning_rate": 4.94300201795569e-06, + "loss": 0.6614, + "step": 1632 + }, + { + "epoch": 0.43355900703570954, + "grad_norm": 0.44445393479364587, + "learning_rate": 4.94292786828079e-06, + "loss": 0.6134, + "step": 1633 + }, + { + "epoch": 0.43382450550909335, + "grad_norm": 0.3709095673236794, + "learning_rate": 4.942853670962815e-06, + "loss": 0.6643, + "step": 1634 + }, + { + "epoch": 0.4340900039824771, + "grad_norm": 0.38598086806624593, + "learning_rate": 4.942779426003211e-06, + "loss": 0.6083, + "step": 1635 + }, + { + "epoch": 0.43435550245586085, + "grad_norm": 0.47269179740172845, + "learning_rate": 4.942705133403429e-06, + "loss": 0.5798, + "step": 1636 + }, + { + "epoch": 0.43462100092924466, + "grad_norm": 0.39146512560079444, + "learning_rate": 4.942630793164916e-06, + "loss": 0.632, + "step": 1637 + }, + { + "epoch": 0.4348864994026284, + "grad_norm": 0.3929720894985973, + "learning_rate": 4.94255640528912e-06, + "loss": 0.6353, + "step": 1638 + }, + { + "epoch": 0.4351519978760122, + "grad_norm": 0.3906718711554545, + "learning_rate": 4.942481969777496e-06, + "loss": 0.6829, + "step": 1639 + }, + { + "epoch": 0.43541749634939597, + "grad_norm": 0.39182488585389164, + "learning_rate": 4.942407486631492e-06, + "loss": 0.6656, + "step": 1640 + }, + { + "epoch": 0.4356829948227798, + "grad_norm": 0.39462909347772407, + "learning_rate": 4.942332955852563e-06, + "loss": 0.6509, + "step": 1641 + }, + { + "epoch": 0.43594849329616353, + "grad_norm": 0.3940803765074498, + "learning_rate": 4.94225837744216e-06, + "loss": 0.6401, + "step": 1642 + }, + { + "epoch": 0.43621399176954734, + "grad_norm": 0.3941480372471297, + "learning_rate": 4.94218375140174e-06, + "loss": 0.6237, + "step": 1643 + }, + { + "epoch": 0.4364794902429311, + "grad_norm": 0.38749929224259866, + "learning_rate": 4.942109077732757e-06, + "loss": 0.6449, + "step": 1644 + }, + { + "epoch": 0.4367449887163149, + "grad_norm": 0.38312830400750875, + "learning_rate": 4.942034356436668e-06, + "loss": 0.6073, + "step": 1645 + }, + { + "epoch": 0.43701048718969865, + "grad_norm": 0.3862796603844449, + "learning_rate": 4.9419595875149294e-06, + "loss": 0.6843, + "step": 1646 + }, + { + "epoch": 0.43727598566308246, + "grad_norm": 0.3873473046706782, + "learning_rate": 4.941884770969e-06, + "loss": 0.6698, + "step": 1647 + }, + { + "epoch": 0.4375414841364662, + "grad_norm": 0.38685093293955375, + "learning_rate": 4.9418099068003386e-06, + "loss": 0.6379, + "step": 1648 + }, + { + "epoch": 0.43780698260985, + "grad_norm": 0.38332926684140883, + "learning_rate": 4.941734995010406e-06, + "loss": 0.6275, + "step": 1649 + }, + { + "epoch": 0.43807248108323377, + "grad_norm": 0.3724629035094914, + "learning_rate": 4.941660035600663e-06, + "loss": 0.6679, + "step": 1650 + }, + { + "epoch": 0.4383379795566176, + "grad_norm": 0.43837860308373383, + "learning_rate": 4.941585028572571e-06, + "loss": 0.5832, + "step": 1651 + }, + { + "epoch": 0.4386034780300013, + "grad_norm": 0.38785275403811387, + "learning_rate": 4.941509973927591e-06, + "loss": 0.6672, + "step": 1652 + }, + { + "epoch": 0.43886897650338513, + "grad_norm": 0.37736372400950186, + "learning_rate": 4.9414348716671915e-06, + "loss": 0.6422, + "step": 1653 + }, + { + "epoch": 0.4391344749767689, + "grad_norm": 0.40750027812390716, + "learning_rate": 4.941359721792832e-06, + "loss": 0.657, + "step": 1654 + }, + { + "epoch": 0.43939997345015264, + "grad_norm": 0.4073221870315453, + "learning_rate": 4.941284524305982e-06, + "loss": 0.6192, + "step": 1655 + }, + { + "epoch": 0.43966547192353644, + "grad_norm": 0.38725402474887244, + "learning_rate": 4.9412092792081055e-06, + "loss": 0.6892, + "step": 1656 + }, + { + "epoch": 0.4399309703969202, + "grad_norm": 0.3950293089845777, + "learning_rate": 4.941133986500671e-06, + "loss": 0.6196, + "step": 1657 + }, + { + "epoch": 0.440196468870304, + "grad_norm": 0.41446212489566236, + "learning_rate": 4.941058646185148e-06, + "loss": 0.6061, + "step": 1658 + }, + { + "epoch": 0.44046196734368775, + "grad_norm": 0.3828708722445977, + "learning_rate": 4.9409832582630036e-06, + "loss": 0.6443, + "step": 1659 + }, + { + "epoch": 0.44072746581707156, + "grad_norm": 0.40221057778277863, + "learning_rate": 4.940907822735709e-06, + "loss": 0.6342, + "step": 1660 + }, + { + "epoch": 0.4409929642904553, + "grad_norm": 0.40136523177880523, + "learning_rate": 4.9408323396047365e-06, + "loss": 0.596, + "step": 1661 + }, + { + "epoch": 0.4412584627638391, + "grad_norm": 0.38910095784161536, + "learning_rate": 4.9407568088715565e-06, + "loss": 0.6162, + "step": 1662 + }, + { + "epoch": 0.4415239612372229, + "grad_norm": 0.4184623613488529, + "learning_rate": 4.940681230537643e-06, + "loss": 0.6483, + "step": 1663 + }, + { + "epoch": 0.4417894597106067, + "grad_norm": 0.3815204813473002, + "learning_rate": 4.940605604604469e-06, + "loss": 0.631, + "step": 1664 + }, + { + "epoch": 0.44205495818399043, + "grad_norm": 0.3791833852379832, + "learning_rate": 4.94052993107351e-06, + "loss": 0.6341, + "step": 1665 + }, + { + "epoch": 0.44232045665737424, + "grad_norm": 0.3956966500277488, + "learning_rate": 4.940454209946242e-06, + "loss": 0.6043, + "step": 1666 + }, + { + "epoch": 0.442585955130758, + "grad_norm": 0.38571208556788744, + "learning_rate": 4.940378441224143e-06, + "loss": 0.6706, + "step": 1667 + }, + { + "epoch": 0.4428514536041418, + "grad_norm": 0.3749164199841067, + "learning_rate": 4.940302624908689e-06, + "loss": 0.6124, + "step": 1668 + }, + { + "epoch": 0.44311695207752555, + "grad_norm": 0.3915145469847177, + "learning_rate": 4.9402267610013575e-06, + "loss": 0.6295, + "step": 1669 + }, + { + "epoch": 0.44338245055090936, + "grad_norm": 0.3759360062483427, + "learning_rate": 4.94015084950363e-06, + "loss": 0.6737, + "step": 1670 + }, + { + "epoch": 0.4436479490242931, + "grad_norm": 0.37937027922396005, + "learning_rate": 4.940074890416987e-06, + "loss": 0.6301, + "step": 1671 + }, + { + "epoch": 0.44391344749767686, + "grad_norm": 0.374164587013809, + "learning_rate": 4.9399988837429085e-06, + "loss": 0.6339, + "step": 1672 + }, + { + "epoch": 0.44417894597106067, + "grad_norm": 0.3801139949754767, + "learning_rate": 4.939922829482878e-06, + "loss": 0.6178, + "step": 1673 + }, + { + "epoch": 0.4444444444444444, + "grad_norm": 0.38613321793831296, + "learning_rate": 4.9398467276383786e-06, + "loss": 0.6465, + "step": 1674 + }, + { + "epoch": 0.4447099429178282, + "grad_norm": 0.3864302965127435, + "learning_rate": 4.939770578210894e-06, + "loss": 0.6505, + "step": 1675 + }, + { + "epoch": 0.444975441391212, + "grad_norm": 0.3671960895719421, + "learning_rate": 4.939694381201909e-06, + "loss": 0.5898, + "step": 1676 + }, + { + "epoch": 0.4452409398645958, + "grad_norm": 0.3785366184114263, + "learning_rate": 4.939618136612911e-06, + "loss": 0.6634, + "step": 1677 + }, + { + "epoch": 0.44550643833797954, + "grad_norm": 0.37229760246352267, + "learning_rate": 4.9395418444453865e-06, + "loss": 0.6529, + "step": 1678 + }, + { + "epoch": 0.44577193681136335, + "grad_norm": 0.3808553843853277, + "learning_rate": 4.9394655047008225e-06, + "loss": 0.617, + "step": 1679 + }, + { + "epoch": 0.4460374352847471, + "grad_norm": 0.36503412428572335, + "learning_rate": 4.939389117380708e-06, + "loss": 0.5868, + "step": 1680 + }, + { + "epoch": 0.4463029337581309, + "grad_norm": 0.3797407090372478, + "learning_rate": 4.9393126824865335e-06, + "loss": 0.6363, + "step": 1681 + }, + { + "epoch": 0.44656843223151466, + "grad_norm": 0.38620167463900623, + "learning_rate": 4.939236200019789e-06, + "loss": 0.638, + "step": 1682 + }, + { + "epoch": 0.44683393070489846, + "grad_norm": 0.3765492387858599, + "learning_rate": 4.939159669981966e-06, + "loss": 0.6152, + "step": 1683 + }, + { + "epoch": 0.4470994291782822, + "grad_norm": 0.37395862179561223, + "learning_rate": 4.939083092374558e-06, + "loss": 0.6482, + "step": 1684 + }, + { + "epoch": 0.447364927651666, + "grad_norm": 0.38391228111425557, + "learning_rate": 4.939006467199057e-06, + "loss": 0.6012, + "step": 1685 + }, + { + "epoch": 0.4476304261250498, + "grad_norm": 0.3897952675436452, + "learning_rate": 4.93892979445696e-06, + "loss": 0.6611, + "step": 1686 + }, + { + "epoch": 0.4478959245984336, + "grad_norm": 0.36778256560587486, + "learning_rate": 4.938853074149757e-06, + "loss": 0.6073, + "step": 1687 + }, + { + "epoch": 0.44816142307181733, + "grad_norm": 0.38277630659821105, + "learning_rate": 4.93877630627895e-06, + "loss": 0.6338, + "step": 1688 + }, + { + "epoch": 0.44842692154520114, + "grad_norm": 0.38469641726248494, + "learning_rate": 4.938699490846035e-06, + "loss": 0.6429, + "step": 1689 + }, + { + "epoch": 0.4486924200185849, + "grad_norm": 0.3700129046701821, + "learning_rate": 4.938622627852507e-06, + "loss": 0.6135, + "step": 1690 + }, + { + "epoch": 0.44895791849196864, + "grad_norm": 0.383356261722202, + "learning_rate": 4.938545717299867e-06, + "loss": 0.6788, + "step": 1691 + }, + { + "epoch": 0.44922341696535245, + "grad_norm": 0.3749427085145032, + "learning_rate": 4.938468759189616e-06, + "loss": 0.6629, + "step": 1692 + }, + { + "epoch": 0.4494889154387362, + "grad_norm": 0.3726686152622098, + "learning_rate": 4.938391753523254e-06, + "loss": 0.6326, + "step": 1693 + }, + { + "epoch": 0.44975441391212, + "grad_norm": 0.3948298784210491, + "learning_rate": 4.938314700302282e-06, + "loss": 0.6273, + "step": 1694 + }, + { + "epoch": 0.45001991238550376, + "grad_norm": 0.3863984004680951, + "learning_rate": 4.938237599528203e-06, + "loss": 0.6069, + "step": 1695 + }, + { + "epoch": 0.45028541085888757, + "grad_norm": 0.3866378844432379, + "learning_rate": 4.938160451202521e-06, + "loss": 0.6497, + "step": 1696 + }, + { + "epoch": 0.4505509093322713, + "grad_norm": 0.41075334584857653, + "learning_rate": 4.938083255326741e-06, + "loss": 0.5576, + "step": 1697 + }, + { + "epoch": 0.45081640780565513, + "grad_norm": 0.3920227584745048, + "learning_rate": 4.938006011902368e-06, + "loss": 0.6544, + "step": 1698 + }, + { + "epoch": 0.4510819062790389, + "grad_norm": 0.3595136813858587, + "learning_rate": 4.937928720930908e-06, + "loss": 0.6244, + "step": 1699 + }, + { + "epoch": 0.4513474047524227, + "grad_norm": 0.3950605631766553, + "learning_rate": 4.937851382413869e-06, + "loss": 0.6171, + "step": 1700 + }, + { + "epoch": 0.45161290322580644, + "grad_norm": 0.40046136331571747, + "learning_rate": 4.937773996352759e-06, + "loss": 0.6234, + "step": 1701 + }, + { + "epoch": 0.45187840169919025, + "grad_norm": 0.3803613330253262, + "learning_rate": 4.937696562749088e-06, + "loss": 0.6403, + "step": 1702 + }, + { + "epoch": 0.452143900172574, + "grad_norm": 0.37269232995298995, + "learning_rate": 4.937619081604366e-06, + "loss": 0.6076, + "step": 1703 + }, + { + "epoch": 0.4524093986459578, + "grad_norm": 0.37804299250572265, + "learning_rate": 4.937541552920102e-06, + "loss": 0.63, + "step": 1704 + }, + { + "epoch": 0.45267489711934156, + "grad_norm": 0.4115677556718682, + "learning_rate": 4.937463976697811e-06, + "loss": 0.5989, + "step": 1705 + }, + { + "epoch": 0.45294039559272536, + "grad_norm": 0.3771962252099528, + "learning_rate": 4.937386352939004e-06, + "loss": 0.6346, + "step": 1706 + }, + { + "epoch": 0.4532058940661091, + "grad_norm": 0.433898767443415, + "learning_rate": 4.937308681645196e-06, + "loss": 0.5951, + "step": 1707 + }, + { + "epoch": 0.4534713925394929, + "grad_norm": 0.378806177547712, + "learning_rate": 4.9372309628178995e-06, + "loss": 0.6457, + "step": 1708 + }, + { + "epoch": 0.4537368910128767, + "grad_norm": 0.38432939642619474, + "learning_rate": 4.937153196458633e-06, + "loss": 0.6478, + "step": 1709 + }, + { + "epoch": 0.4540023894862604, + "grad_norm": 0.3952040727996697, + "learning_rate": 4.937075382568912e-06, + "loss": 0.5839, + "step": 1710 + }, + { + "epoch": 0.45426788795964423, + "grad_norm": 0.3757951589192852, + "learning_rate": 4.936997521150254e-06, + "loss": 0.6377, + "step": 1711 + }, + { + "epoch": 0.454533386433028, + "grad_norm": 0.3735648170534193, + "learning_rate": 4.936919612204177e-06, + "loss": 0.5973, + "step": 1712 + }, + { + "epoch": 0.4547988849064118, + "grad_norm": 0.3884318126431452, + "learning_rate": 4.9368416557322014e-06, + "loss": 0.6507, + "step": 1713 + }, + { + "epoch": 0.45506438337979555, + "grad_norm": 0.3940904861093302, + "learning_rate": 4.936763651735847e-06, + "loss": 0.6573, + "step": 1714 + }, + { + "epoch": 0.45532988185317935, + "grad_norm": 0.3795500691170738, + "learning_rate": 4.936685600216635e-06, + "loss": 0.6689, + "step": 1715 + }, + { + "epoch": 0.4555953803265631, + "grad_norm": 0.38041657880412216, + "learning_rate": 4.936607501176088e-06, + "loss": 0.6452, + "step": 1716 + }, + { + "epoch": 0.4558608787999469, + "grad_norm": 0.379827072104167, + "learning_rate": 4.936529354615729e-06, + "loss": 0.6325, + "step": 1717 + }, + { + "epoch": 0.45612637727333066, + "grad_norm": 0.37834522292738343, + "learning_rate": 4.9364511605370815e-06, + "loss": 0.5989, + "step": 1718 + }, + { + "epoch": 0.45639187574671447, + "grad_norm": 0.3896691716148016, + "learning_rate": 4.936372918941671e-06, + "loss": 0.622, + "step": 1719 + }, + { + "epoch": 0.4566573742200982, + "grad_norm": 0.3750019835504782, + "learning_rate": 4.9362946298310236e-06, + "loss": 0.6508, + "step": 1720 + }, + { + "epoch": 0.45692287269348203, + "grad_norm": 0.378092227330637, + "learning_rate": 4.936216293206666e-06, + "loss": 0.6175, + "step": 1721 + }, + { + "epoch": 0.4571883711668658, + "grad_norm": 0.372360338643961, + "learning_rate": 4.936137909070125e-06, + "loss": 0.6479, + "step": 1722 + }, + { + "epoch": 0.4574538696402496, + "grad_norm": 0.38238014554140415, + "learning_rate": 4.936059477422931e-06, + "loss": 0.6047, + "step": 1723 + }, + { + "epoch": 0.45771936811363334, + "grad_norm": 0.3777027803451668, + "learning_rate": 4.935980998266613e-06, + "loss": 0.6004, + "step": 1724 + }, + { + "epoch": 0.45798486658701715, + "grad_norm": 0.3951776977852749, + "learning_rate": 4.9359024716027e-06, + "loss": 0.6384, + "step": 1725 + }, + { + "epoch": 0.4582503650604009, + "grad_norm": 0.3845661945307065, + "learning_rate": 4.935823897432725e-06, + "loss": 0.636, + "step": 1726 + }, + { + "epoch": 0.4585158635337847, + "grad_norm": 0.40118180045020735, + "learning_rate": 4.93574527575822e-06, + "loss": 0.6176, + "step": 1727 + }, + { + "epoch": 0.45878136200716846, + "grad_norm": 0.391039266777863, + "learning_rate": 4.935666606580719e-06, + "loss": 0.6185, + "step": 1728 + }, + { + "epoch": 0.4590468604805522, + "grad_norm": 0.37819717057976776, + "learning_rate": 4.935587889901756e-06, + "loss": 0.6573, + "step": 1729 + }, + { + "epoch": 0.459312358953936, + "grad_norm": 0.41544670685909424, + "learning_rate": 4.935509125722865e-06, + "loss": 0.5862, + "step": 1730 + }, + { + "epoch": 0.45957785742731977, + "grad_norm": 0.4080586799532462, + "learning_rate": 4.935430314045583e-06, + "loss": 0.6383, + "step": 1731 + }, + { + "epoch": 0.4598433559007036, + "grad_norm": 0.40190361658617757, + "learning_rate": 4.935351454871447e-06, + "loss": 0.6706, + "step": 1732 + }, + { + "epoch": 0.46010885437408733, + "grad_norm": 0.41063212068772775, + "learning_rate": 4.9352725482019944e-06, + "loss": 0.6384, + "step": 1733 + }, + { + "epoch": 0.46037435284747114, + "grad_norm": 0.3950823442438883, + "learning_rate": 4.935193594038765e-06, + "loss": 0.6355, + "step": 1734 + }, + { + "epoch": 0.4606398513208549, + "grad_norm": 0.38032576246327926, + "learning_rate": 4.935114592383298e-06, + "loss": 0.6205, + "step": 1735 + }, + { + "epoch": 0.4609053497942387, + "grad_norm": 0.3862929803763625, + "learning_rate": 4.9350355432371345e-06, + "loss": 0.6521, + "step": 1736 + }, + { + "epoch": 0.46117084826762245, + "grad_norm": 0.3897244594003261, + "learning_rate": 4.934956446601815e-06, + "loss": 0.5909, + "step": 1737 + }, + { + "epoch": 0.46143634674100625, + "grad_norm": 0.37387682523316074, + "learning_rate": 4.9348773024788845e-06, + "loss": 0.576, + "step": 1738 + }, + { + "epoch": 0.46170184521439, + "grad_norm": 0.3837862860957116, + "learning_rate": 4.934798110869884e-06, + "loss": 0.643, + "step": 1739 + }, + { + "epoch": 0.4619673436877738, + "grad_norm": 0.3998133847095514, + "learning_rate": 4.934718871776359e-06, + "loss": 0.6454, + "step": 1740 + }, + { + "epoch": 0.46223284216115756, + "grad_norm": 0.37805680537467495, + "learning_rate": 4.934639585199855e-06, + "loss": 0.6441, + "step": 1741 + }, + { + "epoch": 0.46249834063454137, + "grad_norm": 0.38788744008372245, + "learning_rate": 4.934560251141918e-06, + "loss": 0.6234, + "step": 1742 + }, + { + "epoch": 0.4627638391079251, + "grad_norm": 0.3654820128679984, + "learning_rate": 4.934480869604095e-06, + "loss": 0.5979, + "step": 1743 + }, + { + "epoch": 0.46302933758130893, + "grad_norm": 0.39218850405121114, + "learning_rate": 4.934401440587935e-06, + "loss": 0.6566, + "step": 1744 + }, + { + "epoch": 0.4632948360546927, + "grad_norm": 0.37947823451585755, + "learning_rate": 4.934321964094985e-06, + "loss": 0.6392, + "step": 1745 + }, + { + "epoch": 0.4635603345280765, + "grad_norm": 0.38657237651505744, + "learning_rate": 4.934242440126798e-06, + "loss": 0.6319, + "step": 1746 + }, + { + "epoch": 0.46382583300146024, + "grad_norm": 0.37588575359767307, + "learning_rate": 4.934162868684923e-06, + "loss": 0.6073, + "step": 1747 + }, + { + "epoch": 0.464091331474844, + "grad_norm": 0.3982070933536065, + "learning_rate": 4.934083249770912e-06, + "loss": 0.6504, + "step": 1748 + }, + { + "epoch": 0.4643568299482278, + "grad_norm": 0.40661072542218, + "learning_rate": 4.934003583386319e-06, + "loss": 0.6043, + "step": 1749 + }, + { + "epoch": 0.46462232842161155, + "grad_norm": 0.3819824929010193, + "learning_rate": 4.933923869532695e-06, + "loss": 0.6274, + "step": 1750 + }, + { + "epoch": 0.46488782689499536, + "grad_norm": 0.42403335588596847, + "learning_rate": 4.933844108211598e-06, + "loss": 0.6432, + "step": 1751 + }, + { + "epoch": 0.4651533253683791, + "grad_norm": 0.3818754144442884, + "learning_rate": 4.933764299424581e-06, + "loss": 0.6221, + "step": 1752 + }, + { + "epoch": 0.4654188238417629, + "grad_norm": 0.3766156996849648, + "learning_rate": 4.9336844431732e-06, + "loss": 0.6292, + "step": 1753 + }, + { + "epoch": 0.46568432231514667, + "grad_norm": 0.4119487407929856, + "learning_rate": 4.933604539459015e-06, + "loss": 0.6177, + "step": 1754 + }, + { + "epoch": 0.4659498207885305, + "grad_norm": 0.3793213603442324, + "learning_rate": 4.933524588283583e-06, + "loss": 0.6511, + "step": 1755 + }, + { + "epoch": 0.46621531926191423, + "grad_norm": 0.37576310847605426, + "learning_rate": 4.933444589648463e-06, + "loss": 0.6446, + "step": 1756 + }, + { + "epoch": 0.46648081773529804, + "grad_norm": 0.38357642146509346, + "learning_rate": 4.9333645435552156e-06, + "loss": 0.6271, + "step": 1757 + }, + { + "epoch": 0.4667463162086818, + "grad_norm": 0.43516532010809733, + "learning_rate": 4.9332844500054015e-06, + "loss": 0.6514, + "step": 1758 + }, + { + "epoch": 0.4670118146820656, + "grad_norm": 0.37990546112442525, + "learning_rate": 4.933204309000583e-06, + "loss": 0.6489, + "step": 1759 + }, + { + "epoch": 0.46727731315544935, + "grad_norm": 0.38457986689359225, + "learning_rate": 4.933124120542323e-06, + "loss": 0.6121, + "step": 1760 + }, + { + "epoch": 0.46754281162883315, + "grad_norm": 0.3704122932587119, + "learning_rate": 4.933043884632185e-06, + "loss": 0.5994, + "step": 1761 + }, + { + "epoch": 0.4678083101022169, + "grad_norm": 0.3899208147014975, + "learning_rate": 4.932963601271734e-06, + "loss": 0.6903, + "step": 1762 + }, + { + "epoch": 0.4680738085756007, + "grad_norm": 0.40387845271885775, + "learning_rate": 4.932883270462536e-06, + "loss": 0.6355, + "step": 1763 + }, + { + "epoch": 0.46833930704898447, + "grad_norm": 0.3899688593253954, + "learning_rate": 4.932802892206158e-06, + "loss": 0.6193, + "step": 1764 + }, + { + "epoch": 0.4686048055223683, + "grad_norm": 0.4211207461379197, + "learning_rate": 4.932722466504167e-06, + "loss": 0.5933, + "step": 1765 + }, + { + "epoch": 0.468870303995752, + "grad_norm": 0.4004879803929037, + "learning_rate": 4.932641993358132e-06, + "loss": 0.6347, + "step": 1766 + }, + { + "epoch": 0.4691358024691358, + "grad_norm": 0.39776362486682454, + "learning_rate": 4.93256147276962e-06, + "loss": 0.6449, + "step": 1767 + }, + { + "epoch": 0.4694013009425196, + "grad_norm": 0.38379232001516417, + "learning_rate": 4.932480904740206e-06, + "loss": 0.6231, + "step": 1768 + }, + { + "epoch": 0.46966679941590334, + "grad_norm": 0.41869550785919246, + "learning_rate": 4.932400289271456e-06, + "loss": 0.6213, + "step": 1769 + }, + { + "epoch": 0.46993229788928714, + "grad_norm": 0.3780403678137773, + "learning_rate": 4.932319626364947e-06, + "loss": 0.6081, + "step": 1770 + }, + { + "epoch": 0.4701977963626709, + "grad_norm": 0.3849828428519195, + "learning_rate": 4.932238916022249e-06, + "loss": 0.5926, + "step": 1771 + }, + { + "epoch": 0.4704632948360547, + "grad_norm": 0.3954503159594688, + "learning_rate": 4.9321581582449365e-06, + "loss": 0.6444, + "step": 1772 + }, + { + "epoch": 0.47072879330943845, + "grad_norm": 0.38946945581332887, + "learning_rate": 4.932077353034585e-06, + "loss": 0.6281, + "step": 1773 + }, + { + "epoch": 0.47099429178282226, + "grad_norm": 0.38524665261405866, + "learning_rate": 4.931996500392772e-06, + "loss": 0.6241, + "step": 1774 + }, + { + "epoch": 0.471259790256206, + "grad_norm": 0.3856937094015711, + "learning_rate": 4.931915600321071e-06, + "loss": 0.5861, + "step": 1775 + }, + { + "epoch": 0.4715252887295898, + "grad_norm": 0.382896892369575, + "learning_rate": 4.931834652821063e-06, + "loss": 0.6539, + "step": 1776 + }, + { + "epoch": 0.47179078720297357, + "grad_norm": 0.42227343563372355, + "learning_rate": 4.9317536578943235e-06, + "loss": 0.6164, + "step": 1777 + }, + { + "epoch": 0.4720562856763574, + "grad_norm": 0.3792211153051177, + "learning_rate": 4.931672615542434e-06, + "loss": 0.6097, + "step": 1778 + }, + { + "epoch": 0.47232178414974113, + "grad_norm": 0.40435135045283704, + "learning_rate": 4.931591525766976e-06, + "loss": 0.6529, + "step": 1779 + }, + { + "epoch": 0.47258728262312494, + "grad_norm": 0.3840802542840774, + "learning_rate": 4.931510388569528e-06, + "loss": 0.6399, + "step": 1780 + }, + { + "epoch": 0.4728527810965087, + "grad_norm": 0.3986134311166046, + "learning_rate": 4.9314292039516755e-06, + "loss": 0.6221, + "step": 1781 + }, + { + "epoch": 0.4731182795698925, + "grad_norm": 0.3897418395704273, + "learning_rate": 4.931347971914999e-06, + "loss": 0.6734, + "step": 1782 + }, + { + "epoch": 0.47338377804327625, + "grad_norm": 0.40374280754299857, + "learning_rate": 4.931266692461085e-06, + "loss": 0.649, + "step": 1783 + }, + { + "epoch": 0.47364927651666006, + "grad_norm": 0.4150821891451634, + "learning_rate": 4.9311853655915175e-06, + "loss": 0.6364, + "step": 1784 + }, + { + "epoch": 0.4739147749900438, + "grad_norm": 0.3730492094868012, + "learning_rate": 4.931103991307883e-06, + "loss": 0.64, + "step": 1785 + }, + { + "epoch": 0.47418027346342756, + "grad_norm": 0.3793974882897054, + "learning_rate": 4.931022569611768e-06, + "loss": 0.6702, + "step": 1786 + }, + { + "epoch": 0.47444577193681137, + "grad_norm": 0.3992342594127169, + "learning_rate": 4.930941100504761e-06, + "loss": 0.6377, + "step": 1787 + }, + { + "epoch": 0.4747112704101951, + "grad_norm": 0.4613974070846031, + "learning_rate": 4.93085958398845e-06, + "loss": 0.6258, + "step": 1788 + }, + { + "epoch": 0.4749767688835789, + "grad_norm": 0.3869638506793003, + "learning_rate": 4.930778020064426e-06, + "loss": 0.6334, + "step": 1789 + }, + { + "epoch": 0.4752422673569627, + "grad_norm": 0.43960079157031906, + "learning_rate": 4.930696408734278e-06, + "loss": 0.6033, + "step": 1790 + }, + { + "epoch": 0.4755077658303465, + "grad_norm": 0.39071061415164876, + "learning_rate": 4.9306147499995996e-06, + "loss": 0.6632, + "step": 1791 + }, + { + "epoch": 0.47577326430373024, + "grad_norm": 0.39240737775505524, + "learning_rate": 4.930533043861982e-06, + "loss": 0.6597, + "step": 1792 + }, + { + "epoch": 0.47603876277711404, + "grad_norm": 0.3850044943362015, + "learning_rate": 4.930451290323019e-06, + "loss": 0.6189, + "step": 1793 + }, + { + "epoch": 0.4763042612504978, + "grad_norm": 0.4371740593999468, + "learning_rate": 4.9303694893843065e-06, + "loss": 0.639, + "step": 1794 + }, + { + "epoch": 0.4765697597238816, + "grad_norm": 0.40516604641583126, + "learning_rate": 4.930287641047437e-06, + "loss": 0.6383, + "step": 1795 + }, + { + "epoch": 0.47683525819726535, + "grad_norm": 0.38546254096400157, + "learning_rate": 4.9302057453140085e-06, + "loss": 0.6293, + "step": 1796 + }, + { + "epoch": 0.47710075667064916, + "grad_norm": 0.4034326852028295, + "learning_rate": 4.930123802185618e-06, + "loss": 0.6505, + "step": 1797 + }, + { + "epoch": 0.4773662551440329, + "grad_norm": 0.40494528447050565, + "learning_rate": 4.9300418116638625e-06, + "loss": 0.6287, + "step": 1798 + }, + { + "epoch": 0.4776317536174167, + "grad_norm": 0.3821707943680172, + "learning_rate": 4.929959773750342e-06, + "loss": 0.6446, + "step": 1799 + }, + { + "epoch": 0.4778972520908005, + "grad_norm": 0.39275736714185777, + "learning_rate": 4.929877688446657e-06, + "loss": 0.6369, + "step": 1800 + }, + { + "epoch": 0.4781627505641843, + "grad_norm": 0.42733990423350493, + "learning_rate": 4.929795555754407e-06, + "loss": 0.6399, + "step": 1801 + }, + { + "epoch": 0.47842824903756803, + "grad_norm": 0.3981819973597323, + "learning_rate": 4.929713375675196e-06, + "loss": 0.6239, + "step": 1802 + }, + { + "epoch": 0.47869374751095184, + "grad_norm": 0.36912415854551983, + "learning_rate": 4.929631148210624e-06, + "loss": 0.6336, + "step": 1803 + }, + { + "epoch": 0.4789592459843356, + "grad_norm": 0.42297729082895735, + "learning_rate": 4.929548873362297e-06, + "loss": 0.6176, + "step": 1804 + }, + { + "epoch": 0.47922474445771934, + "grad_norm": 0.4065210066685864, + "learning_rate": 4.929466551131816e-06, + "loss": 0.6289, + "step": 1805 + }, + { + "epoch": 0.47949024293110315, + "grad_norm": 0.38213165971721497, + "learning_rate": 4.929384181520791e-06, + "loss": 0.6381, + "step": 1806 + }, + { + "epoch": 0.4797557414044869, + "grad_norm": 0.3827911908875227, + "learning_rate": 4.929301764530826e-06, + "loss": 0.6069, + "step": 1807 + }, + { + "epoch": 0.4800212398778707, + "grad_norm": 0.4111573647662111, + "learning_rate": 4.929219300163528e-06, + "loss": 0.621, + "step": 1808 + }, + { + "epoch": 0.48028673835125446, + "grad_norm": 0.37363367624712823, + "learning_rate": 4.929136788420507e-06, + "loss": 0.6097, + "step": 1809 + }, + { + "epoch": 0.48055223682463827, + "grad_norm": 0.38187753033938593, + "learning_rate": 4.92905422930337e-06, + "loss": 0.6182, + "step": 1810 + }, + { + "epoch": 0.480817735298022, + "grad_norm": 0.3928287873363872, + "learning_rate": 4.928971622813728e-06, + "loss": 0.6448, + "step": 1811 + }, + { + "epoch": 0.4810832337714058, + "grad_norm": 0.4317048263471141, + "learning_rate": 4.928888968953193e-06, + "loss": 0.623, + "step": 1812 + }, + { + "epoch": 0.4813487322447896, + "grad_norm": 0.3839940387112986, + "learning_rate": 4.9288062677233756e-06, + "loss": 0.6157, + "step": 1813 + }, + { + "epoch": 0.4816142307181734, + "grad_norm": 0.38336127182567864, + "learning_rate": 4.92872351912589e-06, + "loss": 0.6468, + "step": 1814 + }, + { + "epoch": 0.48187972919155714, + "grad_norm": 0.43882607404615626, + "learning_rate": 4.9286407231623476e-06, + "loss": 0.6676, + "step": 1815 + }, + { + "epoch": 0.48214522766494095, + "grad_norm": 0.43762663715889866, + "learning_rate": 4.928557879834366e-06, + "loss": 0.6156, + "step": 1816 + }, + { + "epoch": 0.4824107261383247, + "grad_norm": 0.3991874039101594, + "learning_rate": 4.92847498914356e-06, + "loss": 0.6445, + "step": 1817 + }, + { + "epoch": 0.4826762246117085, + "grad_norm": 0.43988804884879057, + "learning_rate": 4.928392051091545e-06, + "loss": 0.5833, + "step": 1818 + }, + { + "epoch": 0.48294172308509226, + "grad_norm": 0.40037173837794393, + "learning_rate": 4.92830906567994e-06, + "loss": 0.6617, + "step": 1819 + }, + { + "epoch": 0.48320722155847606, + "grad_norm": 0.5218895118820784, + "learning_rate": 4.9282260329103614e-06, + "loss": 0.6237, + "step": 1820 + }, + { + "epoch": 0.4834727200318598, + "grad_norm": 0.39367543447004505, + "learning_rate": 4.928142952784431e-06, + "loss": 0.6131, + "step": 1821 + }, + { + "epoch": 0.4837382185052436, + "grad_norm": 0.4803276478832675, + "learning_rate": 4.928059825303768e-06, + "loss": 0.5896, + "step": 1822 + }, + { + "epoch": 0.4840037169786274, + "grad_norm": 0.40419417536708496, + "learning_rate": 4.927976650469993e-06, + "loss": 0.6523, + "step": 1823 + }, + { + "epoch": 0.4842692154520111, + "grad_norm": 0.3851814826644902, + "learning_rate": 4.927893428284728e-06, + "loss": 0.6691, + "step": 1824 + }, + { + "epoch": 0.48453471392539493, + "grad_norm": 0.4364195716483101, + "learning_rate": 4.927810158749597e-06, + "loss": 0.5915, + "step": 1825 + }, + { + "epoch": 0.4848002123987787, + "grad_norm": 0.4587611935637998, + "learning_rate": 4.927726841866225e-06, + "loss": 0.6119, + "step": 1826 + }, + { + "epoch": 0.4850657108721625, + "grad_norm": 0.38441463311340074, + "learning_rate": 4.927643477636234e-06, + "loss": 0.6193, + "step": 1827 + }, + { + "epoch": 0.48533120934554624, + "grad_norm": 0.3699694153904709, + "learning_rate": 4.927560066061251e-06, + "loss": 0.6252, + "step": 1828 + }, + { + "epoch": 0.48559670781893005, + "grad_norm": 0.446749391341119, + "learning_rate": 4.927476607142904e-06, + "loss": 0.6393, + "step": 1829 + }, + { + "epoch": 0.4858622062923138, + "grad_norm": 0.4180574613486697, + "learning_rate": 4.92739310088282e-06, + "loss": 0.599, + "step": 1830 + }, + { + "epoch": 0.4861277047656976, + "grad_norm": 0.375267555779444, + "learning_rate": 4.927309547282626e-06, + "loss": 0.6389, + "step": 1831 + }, + { + "epoch": 0.48639320323908136, + "grad_norm": 0.400155794848709, + "learning_rate": 4.927225946343954e-06, + "loss": 0.6436, + "step": 1832 + }, + { + "epoch": 0.48665870171246517, + "grad_norm": 0.4662098221918315, + "learning_rate": 4.927142298068432e-06, + "loss": 0.607, + "step": 1833 + }, + { + "epoch": 0.4869242001858489, + "grad_norm": 0.3877638533614041, + "learning_rate": 4.927058602457694e-06, + "loss": 0.5983, + "step": 1834 + }, + { + "epoch": 0.48718969865923273, + "grad_norm": 0.39973654597842606, + "learning_rate": 4.92697485951337e-06, + "loss": 0.6361, + "step": 1835 + }, + { + "epoch": 0.4874551971326165, + "grad_norm": 0.4067946433566237, + "learning_rate": 4.926891069237093e-06, + "loss": 0.6393, + "step": 1836 + }, + { + "epoch": 0.4877206956060003, + "grad_norm": 0.42188160943189607, + "learning_rate": 4.926807231630499e-06, + "loss": 0.612, + "step": 1837 + }, + { + "epoch": 0.48798619407938404, + "grad_norm": 0.4354009373050952, + "learning_rate": 4.926723346695222e-06, + "loss": 0.6072, + "step": 1838 + }, + { + "epoch": 0.48825169255276785, + "grad_norm": 0.4223277098623755, + "learning_rate": 4.926639414432898e-06, + "loss": 0.6073, + "step": 1839 + }, + { + "epoch": 0.4885171910261516, + "grad_norm": 0.4497404463001629, + "learning_rate": 4.926555434845164e-06, + "loss": 0.6079, + "step": 1840 + }, + { + "epoch": 0.48878268949953535, + "grad_norm": 0.4038820148101401, + "learning_rate": 4.926471407933658e-06, + "loss": 0.6057, + "step": 1841 + }, + { + "epoch": 0.48904818797291916, + "grad_norm": 0.38044712431509375, + "learning_rate": 4.926387333700018e-06, + "loss": 0.6292, + "step": 1842 + }, + { + "epoch": 0.4893136864463029, + "grad_norm": 0.4162725292710651, + "learning_rate": 4.926303212145885e-06, + "loss": 0.6521, + "step": 1843 + }, + { + "epoch": 0.4895791849196867, + "grad_norm": 0.38821500859141916, + "learning_rate": 4.926219043272899e-06, + "loss": 0.605, + "step": 1844 + }, + { + "epoch": 0.48984468339307047, + "grad_norm": 0.3952703625765734, + "learning_rate": 4.9261348270827004e-06, + "loss": 0.6602, + "step": 1845 + }, + { + "epoch": 0.4901101818664543, + "grad_norm": 0.37552225562544594, + "learning_rate": 4.926050563576932e-06, + "loss": 0.6489, + "step": 1846 + }, + { + "epoch": 0.490375680339838, + "grad_norm": 0.388530643313539, + "learning_rate": 4.925966252757238e-06, + "loss": 0.6347, + "step": 1847 + }, + { + "epoch": 0.49064117881322183, + "grad_norm": 0.4114483732608687, + "learning_rate": 4.925881894625263e-06, + "loss": 0.6333, + "step": 1848 + }, + { + "epoch": 0.4909066772866056, + "grad_norm": 0.3793648338354744, + "learning_rate": 4.925797489182651e-06, + "loss": 0.6559, + "step": 1849 + }, + { + "epoch": 0.4911721757599894, + "grad_norm": 0.37267369016981056, + "learning_rate": 4.925713036431049e-06, + "loss": 0.6239, + "step": 1850 + }, + { + "epoch": 0.49143767423337315, + "grad_norm": 0.3938862689139422, + "learning_rate": 4.9256285363721025e-06, + "loss": 0.6173, + "step": 1851 + }, + { + "epoch": 0.49170317270675695, + "grad_norm": 0.3859814816879519, + "learning_rate": 4.925543989007462e-06, + "loss": 0.6213, + "step": 1852 + }, + { + "epoch": 0.4919686711801407, + "grad_norm": 0.39389672074854576, + "learning_rate": 4.925459394338774e-06, + "loss": 0.6036, + "step": 1853 + }, + { + "epoch": 0.4922341696535245, + "grad_norm": 0.37010200453351183, + "learning_rate": 4.92537475236769e-06, + "loss": 0.5774, + "step": 1854 + }, + { + "epoch": 0.49249966812690826, + "grad_norm": 0.38274026855741344, + "learning_rate": 4.925290063095859e-06, + "loss": 0.592, + "step": 1855 + }, + { + "epoch": 0.49276516660029207, + "grad_norm": 0.3965939670089687, + "learning_rate": 4.925205326524934e-06, + "loss": 0.5765, + "step": 1856 + }, + { + "epoch": 0.4930306650736758, + "grad_norm": 0.3815564350233024, + "learning_rate": 4.925120542656567e-06, + "loss": 0.6389, + "step": 1857 + }, + { + "epoch": 0.49329616354705963, + "grad_norm": 0.3938485307172248, + "learning_rate": 4.925035711492413e-06, + "loss": 0.6484, + "step": 1858 + }, + { + "epoch": 0.4935616620204434, + "grad_norm": 0.3957464540329845, + "learning_rate": 4.924950833034124e-06, + "loss": 0.6379, + "step": 1859 + }, + { + "epoch": 0.49382716049382713, + "grad_norm": 0.41618181645568386, + "learning_rate": 4.924865907283356e-06, + "loss": 0.5952, + "step": 1860 + }, + { + "epoch": 0.49409265896721094, + "grad_norm": 0.38897976703843057, + "learning_rate": 4.924780934241766e-06, + "loss": 0.6365, + "step": 1861 + }, + { + "epoch": 0.4943581574405947, + "grad_norm": 0.3909324650568712, + "learning_rate": 4.924695913911011e-06, + "loss": 0.6289, + "step": 1862 + }, + { + "epoch": 0.4946236559139785, + "grad_norm": 0.394899926105948, + "learning_rate": 4.924610846292751e-06, + "loss": 0.6087, + "step": 1863 + }, + { + "epoch": 0.49488915438736225, + "grad_norm": 0.41135468729534785, + "learning_rate": 4.924525731388641e-06, + "loss": 0.5911, + "step": 1864 + }, + { + "epoch": 0.49515465286074606, + "grad_norm": 0.39475059271360924, + "learning_rate": 4.924440569200343e-06, + "loss": 0.6319, + "step": 1865 + }, + { + "epoch": 0.4954201513341298, + "grad_norm": 0.44822777827399035, + "learning_rate": 4.924355359729518e-06, + "loss": 0.5802, + "step": 1866 + }, + { + "epoch": 0.4956856498075136, + "grad_norm": 0.37686364393220034, + "learning_rate": 4.924270102977828e-06, + "loss": 0.6389, + "step": 1867 + }, + { + "epoch": 0.49595114828089737, + "grad_norm": 0.37756133814720716, + "learning_rate": 4.924184798946935e-06, + "loss": 0.6604, + "step": 1868 + }, + { + "epoch": 0.4962166467542812, + "grad_norm": 0.3806651847473416, + "learning_rate": 4.924099447638503e-06, + "loss": 0.6396, + "step": 1869 + }, + { + "epoch": 0.49648214522766493, + "grad_norm": 0.40260430574696265, + "learning_rate": 4.924014049054197e-06, + "loss": 0.6566, + "step": 1870 + }, + { + "epoch": 0.49674764370104874, + "grad_norm": 0.3993604109813291, + "learning_rate": 4.923928603195682e-06, + "loss": 0.5899, + "step": 1871 + }, + { + "epoch": 0.4970131421744325, + "grad_norm": 0.3932748029941721, + "learning_rate": 4.923843110064624e-06, + "loss": 0.6216, + "step": 1872 + }, + { + "epoch": 0.4972786406478163, + "grad_norm": 0.3870168233713288, + "learning_rate": 4.9237575696626905e-06, + "loss": 0.631, + "step": 1873 + }, + { + "epoch": 0.49754413912120005, + "grad_norm": 0.3930017732892036, + "learning_rate": 4.92367198199155e-06, + "loss": 0.6544, + "step": 1874 + }, + { + "epoch": 0.49780963759458385, + "grad_norm": 0.37701820531803265, + "learning_rate": 4.923586347052872e-06, + "loss": 0.6274, + "step": 1875 + }, + { + "epoch": 0.4980751360679676, + "grad_norm": 0.3823095560780843, + "learning_rate": 4.923500664848327e-06, + "loss": 0.6287, + "step": 1876 + }, + { + "epoch": 0.4983406345413514, + "grad_norm": 0.3864494476918735, + "learning_rate": 4.923414935379584e-06, + "loss": 0.6253, + "step": 1877 + }, + { + "epoch": 0.49860613301473516, + "grad_norm": 0.3948185098943959, + "learning_rate": 4.9233291586483165e-06, + "loss": 0.6448, + "step": 1878 + }, + { + "epoch": 0.4988716314881189, + "grad_norm": 0.3773610927653805, + "learning_rate": 4.923243334656198e-06, + "loss": 0.5942, + "step": 1879 + }, + { + "epoch": 0.4991371299615027, + "grad_norm": 0.3764972842969289, + "learning_rate": 4.9231574634049005e-06, + "loss": 0.6336, + "step": 1880 + }, + { + "epoch": 0.4994026284348865, + "grad_norm": 0.39257912049596666, + "learning_rate": 4.9230715448960995e-06, + "loss": 0.6022, + "step": 1881 + }, + { + "epoch": 0.4996681269082703, + "grad_norm": 0.37839307617243384, + "learning_rate": 4.922985579131471e-06, + "loss": 0.6447, + "step": 1882 + }, + { + "epoch": 0.49993362538165403, + "grad_norm": 0.3746841193048756, + "learning_rate": 4.922899566112691e-06, + "loss": 0.5758, + "step": 1883 + }, + { + "epoch": 0.5001991238550378, + "grad_norm": 0.39589685535459257, + "learning_rate": 4.922813505841438e-06, + "loss": 0.6123, + "step": 1884 + }, + { + "epoch": 0.5004646223284216, + "grad_norm": 0.3998132706267363, + "learning_rate": 4.922727398319389e-06, + "loss": 0.606, + "step": 1885 + }, + { + "epoch": 0.5007301208018053, + "grad_norm": 0.37631711209643176, + "learning_rate": 4.922641243548223e-06, + "loss": 0.6253, + "step": 1886 + }, + { + "epoch": 0.5009956192751892, + "grad_norm": 0.372388653748478, + "learning_rate": 4.9225550415296226e-06, + "loss": 0.6033, + "step": 1887 + }, + { + "epoch": 0.501261117748573, + "grad_norm": 0.3939052723338677, + "learning_rate": 4.922468792265267e-06, + "loss": 0.6563, + "step": 1888 + }, + { + "epoch": 0.5015266162219567, + "grad_norm": 0.3772137272142795, + "learning_rate": 4.922382495756838e-06, + "loss": 0.635, + "step": 1889 + }, + { + "epoch": 0.5017921146953405, + "grad_norm": 0.38930701121697076, + "learning_rate": 4.92229615200602e-06, + "loss": 0.6286, + "step": 1890 + }, + { + "epoch": 0.5020576131687243, + "grad_norm": 0.3956069772498084, + "learning_rate": 4.9222097610144955e-06, + "loss": 0.6391, + "step": 1891 + }, + { + "epoch": 0.5023231116421081, + "grad_norm": 0.3923718045233908, + "learning_rate": 4.922123322783951e-06, + "loss": 0.6124, + "step": 1892 + }, + { + "epoch": 0.5025886101154918, + "grad_norm": 0.37514500669708245, + "learning_rate": 4.922036837316071e-06, + "loss": 0.619, + "step": 1893 + }, + { + "epoch": 0.5028541085888756, + "grad_norm": 0.40192421736446754, + "learning_rate": 4.921950304612543e-06, + "loss": 0.6053, + "step": 1894 + }, + { + "epoch": 0.5031196070622594, + "grad_norm": 0.379298999507021, + "learning_rate": 4.9218637246750535e-06, + "loss": 0.6295, + "step": 1895 + }, + { + "epoch": 0.5033851055356432, + "grad_norm": 0.37048689247873723, + "learning_rate": 4.921777097505291e-06, + "loss": 0.6595, + "step": 1896 + }, + { + "epoch": 0.503650604009027, + "grad_norm": 0.3694561367715235, + "learning_rate": 4.921690423104947e-06, + "loss": 0.6218, + "step": 1897 + }, + { + "epoch": 0.5039161024824107, + "grad_norm": 0.3730893334920156, + "learning_rate": 4.921603701475709e-06, + "loss": 0.6303, + "step": 1898 + }, + { + "epoch": 0.5041816009557945, + "grad_norm": 0.3766107856638912, + "learning_rate": 4.92151693261927e-06, + "loss": 0.6239, + "step": 1899 + }, + { + "epoch": 0.5044470994291783, + "grad_norm": 0.3908428042761929, + "learning_rate": 4.9214301165373225e-06, + "loss": 0.6026, + "step": 1900 + }, + { + "epoch": 0.5047125979025621, + "grad_norm": 0.3899714867806938, + "learning_rate": 4.921343253231558e-06, + "loss": 0.6405, + "step": 1901 + }, + { + "epoch": 0.5049780963759458, + "grad_norm": 0.3958765178921134, + "learning_rate": 4.9212563427036735e-06, + "loss": 0.6587, + "step": 1902 + }, + { + "epoch": 0.5052435948493296, + "grad_norm": 0.3939974386685129, + "learning_rate": 4.921169384955361e-06, + "loss": 0.6699, + "step": 1903 + }, + { + "epoch": 0.5055090933227134, + "grad_norm": 0.38011296113512727, + "learning_rate": 4.921082379988317e-06, + "loss": 0.65, + "step": 1904 + }, + { + "epoch": 0.5057745917960972, + "grad_norm": 0.3797311677169585, + "learning_rate": 4.92099532780424e-06, + "loss": 0.6685, + "step": 1905 + }, + { + "epoch": 0.5060400902694809, + "grad_norm": 0.37817001612046675, + "learning_rate": 4.9209082284048245e-06, + "loss": 0.6321, + "step": 1906 + }, + { + "epoch": 0.5063055887428647, + "grad_norm": 0.3856365392957159, + "learning_rate": 4.9208210817917735e-06, + "loss": 0.6152, + "step": 1907 + }, + { + "epoch": 0.5065710872162486, + "grad_norm": 0.3867586074534019, + "learning_rate": 4.920733887966783e-06, + "loss": 0.6274, + "step": 1908 + }, + { + "epoch": 0.5068365856896323, + "grad_norm": 0.3766034581587804, + "learning_rate": 4.920646646931554e-06, + "loss": 0.6002, + "step": 1909 + }, + { + "epoch": 0.507102084163016, + "grad_norm": 0.3811016647787012, + "learning_rate": 4.92055935868779e-06, + "loss": 0.6141, + "step": 1910 + }, + { + "epoch": 0.5073675826363998, + "grad_norm": 0.386717603058035, + "learning_rate": 4.920472023237191e-06, + "loss": 0.6163, + "step": 1911 + }, + { + "epoch": 0.5076330811097837, + "grad_norm": 0.3927583755703415, + "learning_rate": 4.9203846405814616e-06, + "loss": 0.6331, + "step": 1912 + }, + { + "epoch": 0.5078985795831674, + "grad_norm": 0.3815325638823032, + "learning_rate": 4.920297210722306e-06, + "loss": 0.6331, + "step": 1913 + }, + { + "epoch": 0.5081640780565512, + "grad_norm": 0.38050871211737247, + "learning_rate": 4.920209733661427e-06, + "loss": 0.651, + "step": 1914 + }, + { + "epoch": 0.5084295765299349, + "grad_norm": 0.38264302361312386, + "learning_rate": 4.920122209400535e-06, + "loss": 0.6577, + "step": 1915 + }, + { + "epoch": 0.5086950750033188, + "grad_norm": 0.3858956475533456, + "learning_rate": 4.920034637941334e-06, + "loss": 0.6397, + "step": 1916 + }, + { + "epoch": 0.5089605734767025, + "grad_norm": 0.39106068747768324, + "learning_rate": 4.91994701928553e-06, + "loss": 0.6389, + "step": 1917 + }, + { + "epoch": 0.5092260719500863, + "grad_norm": 0.3962571386974248, + "learning_rate": 4.919859353434836e-06, + "loss": 0.6127, + "step": 1918 + }, + { + "epoch": 0.50949157042347, + "grad_norm": 0.3919897578088484, + "learning_rate": 4.91977164039096e-06, + "loss": 0.6568, + "step": 1919 + }, + { + "epoch": 0.5097570688968538, + "grad_norm": 0.3885273390315128, + "learning_rate": 4.9196838801556115e-06, + "loss": 0.617, + "step": 1920 + }, + { + "epoch": 0.5100225673702377, + "grad_norm": 0.3644433260512407, + "learning_rate": 4.919596072730504e-06, + "loss": 0.5982, + "step": 1921 + }, + { + "epoch": 0.5102880658436214, + "grad_norm": 0.37282171537928127, + "learning_rate": 4.919508218117348e-06, + "loss": 0.6375, + "step": 1922 + }, + { + "epoch": 0.5105535643170052, + "grad_norm": 0.3799837899155145, + "learning_rate": 4.919420316317858e-06, + "loss": 0.6373, + "step": 1923 + }, + { + "epoch": 0.5108190627903889, + "grad_norm": 0.37946793023993824, + "learning_rate": 4.919332367333748e-06, + "loss": 0.6057, + "step": 1924 + }, + { + "epoch": 0.5110845612637728, + "grad_norm": 0.40069051623420443, + "learning_rate": 4.919244371166733e-06, + "loss": 0.608, + "step": 1925 + }, + { + "epoch": 0.5113500597371565, + "grad_norm": 0.3802355423096846, + "learning_rate": 4.919156327818529e-06, + "loss": 0.643, + "step": 1926 + }, + { + "epoch": 0.5116155582105403, + "grad_norm": 0.39186629265062317, + "learning_rate": 4.919068237290855e-06, + "loss": 0.6291, + "step": 1927 + }, + { + "epoch": 0.511881056683924, + "grad_norm": 0.4238564062674733, + "learning_rate": 4.9189800995854266e-06, + "loss": 0.6116, + "step": 1928 + }, + { + "epoch": 0.5121465551573079, + "grad_norm": 0.3834861960942196, + "learning_rate": 4.918891914703964e-06, + "loss": 0.6637, + "step": 1929 + }, + { + "epoch": 0.5124120536306916, + "grad_norm": 0.39934349711203193, + "learning_rate": 4.918803682648186e-06, + "loss": 0.6033, + "step": 1930 + }, + { + "epoch": 0.5126775521040754, + "grad_norm": 0.4200366020980555, + "learning_rate": 4.918715403419813e-06, + "loss": 0.5966, + "step": 1931 + }, + { + "epoch": 0.5129430505774591, + "grad_norm": 0.376407814148563, + "learning_rate": 4.91862707702057e-06, + "loss": 0.6287, + "step": 1932 + }, + { + "epoch": 0.513208549050843, + "grad_norm": 0.40309078261311976, + "learning_rate": 4.918538703452176e-06, + "loss": 0.6408, + "step": 1933 + }, + { + "epoch": 0.5134740475242268, + "grad_norm": 0.3819125400165194, + "learning_rate": 4.9184502827163555e-06, + "loss": 0.6342, + "step": 1934 + }, + { + "epoch": 0.5137395459976105, + "grad_norm": 0.40529513749499674, + "learning_rate": 4.918361814814833e-06, + "loss": 0.6229, + "step": 1935 + }, + { + "epoch": 0.5140050444709943, + "grad_norm": 0.38053152431488924, + "learning_rate": 4.918273299749334e-06, + "loss": 0.6413, + "step": 1936 + }, + { + "epoch": 0.514270542944378, + "grad_norm": 0.38757573142446594, + "learning_rate": 4.918184737521585e-06, + "loss": 0.6348, + "step": 1937 + }, + { + "epoch": 0.5145360414177619, + "grad_norm": 0.3818057490279428, + "learning_rate": 4.918096128133313e-06, + "loss": 0.5663, + "step": 1938 + }, + { + "epoch": 0.5148015398911456, + "grad_norm": 0.4005751737622086, + "learning_rate": 4.918007471586246e-06, + "loss": 0.6036, + "step": 1939 + }, + { + "epoch": 0.5150670383645294, + "grad_norm": 0.39270978192568645, + "learning_rate": 4.917918767882113e-06, + "loss": 0.6767, + "step": 1940 + }, + { + "epoch": 0.5153325368379131, + "grad_norm": 0.3763477916252793, + "learning_rate": 4.917830017022643e-06, + "loss": 0.6385, + "step": 1941 + }, + { + "epoch": 0.515598035311297, + "grad_norm": 0.39086733670678614, + "learning_rate": 4.917741219009569e-06, + "loss": 0.632, + "step": 1942 + }, + { + "epoch": 0.5158635337846808, + "grad_norm": 0.40244996628122104, + "learning_rate": 4.917652373844621e-06, + "loss": 0.6464, + "step": 1943 + }, + { + "epoch": 0.5161290322580645, + "grad_norm": 0.39273291272186367, + "learning_rate": 4.917563481529532e-06, + "loss": 0.6733, + "step": 1944 + }, + { + "epoch": 0.5163945307314483, + "grad_norm": 0.39209331331117075, + "learning_rate": 4.9174745420660365e-06, + "loss": 0.6217, + "step": 1945 + }, + { + "epoch": 0.5166600292048321, + "grad_norm": 0.3839363807793837, + "learning_rate": 4.917385555455868e-06, + "loss": 0.6422, + "step": 1946 + }, + { + "epoch": 0.5169255276782159, + "grad_norm": 0.43240063995311945, + "learning_rate": 4.917296521700763e-06, + "loss": 0.6074, + "step": 1947 + }, + { + "epoch": 0.5171910261515996, + "grad_norm": 0.41508983897325225, + "learning_rate": 4.917207440802456e-06, + "loss": 0.6111, + "step": 1948 + }, + { + "epoch": 0.5174565246249834, + "grad_norm": 0.41016295935876546, + "learning_rate": 4.9171183127626865e-06, + "loss": 0.6234, + "step": 1949 + }, + { + "epoch": 0.5177220230983672, + "grad_norm": 0.3954904487286936, + "learning_rate": 4.917029137583192e-06, + "loss": 0.5951, + "step": 1950 + }, + { + "epoch": 0.517987521571751, + "grad_norm": 0.394923398565474, + "learning_rate": 4.916939915265711e-06, + "loss": 0.6198, + "step": 1951 + }, + { + "epoch": 0.5182530200451347, + "grad_norm": 0.48502633921023514, + "learning_rate": 4.916850645811984e-06, + "loss": 0.6094, + "step": 1952 + }, + { + "epoch": 0.5185185185185185, + "grad_norm": 0.3786627461567855, + "learning_rate": 4.916761329223752e-06, + "loss": 0.6312, + "step": 1953 + }, + { + "epoch": 0.5187840169919022, + "grad_norm": 0.40261587125930187, + "learning_rate": 4.916671965502757e-06, + "loss": 0.6369, + "step": 1954 + }, + { + "epoch": 0.5190495154652861, + "grad_norm": 0.41534193470608577, + "learning_rate": 4.916582554650741e-06, + "loss": 0.6197, + "step": 1955 + }, + { + "epoch": 0.5193150139386699, + "grad_norm": 0.46011395269336497, + "learning_rate": 4.916493096669449e-06, + "loss": 0.6118, + "step": 1956 + }, + { + "epoch": 0.5195805124120536, + "grad_norm": 0.40225877249385233, + "learning_rate": 4.916403591560625e-06, + "loss": 0.6054, + "step": 1957 + }, + { + "epoch": 0.5198460108854374, + "grad_norm": 0.415958633591973, + "learning_rate": 4.916314039326014e-06, + "loss": 0.6552, + "step": 1958 + }, + { + "epoch": 0.5201115093588212, + "grad_norm": 0.46886199065225426, + "learning_rate": 4.916224439967364e-06, + "loss": 0.6294, + "step": 1959 + }, + { + "epoch": 0.520377007832205, + "grad_norm": 0.40599814251803173, + "learning_rate": 4.91613479348642e-06, + "loss": 0.6553, + "step": 1960 + }, + { + "epoch": 0.5206425063055887, + "grad_norm": 0.3836047734668553, + "learning_rate": 4.916045099884934e-06, + "loss": 0.6135, + "step": 1961 + }, + { + "epoch": 0.5209080047789725, + "grad_norm": 0.3856827309329937, + "learning_rate": 4.915955359164652e-06, + "loss": 0.6171, + "step": 1962 + }, + { + "epoch": 0.5211735032523563, + "grad_norm": 0.40526618476236925, + "learning_rate": 4.915865571327324e-06, + "loss": 0.6311, + "step": 1963 + }, + { + "epoch": 0.5214390017257401, + "grad_norm": 0.38913229193634735, + "learning_rate": 4.915775736374704e-06, + "loss": 0.659, + "step": 1964 + }, + { + "epoch": 0.5217045001991238, + "grad_norm": 0.38549491006600756, + "learning_rate": 4.915685854308542e-06, + "loss": 0.6519, + "step": 1965 + }, + { + "epoch": 0.5219699986725076, + "grad_norm": 0.3960830305499255, + "learning_rate": 4.915595925130591e-06, + "loss": 0.6061, + "step": 1966 + }, + { + "epoch": 0.5222354971458915, + "grad_norm": 0.3766447961195249, + "learning_rate": 4.915505948842604e-06, + "loss": 0.5812, + "step": 1967 + }, + { + "epoch": 0.5225009956192752, + "grad_norm": 0.5873945390769875, + "learning_rate": 4.915415925446338e-06, + "loss": 0.586, + "step": 1968 + }, + { + "epoch": 0.522766494092659, + "grad_norm": 0.3947862157472956, + "learning_rate": 4.9153258549435474e-06, + "loss": 0.6375, + "step": 1969 + }, + { + "epoch": 0.5230319925660427, + "grad_norm": 0.4009150621680005, + "learning_rate": 4.915235737335989e-06, + "loss": 0.6542, + "step": 1970 + }, + { + "epoch": 0.5232974910394266, + "grad_norm": 0.39546563680275615, + "learning_rate": 4.915145572625419e-06, + "loss": 0.6377, + "step": 1971 + }, + { + "epoch": 0.5235629895128103, + "grad_norm": 0.39279509113501543, + "learning_rate": 4.9150553608135985e-06, + "loss": 0.6694, + "step": 1972 + }, + { + "epoch": 0.5238284879861941, + "grad_norm": 0.3912170569135838, + "learning_rate": 4.914965101902285e-06, + "loss": 0.6344, + "step": 1973 + }, + { + "epoch": 0.5240939864595778, + "grad_norm": 0.39161873637847233, + "learning_rate": 4.914874795893239e-06, + "loss": 0.6263, + "step": 1974 + }, + { + "epoch": 0.5243594849329616, + "grad_norm": 0.3850736598857089, + "learning_rate": 4.914784442788222e-06, + "loss": 0.6469, + "step": 1975 + }, + { + "epoch": 0.5246249834063454, + "grad_norm": 0.3826200214861477, + "learning_rate": 4.914694042588995e-06, + "loss": 0.6107, + "step": 1976 + }, + { + "epoch": 0.5248904818797292, + "grad_norm": 0.41095595612741176, + "learning_rate": 4.914603595297324e-06, + "loss": 0.5918, + "step": 1977 + }, + { + "epoch": 0.525155980353113, + "grad_norm": 0.37871380973928637, + "learning_rate": 4.914513100914969e-06, + "loss": 0.6429, + "step": 1978 + }, + { + "epoch": 0.5254214788264967, + "grad_norm": 0.38368522679277045, + "learning_rate": 4.914422559443698e-06, + "loss": 0.5959, + "step": 1979 + }, + { + "epoch": 0.5256869772998806, + "grad_norm": 0.3828986814526875, + "learning_rate": 4.914331970885275e-06, + "loss": 0.6302, + "step": 1980 + }, + { + "epoch": 0.5259524757732643, + "grad_norm": 0.3981471908610477, + "learning_rate": 4.914241335241468e-06, + "loss": 0.6474, + "step": 1981 + }, + { + "epoch": 0.5262179742466481, + "grad_norm": 0.41024766901173804, + "learning_rate": 4.914150652514043e-06, + "loss": 0.6081, + "step": 1982 + }, + { + "epoch": 0.5264834727200318, + "grad_norm": 0.4124643430761794, + "learning_rate": 4.914059922704769e-06, + "loss": 0.6388, + "step": 1983 + }, + { + "epoch": 0.5267489711934157, + "grad_norm": 0.40345225455287465, + "learning_rate": 4.913969145815417e-06, + "loss": 0.6567, + "step": 1984 + }, + { + "epoch": 0.5270144696667994, + "grad_norm": 0.38991503410506784, + "learning_rate": 4.9138783218477556e-06, + "loss": 0.6512, + "step": 1985 + }, + { + "epoch": 0.5272799681401832, + "grad_norm": 0.39642400756437307, + "learning_rate": 4.913787450803557e-06, + "loss": 0.6559, + "step": 1986 + }, + { + "epoch": 0.5275454666135669, + "grad_norm": 0.40064833637557923, + "learning_rate": 4.913696532684593e-06, + "loss": 0.6459, + "step": 1987 + }, + { + "epoch": 0.5278109650869508, + "grad_norm": 0.38159091614178725, + "learning_rate": 4.913605567492636e-06, + "loss": 0.6303, + "step": 1988 + }, + { + "epoch": 0.5280764635603346, + "grad_norm": 0.37690842226897187, + "learning_rate": 4.913514555229463e-06, + "loss": 0.6044, + "step": 1989 + }, + { + "epoch": 0.5283419620337183, + "grad_norm": 0.40763192542970106, + "learning_rate": 4.913423495896845e-06, + "loss": 0.652, + "step": 1990 + }, + { + "epoch": 0.5286074605071021, + "grad_norm": 0.37128921995742165, + "learning_rate": 4.91333238949656e-06, + "loss": 0.6263, + "step": 1991 + }, + { + "epoch": 0.5288729589804858, + "grad_norm": 0.38138438301490124, + "learning_rate": 4.9132412360303855e-06, + "loss": 0.6325, + "step": 1992 + }, + { + "epoch": 0.5291384574538697, + "grad_norm": 0.389781491818443, + "learning_rate": 4.9131500355000975e-06, + "loss": 0.6023, + "step": 1993 + }, + { + "epoch": 0.5294039559272534, + "grad_norm": 0.3809007368154659, + "learning_rate": 4.913058787907476e-06, + "loss": 0.6644, + "step": 1994 + }, + { + "epoch": 0.5296694544006372, + "grad_norm": 0.37908606140321605, + "learning_rate": 4.9129674932542996e-06, + "loss": 0.6367, + "step": 1995 + }, + { + "epoch": 0.5299349528740209, + "grad_norm": 0.38096417942253313, + "learning_rate": 4.912876151542349e-06, + "loss": 0.6415, + "step": 1996 + }, + { + "epoch": 0.5302004513474048, + "grad_norm": 0.38040443064803364, + "learning_rate": 4.912784762773406e-06, + "loss": 0.6529, + "step": 1997 + }, + { + "epoch": 0.5304659498207885, + "grad_norm": 0.389670144775674, + "learning_rate": 4.912693326949252e-06, + "loss": 0.597, + "step": 1998 + }, + { + "epoch": 0.5307314482941723, + "grad_norm": 0.36797849256191056, + "learning_rate": 4.912601844071671e-06, + "loss": 0.6218, + "step": 1999 + }, + { + "epoch": 0.530996946767556, + "grad_norm": 0.3887401473220355, + "learning_rate": 4.912510314142448e-06, + "loss": 0.6188, + "step": 2000 + }, + { + "epoch": 0.5312624452409399, + "grad_norm": 0.394670833903728, + "learning_rate": 4.912418737163366e-06, + "loss": 0.6261, + "step": 2001 + }, + { + "epoch": 0.5315279437143237, + "grad_norm": 0.3991550639507201, + "learning_rate": 4.912327113136212e-06, + "loss": 0.6313, + "step": 2002 + }, + { + "epoch": 0.5317934421877074, + "grad_norm": 0.41440807456952067, + "learning_rate": 4.912235442062773e-06, + "loss": 0.6428, + "step": 2003 + }, + { + "epoch": 0.5320589406610912, + "grad_norm": 0.39260958449403605, + "learning_rate": 4.9121437239448376e-06, + "loss": 0.6329, + "step": 2004 + }, + { + "epoch": 0.532324439134475, + "grad_norm": 0.4215609059821753, + "learning_rate": 4.912051958784193e-06, + "loss": 0.6451, + "step": 2005 + }, + { + "epoch": 0.5325899376078588, + "grad_norm": 0.44110540921989233, + "learning_rate": 4.911960146582629e-06, + "loss": 0.6123, + "step": 2006 + }, + { + "epoch": 0.5328554360812425, + "grad_norm": 0.3919226211228559, + "learning_rate": 4.911868287341936e-06, + "loss": 0.6093, + "step": 2007 + }, + { + "epoch": 0.5331209345546263, + "grad_norm": 0.39614996621878684, + "learning_rate": 4.911776381063908e-06, + "loss": 0.5863, + "step": 2008 + }, + { + "epoch": 0.5333864330280101, + "grad_norm": 0.3842995675771415, + "learning_rate": 4.911684427750335e-06, + "loss": 0.6377, + "step": 2009 + }, + { + "epoch": 0.5336519315013939, + "grad_norm": 0.38487675830179735, + "learning_rate": 4.91159242740301e-06, + "loss": 0.6142, + "step": 2010 + }, + { + "epoch": 0.5339174299747776, + "grad_norm": 0.3886019426889275, + "learning_rate": 4.911500380023728e-06, + "loss": 0.632, + "step": 2011 + }, + { + "epoch": 0.5341829284481614, + "grad_norm": 0.39223741962400865, + "learning_rate": 4.911408285614286e-06, + "loss": 0.6526, + "step": 2012 + }, + { + "epoch": 0.5344484269215451, + "grad_norm": 0.41792845694869524, + "learning_rate": 4.911316144176477e-06, + "loss": 0.6127, + "step": 2013 + }, + { + "epoch": 0.534713925394929, + "grad_norm": 0.37886528393274843, + "learning_rate": 4.911223955712099e-06, + "loss": 0.5881, + "step": 2014 + }, + { + "epoch": 0.5349794238683128, + "grad_norm": 0.3922125535634023, + "learning_rate": 4.91113172022295e-06, + "loss": 0.6373, + "step": 2015 + }, + { + "epoch": 0.5352449223416965, + "grad_norm": 0.4099652113635852, + "learning_rate": 4.91103943771083e-06, + "loss": 0.6486, + "step": 2016 + }, + { + "epoch": 0.5355104208150803, + "grad_norm": 0.3925792103145467, + "learning_rate": 4.910947108177537e-06, + "loss": 0.6365, + "step": 2017 + }, + { + "epoch": 0.5357759192884641, + "grad_norm": 0.3914181019984941, + "learning_rate": 4.910854731624873e-06, + "loss": 0.6489, + "step": 2018 + }, + { + "epoch": 0.5360414177618479, + "grad_norm": 0.4027266335702039, + "learning_rate": 4.910762308054638e-06, + "loss": 0.5776, + "step": 2019 + }, + { + "epoch": 0.5363069162352316, + "grad_norm": 0.3698376211538782, + "learning_rate": 4.910669837468637e-06, + "loss": 0.6194, + "step": 2020 + }, + { + "epoch": 0.5365724147086154, + "grad_norm": 0.3894126970083076, + "learning_rate": 4.91057731986867e-06, + "loss": 0.6479, + "step": 2021 + }, + { + "epoch": 0.5368379131819992, + "grad_norm": 0.3885370586836856, + "learning_rate": 4.910484755256544e-06, + "loss": 0.6217, + "step": 2022 + }, + { + "epoch": 0.537103411655383, + "grad_norm": 0.38024985391777083, + "learning_rate": 4.910392143634064e-06, + "loss": 0.6163, + "step": 2023 + }, + { + "epoch": 0.5373689101287668, + "grad_norm": 0.3978649928960978, + "learning_rate": 4.910299485003034e-06, + "loss": 0.5969, + "step": 2024 + }, + { + "epoch": 0.5376344086021505, + "grad_norm": 0.38999713662535207, + "learning_rate": 4.9102067793652645e-06, + "loss": 0.6124, + "step": 2025 + }, + { + "epoch": 0.5378999070755344, + "grad_norm": 0.3771815764071492, + "learning_rate": 4.91011402672256e-06, + "loss": 0.6173, + "step": 2026 + }, + { + "epoch": 0.5381654055489181, + "grad_norm": 0.4011553085079414, + "learning_rate": 4.9100212270767315e-06, + "loss": 0.6015, + "step": 2027 + }, + { + "epoch": 0.5384309040223019, + "grad_norm": 0.3810920171325633, + "learning_rate": 4.909928380429588e-06, + "loss": 0.6359, + "step": 2028 + }, + { + "epoch": 0.5386964024956856, + "grad_norm": 0.3767688501902981, + "learning_rate": 4.9098354867829415e-06, + "loss": 0.6153, + "step": 2029 + }, + { + "epoch": 0.5389619009690694, + "grad_norm": 0.38881017925996, + "learning_rate": 4.909742546138602e-06, + "loss": 0.6388, + "step": 2030 + }, + { + "epoch": 0.5392273994424532, + "grad_norm": 0.38149688393532777, + "learning_rate": 4.909649558498383e-06, + "loss": 0.5973, + "step": 2031 + }, + { + "epoch": 0.539492897915837, + "grad_norm": 0.37580474143004144, + "learning_rate": 4.909556523864098e-06, + "loss": 0.5982, + "step": 2032 + }, + { + "epoch": 0.5397583963892207, + "grad_norm": 0.3882417055617691, + "learning_rate": 4.909463442237561e-06, + "loss": 0.6263, + "step": 2033 + }, + { + "epoch": 0.5400238948626045, + "grad_norm": 0.38715921572958895, + "learning_rate": 4.909370313620587e-06, + "loss": 0.6442, + "step": 2034 + }, + { + "epoch": 0.5402893933359884, + "grad_norm": 0.3839340050720667, + "learning_rate": 4.909277138014994e-06, + "loss": 0.6274, + "step": 2035 + }, + { + "epoch": 0.5405548918093721, + "grad_norm": 0.3865694160339814, + "learning_rate": 4.909183915422596e-06, + "loss": 0.576, + "step": 2036 + }, + { + "epoch": 0.5408203902827559, + "grad_norm": 0.3855764536735899, + "learning_rate": 4.909090645845214e-06, + "loss": 0.6458, + "step": 2037 + }, + { + "epoch": 0.5410858887561396, + "grad_norm": 0.381334965046924, + "learning_rate": 4.9089973292846665e-06, + "loss": 0.5939, + "step": 2038 + }, + { + "epoch": 0.5413513872295235, + "grad_norm": 0.3890453294774196, + "learning_rate": 4.908903965742772e-06, + "loss": 0.6255, + "step": 2039 + }, + { + "epoch": 0.5416168857029072, + "grad_norm": 0.41455624731884266, + "learning_rate": 4.908810555221352e-06, + "loss": 0.6487, + "step": 2040 + }, + { + "epoch": 0.541882384176291, + "grad_norm": 0.3987133890260368, + "learning_rate": 4.90871709772223e-06, + "loss": 0.6385, + "step": 2041 + }, + { + "epoch": 0.5421478826496747, + "grad_norm": 0.3966554965078316, + "learning_rate": 4.9086235932472254e-06, + "loss": 0.5804, + "step": 2042 + }, + { + "epoch": 0.5424133811230586, + "grad_norm": 0.3891784830525446, + "learning_rate": 4.908530041798164e-06, + "loss": 0.6051, + "step": 2043 + }, + { + "epoch": 0.5426788795964423, + "grad_norm": 0.37050107770847845, + "learning_rate": 4.908436443376869e-06, + "loss": 0.6068, + "step": 2044 + }, + { + "epoch": 0.5429443780698261, + "grad_norm": 0.39616810997144736, + "learning_rate": 4.908342797985167e-06, + "loss": 0.6365, + "step": 2045 + }, + { + "epoch": 0.5432098765432098, + "grad_norm": 0.3992223883897869, + "learning_rate": 4.908249105624884e-06, + "loss": 0.6466, + "step": 2046 + }, + { + "epoch": 0.5434753750165937, + "grad_norm": 0.40806638014668023, + "learning_rate": 4.908155366297846e-06, + "loss": 0.6417, + "step": 2047 + }, + { + "epoch": 0.5437408734899775, + "grad_norm": 0.41048772467987144, + "learning_rate": 4.908061580005884e-06, + "loss": 0.6069, + "step": 2048 + }, + { + "epoch": 0.5440063719633612, + "grad_norm": 0.3807003999786117, + "learning_rate": 4.907967746750824e-06, + "loss": 0.6143, + "step": 2049 + }, + { + "epoch": 0.544271870436745, + "grad_norm": 0.3906568534344132, + "learning_rate": 4.907873866534498e-06, + "loss": 0.6267, + "step": 2050 + }, + { + "epoch": 0.5445373689101287, + "grad_norm": 0.38566281253909396, + "learning_rate": 4.907779939358735e-06, + "loss": 0.6033, + "step": 2051 + }, + { + "epoch": 0.5448028673835126, + "grad_norm": 0.3758823380254977, + "learning_rate": 4.907685965225369e-06, + "loss": 0.6175, + "step": 2052 + }, + { + "epoch": 0.5450683658568963, + "grad_norm": 0.4113044262717281, + "learning_rate": 4.907591944136231e-06, + "loss": 0.6416, + "step": 2053 + }, + { + "epoch": 0.5453338643302801, + "grad_norm": 0.3938740571600203, + "learning_rate": 4.907497876093155e-06, + "loss": 0.5973, + "step": 2054 + }, + { + "epoch": 0.5455993628036638, + "grad_norm": 0.38334057621592116, + "learning_rate": 4.9074037610979765e-06, + "loss": 0.6369, + "step": 2055 + }, + { + "epoch": 0.5458648612770477, + "grad_norm": 0.37107462150204695, + "learning_rate": 4.90730959915253e-06, + "loss": 0.6142, + "step": 2056 + }, + { + "epoch": 0.5461303597504314, + "grad_norm": 0.38365278644512046, + "learning_rate": 4.907215390258653e-06, + "loss": 0.6447, + "step": 2057 + }, + { + "epoch": 0.5463958582238152, + "grad_norm": 0.3943919668941426, + "learning_rate": 4.907121134418181e-06, + "loss": 0.5708, + "step": 2058 + }, + { + "epoch": 0.546661356697199, + "grad_norm": 0.4163368103225318, + "learning_rate": 4.907026831632953e-06, + "loss": 0.6066, + "step": 2059 + }, + { + "epoch": 0.5469268551705828, + "grad_norm": 0.38527796730582103, + "learning_rate": 4.906932481904809e-06, + "loss": 0.6412, + "step": 2060 + }, + { + "epoch": 0.5471923536439666, + "grad_norm": 0.37312812386696503, + "learning_rate": 4.906838085235589e-06, + "loss": 0.6116, + "step": 2061 + }, + { + "epoch": 0.5474578521173503, + "grad_norm": 0.3874072289231269, + "learning_rate": 4.9067436416271315e-06, + "loss": 0.6487, + "step": 2062 + }, + { + "epoch": 0.5477233505907341, + "grad_norm": 0.4022103791020934, + "learning_rate": 4.906649151081282e-06, + "loss": 0.6378, + "step": 2063 + }, + { + "epoch": 0.5479888490641179, + "grad_norm": 0.38061999121785595, + "learning_rate": 4.906554613599881e-06, + "loss": 0.6464, + "step": 2064 + }, + { + "epoch": 0.5482543475375017, + "grad_norm": 0.3859123159817981, + "learning_rate": 4.906460029184773e-06, + "loss": 0.6227, + "step": 2065 + }, + { + "epoch": 0.5485198460108854, + "grad_norm": 0.3961735599825678, + "learning_rate": 4.906365397837803e-06, + "loss": 0.6385, + "step": 2066 + }, + { + "epoch": 0.5487853444842692, + "grad_norm": 0.394029028948993, + "learning_rate": 4.906270719560815e-06, + "loss": 0.6126, + "step": 2067 + }, + { + "epoch": 0.5490508429576529, + "grad_norm": 0.3693685697958605, + "learning_rate": 4.906175994355656e-06, + "loss": 0.6442, + "step": 2068 + }, + { + "epoch": 0.5493163414310368, + "grad_norm": 0.39211302516438096, + "learning_rate": 4.906081222224174e-06, + "loss": 0.5986, + "step": 2069 + }, + { + "epoch": 0.5495818399044206, + "grad_norm": 0.3830786351532926, + "learning_rate": 4.905986403168218e-06, + "loss": 0.5719, + "step": 2070 + }, + { + "epoch": 0.5498473383778043, + "grad_norm": 0.3782755714287307, + "learning_rate": 4.9058915371896354e-06, + "loss": 0.623, + "step": 2071 + }, + { + "epoch": 0.5501128368511881, + "grad_norm": 0.3990298814224943, + "learning_rate": 4.9057966242902775e-06, + "loss": 0.6315, + "step": 2072 + }, + { + "epoch": 0.5503783353245719, + "grad_norm": 0.3939806321713868, + "learning_rate": 4.905701664471994e-06, + "loss": 0.6092, + "step": 2073 + }, + { + "epoch": 0.5506438337979557, + "grad_norm": 0.37623921441529895, + "learning_rate": 4.90560665773664e-06, + "loss": 0.6366, + "step": 2074 + }, + { + "epoch": 0.5509093322713394, + "grad_norm": 0.3757539305206352, + "learning_rate": 4.905511604086065e-06, + "loss": 0.6525, + "step": 2075 + }, + { + "epoch": 0.5511748307447232, + "grad_norm": 0.39633723753870215, + "learning_rate": 4.905416503522124e-06, + "loss": 0.6233, + "step": 2076 + }, + { + "epoch": 0.551440329218107, + "grad_norm": 0.388027801508874, + "learning_rate": 4.905321356046672e-06, + "loss": 0.6111, + "step": 2077 + }, + { + "epoch": 0.5517058276914908, + "grad_norm": 0.3899109684092773, + "learning_rate": 4.905226161661564e-06, + "loss": 0.6576, + "step": 2078 + }, + { + "epoch": 0.5519713261648745, + "grad_norm": 0.40443956482179805, + "learning_rate": 4.905130920368657e-06, + "loss": 0.6243, + "step": 2079 + }, + { + "epoch": 0.5522368246382583, + "grad_norm": 0.38594194174105484, + "learning_rate": 4.905035632169808e-06, + "loss": 0.6381, + "step": 2080 + }, + { + "epoch": 0.5525023231116422, + "grad_norm": 0.3889138022398403, + "learning_rate": 4.904940297066876e-06, + "loss": 0.62, + "step": 2081 + }, + { + "epoch": 0.5527678215850259, + "grad_norm": 0.3705549552623768, + "learning_rate": 4.9048449150617195e-06, + "loss": 0.631, + "step": 2082 + }, + { + "epoch": 0.5530333200584097, + "grad_norm": 0.36827424623469257, + "learning_rate": 4.9047494861562e-06, + "loss": 0.5987, + "step": 2083 + }, + { + "epoch": 0.5532988185317934, + "grad_norm": 0.3741083791691221, + "learning_rate": 4.9046540103521765e-06, + "loss": 0.6235, + "step": 2084 + }, + { + "epoch": 0.5535643170051773, + "grad_norm": 0.386302888384375, + "learning_rate": 4.904558487651513e-06, + "loss": 0.5965, + "step": 2085 + }, + { + "epoch": 0.553829815478561, + "grad_norm": 0.3910143789167576, + "learning_rate": 4.904462918056071e-06, + "loss": 0.6102, + "step": 2086 + }, + { + "epoch": 0.5540953139519448, + "grad_norm": 0.3806114225783564, + "learning_rate": 4.904367301567715e-06, + "loss": 0.6444, + "step": 2087 + }, + { + "epoch": 0.5543608124253285, + "grad_norm": 0.3758951461070809, + "learning_rate": 4.90427163818831e-06, + "loss": 0.6482, + "step": 2088 + }, + { + "epoch": 0.5546263108987123, + "grad_norm": 0.38715614700989387, + "learning_rate": 4.904175927919722e-06, + "loss": 0.6361, + "step": 2089 + }, + { + "epoch": 0.5548918093720961, + "grad_norm": 0.38367039724574203, + "learning_rate": 4.904080170763816e-06, + "loss": 0.634, + "step": 2090 + }, + { + "epoch": 0.5551573078454799, + "grad_norm": 0.4168224051384316, + "learning_rate": 4.903984366722461e-06, + "loss": 0.6188, + "step": 2091 + }, + { + "epoch": 0.5554228063188636, + "grad_norm": 0.3817421936850519, + "learning_rate": 4.903888515797524e-06, + "loss": 0.6048, + "step": 2092 + }, + { + "epoch": 0.5556883047922474, + "grad_norm": 0.3670280478052143, + "learning_rate": 4.903792617990876e-06, + "loss": 0.6051, + "step": 2093 + }, + { + "epoch": 0.5559538032656313, + "grad_norm": 0.3938453401105219, + "learning_rate": 4.903696673304387e-06, + "loss": 0.6186, + "step": 2094 + }, + { + "epoch": 0.556219301739015, + "grad_norm": 0.39360141206843813, + "learning_rate": 4.903600681739926e-06, + "loss": 0.6329, + "step": 2095 + }, + { + "epoch": 0.5564848002123988, + "grad_norm": 0.3954338510193279, + "learning_rate": 4.903504643299368e-06, + "loss": 0.6368, + "step": 2096 + }, + { + "epoch": 0.5567502986857825, + "grad_norm": 0.3799079005061654, + "learning_rate": 4.903408557984585e-06, + "loss": 0.6386, + "step": 2097 + }, + { + "epoch": 0.5570157971591664, + "grad_norm": 0.37192170883761927, + "learning_rate": 4.90331242579745e-06, + "loss": 0.6183, + "step": 2098 + }, + { + "epoch": 0.5572812956325501, + "grad_norm": 0.38815571622690004, + "learning_rate": 4.903216246739837e-06, + "loss": 0.6415, + "step": 2099 + }, + { + "epoch": 0.5575467941059339, + "grad_norm": 0.38535358872207337, + "learning_rate": 4.903120020813625e-06, + "loss": 0.6452, + "step": 2100 + }, + { + "epoch": 0.5578122925793176, + "grad_norm": 0.39592321936836095, + "learning_rate": 4.903023748020688e-06, + "loss": 0.6255, + "step": 2101 + }, + { + "epoch": 0.5580777910527015, + "grad_norm": 0.3863115053098059, + "learning_rate": 4.9029274283629046e-06, + "loss": 0.6427, + "step": 2102 + }, + { + "epoch": 0.5583432895260853, + "grad_norm": 0.38221772361703116, + "learning_rate": 4.902831061842153e-06, + "loss": 0.6506, + "step": 2103 + }, + { + "epoch": 0.558608787999469, + "grad_norm": 0.3973981667806005, + "learning_rate": 4.9027346484603116e-06, + "loss": 0.6004, + "step": 2104 + }, + { + "epoch": 0.5588742864728528, + "grad_norm": 0.3907938965791774, + "learning_rate": 4.902638188219262e-06, + "loss": 0.6335, + "step": 2105 + }, + { + "epoch": 0.5591397849462365, + "grad_norm": 0.3860874537813031, + "learning_rate": 4.902541681120886e-06, + "loss": 0.6203, + "step": 2106 + }, + { + "epoch": 0.5594052834196204, + "grad_norm": 0.39445636651628896, + "learning_rate": 4.9024451271670635e-06, + "loss": 0.5878, + "step": 2107 + }, + { + "epoch": 0.5596707818930041, + "grad_norm": 0.3991038587018792, + "learning_rate": 4.9023485263596795e-06, + "loss": 0.6103, + "step": 2108 + }, + { + "epoch": 0.5599362803663879, + "grad_norm": 0.38404543983183625, + "learning_rate": 4.902251878700618e-06, + "loss": 0.6428, + "step": 2109 + }, + { + "epoch": 0.5602017788397716, + "grad_norm": 0.3758713989457986, + "learning_rate": 4.902155184191762e-06, + "loss": 0.6637, + "step": 2110 + }, + { + "epoch": 0.5604672773131555, + "grad_norm": 0.39684714170555097, + "learning_rate": 4.902058442834999e-06, + "loss": 0.6469, + "step": 2111 + }, + { + "epoch": 0.5607327757865392, + "grad_norm": 0.3955355601685318, + "learning_rate": 4.901961654632214e-06, + "loss": 0.6513, + "step": 2112 + }, + { + "epoch": 0.560998274259923, + "grad_norm": 0.38149509209917476, + "learning_rate": 4.901864819585297e-06, + "loss": 0.6326, + "step": 2113 + }, + { + "epoch": 0.5612637727333067, + "grad_norm": 0.38726352688714627, + "learning_rate": 4.901767937696135e-06, + "loss": 0.6567, + "step": 2114 + }, + { + "epoch": 0.5615292712066906, + "grad_norm": 0.3800742017223432, + "learning_rate": 4.901671008966618e-06, + "loss": 0.6582, + "step": 2115 + }, + { + "epoch": 0.5617947696800744, + "grad_norm": 0.3746328300006796, + "learning_rate": 4.901574033398635e-06, + "loss": 0.6445, + "step": 2116 + }, + { + "epoch": 0.5620602681534581, + "grad_norm": 0.38260282871000356, + "learning_rate": 4.901477010994079e-06, + "loss": 0.5919, + "step": 2117 + }, + { + "epoch": 0.5623257666268419, + "grad_norm": 0.3740965623998434, + "learning_rate": 4.901379941754841e-06, + "loss": 0.6254, + "step": 2118 + }, + { + "epoch": 0.5625912651002257, + "grad_norm": 0.3761963427338876, + "learning_rate": 4.901282825682815e-06, + "loss": 0.6224, + "step": 2119 + }, + { + "epoch": 0.5628567635736095, + "grad_norm": 0.38083963318217334, + "learning_rate": 4.901185662779895e-06, + "loss": 0.612, + "step": 2120 + }, + { + "epoch": 0.5631222620469932, + "grad_norm": 0.392352340225909, + "learning_rate": 4.901088453047975e-06, + "loss": 0.6406, + "step": 2121 + }, + { + "epoch": 0.563387760520377, + "grad_norm": 0.37933525053159506, + "learning_rate": 4.900991196488951e-06, + "loss": 0.5823, + "step": 2122 + }, + { + "epoch": 0.5636532589937607, + "grad_norm": 0.38008886707506945, + "learning_rate": 4.90089389310472e-06, + "loss": 0.6119, + "step": 2123 + }, + { + "epoch": 0.5639187574671446, + "grad_norm": 0.38651929782239175, + "learning_rate": 4.90079654289718e-06, + "loss": 0.644, + "step": 2124 + }, + { + "epoch": 0.5641842559405283, + "grad_norm": 0.39176422515446785, + "learning_rate": 4.900699145868228e-06, + "loss": 0.6371, + "step": 2125 + }, + { + "epoch": 0.5644497544139121, + "grad_norm": 0.393499491297263, + "learning_rate": 4.900601702019767e-06, + "loss": 0.587, + "step": 2126 + }, + { + "epoch": 0.5647152528872958, + "grad_norm": 0.3877468380146798, + "learning_rate": 4.900504211353694e-06, + "loss": 0.5753, + "step": 2127 + }, + { + "epoch": 0.5649807513606797, + "grad_norm": 0.39194630719423856, + "learning_rate": 4.900406673871912e-06, + "loss": 0.6076, + "step": 2128 + }, + { + "epoch": 0.5652462498340635, + "grad_norm": 0.3865805070743415, + "learning_rate": 4.900309089576321e-06, + "loss": 0.6458, + "step": 2129 + }, + { + "epoch": 0.5655117483074472, + "grad_norm": 0.413154720498655, + "learning_rate": 4.900211458468827e-06, + "loss": 0.6264, + "step": 2130 + }, + { + "epoch": 0.565777246780831, + "grad_norm": 0.37762177320125556, + "learning_rate": 4.900113780551332e-06, + "loss": 0.6636, + "step": 2131 + }, + { + "epoch": 0.5660427452542148, + "grad_norm": 0.39938122548014515, + "learning_rate": 4.900016055825743e-06, + "loss": 0.5983, + "step": 2132 + }, + { + "epoch": 0.5663082437275986, + "grad_norm": 0.3825194381566569, + "learning_rate": 4.899918284293964e-06, + "loss": 0.6577, + "step": 2133 + }, + { + "epoch": 0.5665737422009823, + "grad_norm": 0.41054618584625147, + "learning_rate": 4.899820465957903e-06, + "loss": 0.6314, + "step": 2134 + }, + { + "epoch": 0.5668392406743661, + "grad_norm": 0.37313881058907006, + "learning_rate": 4.899722600819467e-06, + "loss": 0.6299, + "step": 2135 + }, + { + "epoch": 0.56710473914775, + "grad_norm": 0.40422916207703785, + "learning_rate": 4.899624688880564e-06, + "loss": 0.6348, + "step": 2136 + }, + { + "epoch": 0.5673702376211337, + "grad_norm": 0.374248197860821, + "learning_rate": 4.899526730143104e-06, + "loss": 0.6403, + "step": 2137 + }, + { + "epoch": 0.5676357360945175, + "grad_norm": 0.37025737340891063, + "learning_rate": 4.8994287246089985e-06, + "loss": 0.6463, + "step": 2138 + }, + { + "epoch": 0.5679012345679012, + "grad_norm": 0.3769084840508759, + "learning_rate": 4.899330672280158e-06, + "loss": 0.6328, + "step": 2139 + }, + { + "epoch": 0.5681667330412851, + "grad_norm": 0.3858461436635138, + "learning_rate": 4.899232573158495e-06, + "loss": 0.6214, + "step": 2140 + }, + { + "epoch": 0.5684322315146688, + "grad_norm": 0.3824468158039468, + "learning_rate": 4.899134427245922e-06, + "loss": 0.5963, + "step": 2141 + }, + { + "epoch": 0.5686977299880526, + "grad_norm": 0.3880901580951896, + "learning_rate": 4.899036234544354e-06, + "loss": 0.6617, + "step": 2142 + }, + { + "epoch": 0.5689632284614363, + "grad_norm": 0.4064318711168065, + "learning_rate": 4.898937995055706e-06, + "loss": 0.6264, + "step": 2143 + }, + { + "epoch": 0.5692287269348201, + "grad_norm": 0.3923192035373891, + "learning_rate": 4.898839708781893e-06, + "loss": 0.6457, + "step": 2144 + }, + { + "epoch": 0.5694942254082039, + "grad_norm": 0.3888934408786419, + "learning_rate": 4.8987413757248316e-06, + "loss": 0.609, + "step": 2145 + }, + { + "epoch": 0.5697597238815877, + "grad_norm": 0.3908300467173408, + "learning_rate": 4.898642995886441e-06, + "loss": 0.6109, + "step": 2146 + }, + { + "epoch": 0.5700252223549714, + "grad_norm": 0.3863165067932937, + "learning_rate": 4.898544569268639e-06, + "loss": 0.637, + "step": 2147 + }, + { + "epoch": 0.5702907208283552, + "grad_norm": 0.3935153573179178, + "learning_rate": 4.898446095873345e-06, + "loss": 0.6266, + "step": 2148 + }, + { + "epoch": 0.570556219301739, + "grad_norm": 0.37662569809214685, + "learning_rate": 4.8983475757024805e-06, + "loss": 0.6219, + "step": 2149 + }, + { + "epoch": 0.5708217177751228, + "grad_norm": 0.38826483250184635, + "learning_rate": 4.898249008757965e-06, + "loss": 0.6317, + "step": 2150 + }, + { + "epoch": 0.5710872162485066, + "grad_norm": 0.4313419777251204, + "learning_rate": 4.898150395041723e-06, + "loss": 0.6306, + "step": 2151 + }, + { + "epoch": 0.5713527147218903, + "grad_norm": 0.3847667107954513, + "learning_rate": 4.898051734555676e-06, + "loss": 0.6373, + "step": 2152 + }, + { + "epoch": 0.5716182131952742, + "grad_norm": 0.4021383500058913, + "learning_rate": 4.897953027301748e-06, + "loss": 0.644, + "step": 2153 + }, + { + "epoch": 0.5718837116686579, + "grad_norm": 0.4219288427264436, + "learning_rate": 4.897854273281866e-06, + "loss": 0.6115, + "step": 2154 + }, + { + "epoch": 0.5721492101420417, + "grad_norm": 0.38071541982270457, + "learning_rate": 4.897755472497954e-06, + "loss": 0.5962, + "step": 2155 + }, + { + "epoch": 0.5724147086154254, + "grad_norm": 0.4540849896264911, + "learning_rate": 4.89765662495194e-06, + "loss": 0.6148, + "step": 2156 + }, + { + "epoch": 0.5726802070888093, + "grad_norm": 0.3842505208160628, + "learning_rate": 4.897557730645751e-06, + "loss": 0.6364, + "step": 2157 + }, + { + "epoch": 0.572945705562193, + "grad_norm": 0.3844551975074067, + "learning_rate": 4.897458789581317e-06, + "loss": 0.6173, + "step": 2158 + }, + { + "epoch": 0.5732112040355768, + "grad_norm": 0.38431361074353193, + "learning_rate": 4.897359801760565e-06, + "loss": 0.6352, + "step": 2159 + }, + { + "epoch": 0.5734767025089605, + "grad_norm": 0.43270885048710706, + "learning_rate": 4.8972607671854276e-06, + "loss": 0.6047, + "step": 2160 + }, + { + "epoch": 0.5737422009823443, + "grad_norm": 0.3846691149620548, + "learning_rate": 4.897161685857836e-06, + "loss": 0.6176, + "step": 2161 + }, + { + "epoch": 0.5740076994557282, + "grad_norm": 0.4007762887021661, + "learning_rate": 4.897062557779722e-06, + "loss": 0.6244, + "step": 2162 + }, + { + "epoch": 0.5742731979291119, + "grad_norm": 0.3961438131633055, + "learning_rate": 4.89696338295302e-06, + "loss": 0.6412, + "step": 2163 + }, + { + "epoch": 0.5745386964024957, + "grad_norm": 0.390097268979398, + "learning_rate": 4.8968641613796634e-06, + "loss": 0.6479, + "step": 2164 + }, + { + "epoch": 0.5748041948758794, + "grad_norm": 0.38810299744104254, + "learning_rate": 4.896764893061586e-06, + "loss": 0.6425, + "step": 2165 + }, + { + "epoch": 0.5750696933492633, + "grad_norm": 0.3959714972015471, + "learning_rate": 4.8966655780007246e-06, + "loss": 0.641, + "step": 2166 + }, + { + "epoch": 0.575335191822647, + "grad_norm": 0.39502538544557075, + "learning_rate": 4.896566216199017e-06, + "loss": 0.6516, + "step": 2167 + }, + { + "epoch": 0.5756006902960308, + "grad_norm": 0.3893632869954923, + "learning_rate": 4.896466807658401e-06, + "loss": 0.6515, + "step": 2168 + }, + { + "epoch": 0.5758661887694145, + "grad_norm": 0.3749742772574467, + "learning_rate": 4.8963673523808146e-06, + "loss": 0.5914, + "step": 2169 + }, + { + "epoch": 0.5761316872427984, + "grad_norm": 0.3693946699712746, + "learning_rate": 4.896267850368196e-06, + "loss": 0.6232, + "step": 2170 + }, + { + "epoch": 0.5763971857161821, + "grad_norm": 0.405383443756792, + "learning_rate": 4.896168301622489e-06, + "loss": 0.6182, + "step": 2171 + }, + { + "epoch": 0.5766626841895659, + "grad_norm": 0.3732202033074253, + "learning_rate": 4.896068706145632e-06, + "loss": 0.5644, + "step": 2172 + }, + { + "epoch": 0.5769281826629497, + "grad_norm": 0.3793821430485284, + "learning_rate": 4.89596906393957e-06, + "loss": 0.6167, + "step": 2173 + }, + { + "epoch": 0.5771936811363335, + "grad_norm": 0.4111046814110193, + "learning_rate": 4.895869375006244e-06, + "loss": 0.6211, + "step": 2174 + }, + { + "epoch": 0.5774591796097173, + "grad_norm": 0.4054227692967559, + "learning_rate": 4.895769639347598e-06, + "loss": 0.6486, + "step": 2175 + }, + { + "epoch": 0.577724678083101, + "grad_norm": 0.37646422034201293, + "learning_rate": 4.895669856965581e-06, + "loss": 0.6548, + "step": 2176 + }, + { + "epoch": 0.5779901765564848, + "grad_norm": 0.41723005095630505, + "learning_rate": 4.895570027862133e-06, + "loss": 0.6085, + "step": 2177 + }, + { + "epoch": 0.5782556750298686, + "grad_norm": 0.40438159811930047, + "learning_rate": 4.895470152039206e-06, + "loss": 0.6463, + "step": 2178 + }, + { + "epoch": 0.5785211735032524, + "grad_norm": 0.3897077639387259, + "learning_rate": 4.895370229498746e-06, + "loss": 0.6097, + "step": 2179 + }, + { + "epoch": 0.5787866719766361, + "grad_norm": 0.3800236733526297, + "learning_rate": 4.895270260242701e-06, + "loss": 0.6326, + "step": 2180 + }, + { + "epoch": 0.5790521704500199, + "grad_norm": 0.3874467382028264, + "learning_rate": 4.895170244273022e-06, + "loss": 0.5995, + "step": 2181 + }, + { + "epoch": 0.5793176689234036, + "grad_norm": 0.38017708406839484, + "learning_rate": 4.895070181591658e-06, + "loss": 0.6276, + "step": 2182 + }, + { + "epoch": 0.5795831673967875, + "grad_norm": 0.38278814329780914, + "learning_rate": 4.894970072200561e-06, + "loss": 0.6501, + "step": 2183 + }, + { + "epoch": 0.5798486658701713, + "grad_norm": 0.3958442998239954, + "learning_rate": 4.894869916101685e-06, + "loss": 0.6083, + "step": 2184 + }, + { + "epoch": 0.580114164343555, + "grad_norm": 0.38684358144848, + "learning_rate": 4.894769713296981e-06, + "loss": 0.6409, + "step": 2185 + }, + { + "epoch": 0.5803796628169388, + "grad_norm": 0.388722478341223, + "learning_rate": 4.8946694637884045e-06, + "loss": 0.6105, + "step": 2186 + }, + { + "epoch": 0.5806451612903226, + "grad_norm": 0.3754123202087565, + "learning_rate": 4.8945691675779104e-06, + "loss": 0.6206, + "step": 2187 + }, + { + "epoch": 0.5809106597637064, + "grad_norm": 0.3926535967431366, + "learning_rate": 4.894468824667454e-06, + "loss": 0.6424, + "step": 2188 + }, + { + "epoch": 0.5811761582370901, + "grad_norm": 0.37486026021945246, + "learning_rate": 4.894368435058993e-06, + "loss": 0.6491, + "step": 2189 + }, + { + "epoch": 0.5814416567104739, + "grad_norm": 0.3863608593671605, + "learning_rate": 4.894267998754486e-06, + "loss": 0.633, + "step": 2190 + }, + { + "epoch": 0.5817071551838577, + "grad_norm": 0.3884998634533141, + "learning_rate": 4.894167515755891e-06, + "loss": 0.6061, + "step": 2191 + }, + { + "epoch": 0.5819726536572415, + "grad_norm": 0.38409327701132057, + "learning_rate": 4.894066986065167e-06, + "loss": 0.6324, + "step": 2192 + }, + { + "epoch": 0.5822381521306252, + "grad_norm": 0.3714906413935847, + "learning_rate": 4.893966409684274e-06, + "loss": 0.6231, + "step": 2193 + }, + { + "epoch": 0.582503650604009, + "grad_norm": 0.39370885532732425, + "learning_rate": 4.893865786615176e-06, + "loss": 0.6145, + "step": 2194 + }, + { + "epoch": 0.5827691490773929, + "grad_norm": 0.38889061772731237, + "learning_rate": 4.893765116859833e-06, + "loss": 0.5917, + "step": 2195 + }, + { + "epoch": 0.5830346475507766, + "grad_norm": 0.37937530144908327, + "learning_rate": 4.89366440042021e-06, + "loss": 0.6295, + "step": 2196 + }, + { + "epoch": 0.5833001460241604, + "grad_norm": 0.37975776473212, + "learning_rate": 4.89356363729827e-06, + "loss": 0.6051, + "step": 2197 + }, + { + "epoch": 0.5835656444975441, + "grad_norm": 0.39773401945617287, + "learning_rate": 4.893462827495979e-06, + "loss": 0.6453, + "step": 2198 + }, + { + "epoch": 0.5838311429709279, + "grad_norm": 0.3764356020152879, + "learning_rate": 4.893361971015302e-06, + "loss": 0.6339, + "step": 2199 + }, + { + "epoch": 0.5840966414443117, + "grad_norm": 0.3968692273333052, + "learning_rate": 4.8932610678582076e-06, + "loss": 0.6714, + "step": 2200 + }, + { + "epoch": 0.5843621399176955, + "grad_norm": 0.3703178505898537, + "learning_rate": 4.893160118026662e-06, + "loss": 0.6337, + "step": 2201 + }, + { + "epoch": 0.5846276383910792, + "grad_norm": 0.3930140117563288, + "learning_rate": 4.8930591215226355e-06, + "loss": 0.622, + "step": 2202 + }, + { + "epoch": 0.584893136864463, + "grad_norm": 0.3816700564419683, + "learning_rate": 4.892958078348096e-06, + "loss": 0.6051, + "step": 2203 + }, + { + "epoch": 0.5851586353378468, + "grad_norm": 0.3870884558521837, + "learning_rate": 4.892856988505015e-06, + "loss": 0.6211, + "step": 2204 + }, + { + "epoch": 0.5854241338112306, + "grad_norm": 0.41377118989595385, + "learning_rate": 4.892755851995364e-06, + "loss": 0.6265, + "step": 2205 + }, + { + "epoch": 0.5856896322846143, + "grad_norm": 0.39182707672478273, + "learning_rate": 4.8926546688211155e-06, + "loss": 0.6161, + "step": 2206 + }, + { + "epoch": 0.5859551307579981, + "grad_norm": 0.38699158023072017, + "learning_rate": 4.892553438984243e-06, + "loss": 0.6379, + "step": 2207 + }, + { + "epoch": 0.586220629231382, + "grad_norm": 0.38068111580536557, + "learning_rate": 4.8924521624867204e-06, + "loss": 0.6017, + "step": 2208 + }, + { + "epoch": 0.5864861277047657, + "grad_norm": 0.3752111789320971, + "learning_rate": 4.8923508393305224e-06, + "loss": 0.6447, + "step": 2209 + }, + { + "epoch": 0.5867516261781495, + "grad_norm": 0.3851509876776998, + "learning_rate": 4.892249469517626e-06, + "loss": 0.6231, + "step": 2210 + }, + { + "epoch": 0.5870171246515332, + "grad_norm": 0.3948533170299444, + "learning_rate": 4.892148053050008e-06, + "loss": 0.6378, + "step": 2211 + }, + { + "epoch": 0.5872826231249171, + "grad_norm": 0.398749040086564, + "learning_rate": 4.892046589929645e-06, + "loss": 0.599, + "step": 2212 + }, + { + "epoch": 0.5875481215983008, + "grad_norm": 0.38272011041291865, + "learning_rate": 4.891945080158518e-06, + "loss": 0.6075, + "step": 2213 + }, + { + "epoch": 0.5878136200716846, + "grad_norm": 0.3949612619072957, + "learning_rate": 4.891843523738605e-06, + "loss": 0.6475, + "step": 2214 + }, + { + "epoch": 0.5880791185450683, + "grad_norm": 0.38584000416633896, + "learning_rate": 4.8917419206718866e-06, + "loss": 0.595, + "step": 2215 + }, + { + "epoch": 0.5883446170184522, + "grad_norm": 0.3859712641239847, + "learning_rate": 4.891640270960345e-06, + "loss": 0.6273, + "step": 2216 + }, + { + "epoch": 0.588610115491836, + "grad_norm": 0.39724845837803974, + "learning_rate": 4.891538574605962e-06, + "loss": 0.6298, + "step": 2217 + }, + { + "epoch": 0.5888756139652197, + "grad_norm": 0.3978337776338302, + "learning_rate": 4.8914368316107215e-06, + "loss": 0.6375, + "step": 2218 + }, + { + "epoch": 0.5891411124386035, + "grad_norm": 0.37927679100241796, + "learning_rate": 4.891335041976608e-06, + "loss": 0.6444, + "step": 2219 + }, + { + "epoch": 0.5894066109119872, + "grad_norm": 0.3850657177831566, + "learning_rate": 4.891233205705606e-06, + "loss": 0.6335, + "step": 2220 + }, + { + "epoch": 0.5896721093853711, + "grad_norm": 0.39558329617006666, + "learning_rate": 4.891131322799701e-06, + "loss": 0.6047, + "step": 2221 + }, + { + "epoch": 0.5899376078587548, + "grad_norm": 0.4014748284874707, + "learning_rate": 4.891029393260882e-06, + "loss": 0.6587, + "step": 2222 + }, + { + "epoch": 0.5902031063321386, + "grad_norm": 0.38448678396113367, + "learning_rate": 4.890927417091135e-06, + "loss": 0.6375, + "step": 2223 + }, + { + "epoch": 0.5904686048055223, + "grad_norm": 0.3918819994005177, + "learning_rate": 4.890825394292449e-06, + "loss": 0.6437, + "step": 2224 + }, + { + "epoch": 0.5907341032789062, + "grad_norm": 0.379902795113804, + "learning_rate": 4.8907233248668145e-06, + "loss": 0.5903, + "step": 2225 + }, + { + "epoch": 0.5909996017522899, + "grad_norm": 0.3878970032464588, + "learning_rate": 4.890621208816222e-06, + "loss": 0.6575, + "step": 2226 + }, + { + "epoch": 0.5912651002256737, + "grad_norm": 0.39848748754990604, + "learning_rate": 4.890519046142662e-06, + "loss": 0.5976, + "step": 2227 + }, + { + "epoch": 0.5915305986990574, + "grad_norm": 0.3774780289070293, + "learning_rate": 4.890416836848128e-06, + "loss": 0.5948, + "step": 2228 + }, + { + "epoch": 0.5917960971724413, + "grad_norm": 0.3853094810340306, + "learning_rate": 4.890314580934612e-06, + "loss": 0.6249, + "step": 2229 + }, + { + "epoch": 0.5920615956458251, + "grad_norm": 0.3764111814497864, + "learning_rate": 4.890212278404111e-06, + "loss": 0.6345, + "step": 2230 + }, + { + "epoch": 0.5923270941192088, + "grad_norm": 0.41719540411039174, + "learning_rate": 4.890109929258616e-06, + "loss": 0.6243, + "step": 2231 + }, + { + "epoch": 0.5925925925925926, + "grad_norm": 0.38991606571766907, + "learning_rate": 4.890007533500127e-06, + "loss": 0.6322, + "step": 2232 + }, + { + "epoch": 0.5928580910659764, + "grad_norm": 0.386810740284029, + "learning_rate": 4.889905091130639e-06, + "loss": 0.6245, + "step": 2233 + }, + { + "epoch": 0.5931235895393602, + "grad_norm": 0.3957528074562135, + "learning_rate": 4.889802602152151e-06, + "loss": 0.5907, + "step": 2234 + }, + { + "epoch": 0.5933890880127439, + "grad_norm": 0.4026387439350841, + "learning_rate": 4.889700066566659e-06, + "loss": 0.6793, + "step": 2235 + }, + { + "epoch": 0.5936545864861277, + "grad_norm": 0.38501982758376346, + "learning_rate": 4.889597484376166e-06, + "loss": 0.6314, + "step": 2236 + }, + { + "epoch": 0.5939200849595114, + "grad_norm": 0.4095472745716219, + "learning_rate": 4.8894948555826714e-06, + "loss": 0.6493, + "step": 2237 + }, + { + "epoch": 0.5941855834328953, + "grad_norm": 0.37240788797093644, + "learning_rate": 4.889392180188177e-06, + "loss": 0.6136, + "step": 2238 + }, + { + "epoch": 0.594451081906279, + "grad_norm": 0.3729662364141783, + "learning_rate": 4.889289458194684e-06, + "loss": 0.6159, + "step": 2239 + }, + { + "epoch": 0.5947165803796628, + "grad_norm": 0.36713331053545234, + "learning_rate": 4.889186689604197e-06, + "loss": 0.591, + "step": 2240 + }, + { + "epoch": 0.5949820788530465, + "grad_norm": 0.3893915458181373, + "learning_rate": 4.88908387441872e-06, + "loss": 0.6591, + "step": 2241 + }, + { + "epoch": 0.5952475773264304, + "grad_norm": 0.3718382314454185, + "learning_rate": 4.888981012640257e-06, + "loss": 0.6121, + "step": 2242 + }, + { + "epoch": 0.5955130757998142, + "grad_norm": 0.3867013733578012, + "learning_rate": 4.888878104270816e-06, + "loss": 0.6369, + "step": 2243 + }, + { + "epoch": 0.5957785742731979, + "grad_norm": 0.3803969294872666, + "learning_rate": 4.888775149312403e-06, + "loss": 0.6076, + "step": 2244 + }, + { + "epoch": 0.5960440727465817, + "grad_norm": 0.394707896190095, + "learning_rate": 4.888672147767025e-06, + "loss": 0.6186, + "step": 2245 + }, + { + "epoch": 0.5963095712199655, + "grad_norm": 0.3851378577706324, + "learning_rate": 4.888569099636692e-06, + "loss": 0.6166, + "step": 2246 + }, + { + "epoch": 0.5965750696933493, + "grad_norm": 0.3969387739544168, + "learning_rate": 4.888466004923413e-06, + "loss": 0.6316, + "step": 2247 + }, + { + "epoch": 0.596840568166733, + "grad_norm": 0.3847333963298914, + "learning_rate": 4.8883628636292e-06, + "loss": 0.6065, + "step": 2248 + }, + { + "epoch": 0.5971060666401168, + "grad_norm": 0.41493574810457373, + "learning_rate": 4.888259675756063e-06, + "loss": 0.6137, + "step": 2249 + }, + { + "epoch": 0.5973715651135006, + "grad_norm": 0.3994941140972394, + "learning_rate": 4.888156441306014e-06, + "loss": 0.6411, + "step": 2250 + }, + { + "epoch": 0.5976370635868844, + "grad_norm": 0.40092443510912823, + "learning_rate": 4.888053160281069e-06, + "loss": 0.6552, + "step": 2251 + }, + { + "epoch": 0.5979025620602682, + "grad_norm": 0.3882861571211597, + "learning_rate": 4.8879498326832385e-06, + "loss": 0.6154, + "step": 2252 + }, + { + "epoch": 0.5981680605336519, + "grad_norm": 0.3713948020326159, + "learning_rate": 4.887846458514541e-06, + "loss": 0.5616, + "step": 2253 + }, + { + "epoch": 0.5984335590070358, + "grad_norm": 0.38384861987675445, + "learning_rate": 4.88774303777699e-06, + "loss": 0.633, + "step": 2254 + }, + { + "epoch": 0.5986990574804195, + "grad_norm": 0.3839817470691487, + "learning_rate": 4.887639570472604e-06, + "loss": 0.5817, + "step": 2255 + }, + { + "epoch": 0.5989645559538033, + "grad_norm": 0.39361941153049085, + "learning_rate": 4.887536056603401e-06, + "loss": 0.5861, + "step": 2256 + }, + { + "epoch": 0.599230054427187, + "grad_norm": 0.371897654657084, + "learning_rate": 4.887432496171398e-06, + "loss": 0.5875, + "step": 2257 + }, + { + "epoch": 0.5994955529005708, + "grad_norm": 0.3888488630457417, + "learning_rate": 4.887328889178617e-06, + "loss": 0.6195, + "step": 2258 + }, + { + "epoch": 0.5997610513739546, + "grad_norm": 0.3828262064183697, + "learning_rate": 4.887225235627077e-06, + "loss": 0.6017, + "step": 2259 + }, + { + "epoch": 0.6000265498473384, + "grad_norm": 0.38484680256981363, + "learning_rate": 4.887121535518799e-06, + "loss": 0.6761, + "step": 2260 + }, + { + "epoch": 0.6002920483207221, + "grad_norm": 0.3840856986296762, + "learning_rate": 4.887017788855808e-06, + "loss": 0.6035, + "step": 2261 + }, + { + "epoch": 0.6005575467941059, + "grad_norm": 0.3849416262305795, + "learning_rate": 4.8869139956401255e-06, + "loss": 0.632, + "step": 2262 + }, + { + "epoch": 0.6008230452674898, + "grad_norm": 0.385096199829097, + "learning_rate": 4.8868101558737755e-06, + "loss": 0.6467, + "step": 2263 + }, + { + "epoch": 0.6010885437408735, + "grad_norm": 0.38411824941438083, + "learning_rate": 4.886706269558783e-06, + "loss": 0.5553, + "step": 2264 + }, + { + "epoch": 0.6013540422142573, + "grad_norm": 0.3801440224246112, + "learning_rate": 4.886602336697175e-06, + "loss": 0.63, + "step": 2265 + }, + { + "epoch": 0.601619540687641, + "grad_norm": 0.38733630432591415, + "learning_rate": 4.886498357290978e-06, + "loss": 0.6047, + "step": 2266 + }, + { + "epoch": 0.6018850391610249, + "grad_norm": 0.41059978073098796, + "learning_rate": 4.88639433134222e-06, + "loss": 0.6292, + "step": 2267 + }, + { + "epoch": 0.6021505376344086, + "grad_norm": 0.3855283486466279, + "learning_rate": 4.88629025885293e-06, + "loss": 0.6129, + "step": 2268 + }, + { + "epoch": 0.6024160361077924, + "grad_norm": 0.41265246113613696, + "learning_rate": 4.886186139825137e-06, + "loss": 0.6255, + "step": 2269 + }, + { + "epoch": 0.6026815345811761, + "grad_norm": 0.40886195111049, + "learning_rate": 4.8860819742608715e-06, + "loss": 0.6148, + "step": 2270 + }, + { + "epoch": 0.60294703305456, + "grad_norm": 0.3764460429812742, + "learning_rate": 4.885977762162166e-06, + "loss": 0.6421, + "step": 2271 + }, + { + "epoch": 0.6032125315279437, + "grad_norm": 0.38148557590078197, + "learning_rate": 4.885873503531052e-06, + "loss": 0.6015, + "step": 2272 + }, + { + "epoch": 0.6034780300013275, + "grad_norm": 0.37988106953348527, + "learning_rate": 4.885769198369562e-06, + "loss": 0.6073, + "step": 2273 + }, + { + "epoch": 0.6037435284747112, + "grad_norm": 0.38697233761955335, + "learning_rate": 4.885664846679733e-06, + "loss": 0.6403, + "step": 2274 + }, + { + "epoch": 0.604009026948095, + "grad_norm": 0.3883099861193893, + "learning_rate": 4.885560448463598e-06, + "loss": 0.5837, + "step": 2275 + }, + { + "epoch": 0.6042745254214789, + "grad_norm": 0.39095583833319403, + "learning_rate": 4.885456003723192e-06, + "loss": 0.6, + "step": 2276 + }, + { + "epoch": 0.6045400238948626, + "grad_norm": 0.38923836797913375, + "learning_rate": 4.885351512460554e-06, + "loss": 0.6394, + "step": 2277 + }, + { + "epoch": 0.6048055223682464, + "grad_norm": 0.392686111275325, + "learning_rate": 4.885246974677722e-06, + "loss": 0.5906, + "step": 2278 + }, + { + "epoch": 0.6050710208416301, + "grad_norm": 0.39871754092907447, + "learning_rate": 4.885142390376734e-06, + "loss": 0.6354, + "step": 2279 + }, + { + "epoch": 0.605336519315014, + "grad_norm": 0.39714503012304886, + "learning_rate": 4.885037759559629e-06, + "loss": 0.6732, + "step": 2280 + }, + { + "epoch": 0.6056020177883977, + "grad_norm": 0.38248809310209814, + "learning_rate": 4.884933082228448e-06, + "loss": 0.6383, + "step": 2281 + }, + { + "epoch": 0.6058675162617815, + "grad_norm": 0.382362656022168, + "learning_rate": 4.884828358385232e-06, + "loss": 0.6416, + "step": 2282 + }, + { + "epoch": 0.6061330147351652, + "grad_norm": 0.40072667467094, + "learning_rate": 4.884723588032025e-06, + "loss": 0.5973, + "step": 2283 + }, + { + "epoch": 0.6063985132085491, + "grad_norm": 0.3930848269379804, + "learning_rate": 4.884618771170869e-06, + "loss": 0.6469, + "step": 2284 + }, + { + "epoch": 0.6066640116819328, + "grad_norm": 0.3866725265930605, + "learning_rate": 4.884513907803808e-06, + "loss": 0.6217, + "step": 2285 + }, + { + "epoch": 0.6069295101553166, + "grad_norm": 0.3952857127342665, + "learning_rate": 4.884408997932888e-06, + "loss": 0.6326, + "step": 2286 + }, + { + "epoch": 0.6071950086287004, + "grad_norm": 0.39260635351597767, + "learning_rate": 4.884304041560154e-06, + "loss": 0.6259, + "step": 2287 + }, + { + "epoch": 0.6074605071020842, + "grad_norm": 0.4073982009483649, + "learning_rate": 4.884199038687653e-06, + "loss": 0.6402, + "step": 2288 + }, + { + "epoch": 0.607726005575468, + "grad_norm": 0.37151038893093485, + "learning_rate": 4.8840939893174346e-06, + "loss": 0.6137, + "step": 2289 + }, + { + "epoch": 0.6079915040488517, + "grad_norm": 0.3764410770156596, + "learning_rate": 4.883988893451545e-06, + "loss": 0.5987, + "step": 2290 + }, + { + "epoch": 0.6082570025222355, + "grad_norm": 0.3988277215546898, + "learning_rate": 4.883883751092035e-06, + "loss": 0.6097, + "step": 2291 + }, + { + "epoch": 0.6085225009956192, + "grad_norm": 0.38839529823626273, + "learning_rate": 4.883778562240956e-06, + "loss": 0.6456, + "step": 2292 + }, + { + "epoch": 0.6087879994690031, + "grad_norm": 0.3885724256697228, + "learning_rate": 4.8836733269003574e-06, + "loss": 0.6078, + "step": 2293 + }, + { + "epoch": 0.6090534979423868, + "grad_norm": 0.4111759276077352, + "learning_rate": 4.8835680450722936e-06, + "loss": 0.6468, + "step": 2294 + }, + { + "epoch": 0.6093189964157706, + "grad_norm": 0.3807975493978897, + "learning_rate": 4.883462716758817e-06, + "loss": 0.656, + "step": 2295 + }, + { + "epoch": 0.6095844948891543, + "grad_norm": 0.3923694304730504, + "learning_rate": 4.88335734196198e-06, + "loss": 0.6475, + "step": 2296 + }, + { + "epoch": 0.6098499933625382, + "grad_norm": 0.37477259929688456, + "learning_rate": 4.8832519206838414e-06, + "loss": 0.6318, + "step": 2297 + }, + { + "epoch": 0.610115491835922, + "grad_norm": 0.3838689728476556, + "learning_rate": 4.883146452926455e-06, + "loss": 0.6224, + "step": 2298 + }, + { + "epoch": 0.6103809903093057, + "grad_norm": 0.3657888431749063, + "learning_rate": 4.8830409386918766e-06, + "loss": 0.6015, + "step": 2299 + }, + { + "epoch": 0.6106464887826895, + "grad_norm": 0.39157567946203004, + "learning_rate": 4.882935377982166e-06, + "loss": 0.6444, + "step": 2300 + }, + { + "epoch": 0.6109119872560733, + "grad_norm": 0.3791873823184957, + "learning_rate": 4.882829770799381e-06, + "loss": 0.6111, + "step": 2301 + }, + { + "epoch": 0.6111774857294571, + "grad_norm": 0.3758602781250536, + "learning_rate": 4.88272411714558e-06, + "loss": 0.6124, + "step": 2302 + }, + { + "epoch": 0.6114429842028408, + "grad_norm": 0.3925975205540793, + "learning_rate": 4.882618417022827e-06, + "loss": 0.622, + "step": 2303 + }, + { + "epoch": 0.6117084826762246, + "grad_norm": 0.3899158034417881, + "learning_rate": 4.88251267043318e-06, + "loss": 0.6238, + "step": 2304 + }, + { + "epoch": 0.6119739811496084, + "grad_norm": 0.38634268746897243, + "learning_rate": 4.882406877378703e-06, + "loss": 0.6152, + "step": 2305 + }, + { + "epoch": 0.6122394796229922, + "grad_norm": 0.3809589119286443, + "learning_rate": 4.882301037861459e-06, + "loss": 0.5864, + "step": 2306 + }, + { + "epoch": 0.6125049780963759, + "grad_norm": 0.3857995001561622, + "learning_rate": 4.882195151883511e-06, + "loss": 0.5764, + "step": 2307 + }, + { + "epoch": 0.6127704765697597, + "grad_norm": 0.3913976214194172, + "learning_rate": 4.882089219446925e-06, + "loss": 0.6648, + "step": 2308 + }, + { + "epoch": 0.6130359750431436, + "grad_norm": 0.38831120435461774, + "learning_rate": 4.8819832405537675e-06, + "loss": 0.6469, + "step": 2309 + }, + { + "epoch": 0.6133014735165273, + "grad_norm": 0.49249891976627497, + "learning_rate": 4.881877215206104e-06, + "loss": 0.6067, + "step": 2310 + }, + { + "epoch": 0.6135669719899111, + "grad_norm": 0.39022262201084135, + "learning_rate": 4.881771143406004e-06, + "loss": 0.6406, + "step": 2311 + }, + { + "epoch": 0.6138324704632948, + "grad_norm": 0.41752508991924614, + "learning_rate": 4.881665025155535e-06, + "loss": 0.6215, + "step": 2312 + }, + { + "epoch": 0.6140979689366786, + "grad_norm": 0.3862454665198474, + "learning_rate": 4.881558860456765e-06, + "loss": 0.5983, + "step": 2313 + }, + { + "epoch": 0.6143634674100624, + "grad_norm": 0.38444100504837136, + "learning_rate": 4.881452649311768e-06, + "loss": 0.6294, + "step": 2314 + }, + { + "epoch": 0.6146289658834462, + "grad_norm": 0.3857585870410081, + "learning_rate": 4.881346391722613e-06, + "loss": 0.6513, + "step": 2315 + }, + { + "epoch": 0.6148944643568299, + "grad_norm": 0.41067777735844124, + "learning_rate": 4.881240087691372e-06, + "loss": 0.6406, + "step": 2316 + }, + { + "epoch": 0.6151599628302137, + "grad_norm": 0.3887479327214694, + "learning_rate": 4.88113373722012e-06, + "loss": 0.5949, + "step": 2317 + }, + { + "epoch": 0.6154254613035975, + "grad_norm": 0.40226059323754443, + "learning_rate": 4.88102734031093e-06, + "loss": 0.6234, + "step": 2318 + }, + { + "epoch": 0.6156909597769813, + "grad_norm": 0.39721682665934654, + "learning_rate": 4.880920896965877e-06, + "loss": 0.6123, + "step": 2319 + }, + { + "epoch": 0.615956458250365, + "grad_norm": 0.39369038462415645, + "learning_rate": 4.880814407187037e-06, + "loss": 0.6271, + "step": 2320 + }, + { + "epoch": 0.6162219567237488, + "grad_norm": 0.3897859329053599, + "learning_rate": 4.880707870976486e-06, + "loss": 0.6068, + "step": 2321 + }, + { + "epoch": 0.6164874551971327, + "grad_norm": 0.42543408190189935, + "learning_rate": 4.880601288336304e-06, + "loss": 0.6471, + "step": 2322 + }, + { + "epoch": 0.6167529536705164, + "grad_norm": 0.39625575285272974, + "learning_rate": 4.880494659268567e-06, + "loss": 0.6388, + "step": 2323 + }, + { + "epoch": 0.6170184521439002, + "grad_norm": 0.371069958328103, + "learning_rate": 4.880387983775356e-06, + "loss": 0.6013, + "step": 2324 + }, + { + "epoch": 0.6172839506172839, + "grad_norm": 0.39029632802901953, + "learning_rate": 4.880281261858751e-06, + "loss": 0.6347, + "step": 2325 + }, + { + "epoch": 0.6175494490906678, + "grad_norm": 0.41170146729381074, + "learning_rate": 4.880174493520834e-06, + "loss": 0.6275, + "step": 2326 + }, + { + "epoch": 0.6178149475640515, + "grad_norm": 0.3864660763060855, + "learning_rate": 4.8800676787636865e-06, + "loss": 0.658, + "step": 2327 + }, + { + "epoch": 0.6180804460374353, + "grad_norm": 0.41771132209064465, + "learning_rate": 4.879960817589392e-06, + "loss": 0.6564, + "step": 2328 + }, + { + "epoch": 0.618345944510819, + "grad_norm": 0.4069991024607502, + "learning_rate": 4.879853910000034e-06, + "loss": 0.6413, + "step": 2329 + }, + { + "epoch": 0.6186114429842028, + "grad_norm": 0.3868430910284126, + "learning_rate": 4.879746955997698e-06, + "loss": 0.6354, + "step": 2330 + }, + { + "epoch": 0.6188769414575866, + "grad_norm": 0.39251733973127556, + "learning_rate": 4.87963995558447e-06, + "loss": 0.5813, + "step": 2331 + }, + { + "epoch": 0.6191424399309704, + "grad_norm": 0.3891066975473416, + "learning_rate": 4.879532908762436e-06, + "loss": 0.6261, + "step": 2332 + }, + { + "epoch": 0.6194079384043542, + "grad_norm": 0.3965593417905505, + "learning_rate": 4.8794258155336845e-06, + "loss": 0.6196, + "step": 2333 + }, + { + "epoch": 0.6196734368777379, + "grad_norm": 0.38956380180103933, + "learning_rate": 4.879318675900304e-06, + "loss": 0.6374, + "step": 2334 + }, + { + "epoch": 0.6199389353511218, + "grad_norm": 0.3928432233142457, + "learning_rate": 4.879211489864383e-06, + "loss": 0.6188, + "step": 2335 + }, + { + "epoch": 0.6202044338245055, + "grad_norm": 0.4422949602136624, + "learning_rate": 4.8791042574280135e-06, + "loss": 0.588, + "step": 2336 + }, + { + "epoch": 0.6204699322978893, + "grad_norm": 0.39925491957786974, + "learning_rate": 4.878996978593286e-06, + "loss": 0.6214, + "step": 2337 + }, + { + "epoch": 0.620735430771273, + "grad_norm": 0.38079109358165275, + "learning_rate": 4.878889653362292e-06, + "loss": 0.611, + "step": 2338 + }, + { + "epoch": 0.6210009292446569, + "grad_norm": 0.46654279735645504, + "learning_rate": 4.878782281737125e-06, + "loss": 0.621, + "step": 2339 + }, + { + "epoch": 0.6212664277180406, + "grad_norm": 0.40210668345158673, + "learning_rate": 4.878674863719879e-06, + "loss": 0.5821, + "step": 2340 + }, + { + "epoch": 0.6215319261914244, + "grad_norm": 0.3770331725948, + "learning_rate": 4.87856739931265e-06, + "loss": 0.6089, + "step": 2341 + }, + { + "epoch": 0.6217974246648081, + "grad_norm": 0.38523267183865273, + "learning_rate": 4.878459888517533e-06, + "loss": 0.6494, + "step": 2342 + }, + { + "epoch": 0.622062923138192, + "grad_norm": 0.391862533871917, + "learning_rate": 4.878352331336624e-06, + "loss": 0.5861, + "step": 2343 + }, + { + "epoch": 0.6223284216115758, + "grad_norm": 0.39001758829827077, + "learning_rate": 4.8782447277720225e-06, + "loss": 0.6192, + "step": 2344 + }, + { + "epoch": 0.6225939200849595, + "grad_norm": 0.39191717983325164, + "learning_rate": 4.8781370778258254e-06, + "loss": 0.6415, + "step": 2345 + }, + { + "epoch": 0.6228594185583433, + "grad_norm": 0.3792087930924814, + "learning_rate": 4.878029381500132e-06, + "loss": 0.6257, + "step": 2346 + }, + { + "epoch": 0.6231249170317271, + "grad_norm": 0.38707098623135416, + "learning_rate": 4.877921638797044e-06, + "loss": 0.6411, + "step": 2347 + }, + { + "epoch": 0.6233904155051109, + "grad_norm": 0.3853638096747392, + "learning_rate": 4.877813849718661e-06, + "loss": 0.6032, + "step": 2348 + }, + { + "epoch": 0.6236559139784946, + "grad_norm": 0.3898622760666724, + "learning_rate": 4.877706014267086e-06, + "loss": 0.6281, + "step": 2349 + }, + { + "epoch": 0.6239214124518784, + "grad_norm": 0.3822552211577024, + "learning_rate": 4.877598132444423e-06, + "loss": 0.5934, + "step": 2350 + }, + { + "epoch": 0.6241869109252621, + "grad_norm": 0.40283323302067503, + "learning_rate": 4.877490204252775e-06, + "loss": 0.5896, + "step": 2351 + }, + { + "epoch": 0.624452409398646, + "grad_norm": 0.3879238698124411, + "learning_rate": 4.877382229694247e-06, + "loss": 0.6444, + "step": 2352 + }, + { + "epoch": 0.6247179078720297, + "grad_norm": 0.39407605100023513, + "learning_rate": 4.8772742087709435e-06, + "loss": 0.623, + "step": 2353 + }, + { + "epoch": 0.6249834063454135, + "grad_norm": 0.41825717516977834, + "learning_rate": 4.877166141484973e-06, + "loss": 0.6239, + "step": 2354 + }, + { + "epoch": 0.6252489048187972, + "grad_norm": 0.39924390668571913, + "learning_rate": 4.877058027838443e-06, + "loss": 0.5883, + "step": 2355 + }, + { + "epoch": 0.6255144032921811, + "grad_norm": 0.38975037913708366, + "learning_rate": 4.8769498678334605e-06, + "loss": 0.6402, + "step": 2356 + }, + { + "epoch": 0.6257799017655649, + "grad_norm": 0.3961743296798766, + "learning_rate": 4.876841661472136e-06, + "loss": 0.5923, + "step": 2357 + }, + { + "epoch": 0.6260454002389486, + "grad_norm": 0.3749225849145829, + "learning_rate": 4.8767334087565795e-06, + "loss": 0.6059, + "step": 2358 + }, + { + "epoch": 0.6263108987123324, + "grad_norm": 0.39068089806455725, + "learning_rate": 4.876625109688903e-06, + "loss": 0.653, + "step": 2359 + }, + { + "epoch": 0.6265763971857162, + "grad_norm": 0.4281047437820375, + "learning_rate": 4.8765167642712175e-06, + "loss": 0.6203, + "step": 2360 + }, + { + "epoch": 0.6268418956591, + "grad_norm": 0.3866583387083701, + "learning_rate": 4.876408372505636e-06, + "loss": 0.6348, + "step": 2361 + }, + { + "epoch": 0.6271073941324837, + "grad_norm": 0.39484408716558617, + "learning_rate": 4.876299934394274e-06, + "loss": 0.6721, + "step": 2362 + }, + { + "epoch": 0.6273728926058675, + "grad_norm": 0.39982151261862975, + "learning_rate": 4.876191449939244e-06, + "loss": 0.6193, + "step": 2363 + }, + { + "epoch": 0.6276383910792513, + "grad_norm": 0.410190825250382, + "learning_rate": 4.8760829191426626e-06, + "loss": 0.6488, + "step": 2364 + }, + { + "epoch": 0.6279038895526351, + "grad_norm": 0.3840322930565453, + "learning_rate": 4.875974342006647e-06, + "loss": 0.6174, + "step": 2365 + }, + { + "epoch": 0.6281693880260188, + "grad_norm": 0.37532539966358164, + "learning_rate": 4.875865718533315e-06, + "loss": 0.6687, + "step": 2366 + }, + { + "epoch": 0.6284348864994026, + "grad_norm": 0.43432916636481106, + "learning_rate": 4.875757048724784e-06, + "loss": 0.6141, + "step": 2367 + }, + { + "epoch": 0.6287003849727864, + "grad_norm": 0.3839813673259421, + "learning_rate": 4.875648332583174e-06, + "loss": 0.6498, + "step": 2368 + }, + { + "epoch": 0.6289658834461702, + "grad_norm": 0.37749868778090734, + "learning_rate": 4.875539570110604e-06, + "loss": 0.5948, + "step": 2369 + }, + { + "epoch": 0.629231381919554, + "grad_norm": 0.3790653920541351, + "learning_rate": 4.875430761309197e-06, + "loss": 0.6334, + "step": 2370 + }, + { + "epoch": 0.6294968803929377, + "grad_norm": 0.3914305185793328, + "learning_rate": 4.875321906181073e-06, + "loss": 0.6317, + "step": 2371 + }, + { + "epoch": 0.6297623788663215, + "grad_norm": 0.38487283781364384, + "learning_rate": 4.8752130047283565e-06, + "loss": 0.6152, + "step": 2372 + }, + { + "epoch": 0.6300278773397053, + "grad_norm": 0.3890328379433162, + "learning_rate": 4.875104056953172e-06, + "loss": 0.6278, + "step": 2373 + }, + { + "epoch": 0.6302933758130891, + "grad_norm": 0.3780682485002683, + "learning_rate": 4.874995062857641e-06, + "loss": 0.6052, + "step": 2374 + }, + { + "epoch": 0.6305588742864728, + "grad_norm": 0.3964397805161175, + "learning_rate": 4.874886022443893e-06, + "loss": 0.6295, + "step": 2375 + }, + { + "epoch": 0.6308243727598566, + "grad_norm": 0.400707792006883, + "learning_rate": 4.8747769357140515e-06, + "loss": 0.6554, + "step": 2376 + }, + { + "epoch": 0.6310898712332405, + "grad_norm": 0.3745922323646813, + "learning_rate": 4.874667802670246e-06, + "loss": 0.5956, + "step": 2377 + }, + { + "epoch": 0.6313553697066242, + "grad_norm": 0.3802428317650885, + "learning_rate": 4.874558623314604e-06, + "loss": 0.6002, + "step": 2378 + }, + { + "epoch": 0.631620868180008, + "grad_norm": 0.4018762375198942, + "learning_rate": 4.874449397649256e-06, + "loss": 0.6311, + "step": 2379 + }, + { + "epoch": 0.6318863666533917, + "grad_norm": 0.38163789447817625, + "learning_rate": 4.87434012567633e-06, + "loss": 0.6713, + "step": 2380 + }, + { + "epoch": 0.6321518651267756, + "grad_norm": 0.37485428652108665, + "learning_rate": 4.8742308073979584e-06, + "loss": 0.5963, + "step": 2381 + }, + { + "epoch": 0.6324173636001593, + "grad_norm": 0.4083955630109161, + "learning_rate": 4.874121442816273e-06, + "loss": 0.6268, + "step": 2382 + }, + { + "epoch": 0.6326828620735431, + "grad_norm": 0.4070481869629904, + "learning_rate": 4.874012031933407e-06, + "loss": 0.6142, + "step": 2383 + }, + { + "epoch": 0.6329483605469268, + "grad_norm": 0.3918489236527451, + "learning_rate": 4.873902574751493e-06, + "loss": 0.645, + "step": 2384 + }, + { + "epoch": 0.6332138590203107, + "grad_norm": 0.3857532718122673, + "learning_rate": 4.8737930712726665e-06, + "loss": 0.6117, + "step": 2385 + }, + { + "epoch": 0.6334793574936944, + "grad_norm": 0.400455466140325, + "learning_rate": 4.873683521499064e-06, + "loss": 0.6635, + "step": 2386 + }, + { + "epoch": 0.6337448559670782, + "grad_norm": 0.39315283241470816, + "learning_rate": 4.87357392543282e-06, + "loss": 0.6267, + "step": 2387 + }, + { + "epoch": 0.6340103544404619, + "grad_norm": 0.375461837976201, + "learning_rate": 4.873464283076074e-06, + "loss": 0.5915, + "step": 2388 + }, + { + "epoch": 0.6342758529138457, + "grad_norm": 0.382130824836503, + "learning_rate": 4.873354594430963e-06, + "loss": 0.6489, + "step": 2389 + }, + { + "epoch": 0.6345413513872296, + "grad_norm": 0.38855594608669636, + "learning_rate": 4.873244859499625e-06, + "loss": 0.6349, + "step": 2390 + }, + { + "epoch": 0.6348068498606133, + "grad_norm": 0.37666399182589605, + "learning_rate": 4.873135078284203e-06, + "loss": 0.6137, + "step": 2391 + }, + { + "epoch": 0.6350723483339971, + "grad_norm": 0.38143451438472825, + "learning_rate": 4.873025250786836e-06, + "loss": 0.6406, + "step": 2392 + }, + { + "epoch": 0.6353378468073808, + "grad_norm": 0.37615911290761034, + "learning_rate": 4.872915377009666e-06, + "loss": 0.6218, + "step": 2393 + }, + { + "epoch": 0.6356033452807647, + "grad_norm": 0.3849287713753922, + "learning_rate": 4.872805456954837e-06, + "loss": 0.619, + "step": 2394 + }, + { + "epoch": 0.6358688437541484, + "grad_norm": 0.3945313230420764, + "learning_rate": 4.8726954906244915e-06, + "loss": 0.6451, + "step": 2395 + }, + { + "epoch": 0.6361343422275322, + "grad_norm": 0.4192730294881194, + "learning_rate": 4.872585478020776e-06, + "loss": 0.6402, + "step": 2396 + }, + { + "epoch": 0.6363998407009159, + "grad_norm": 0.3879844510767756, + "learning_rate": 4.872475419145832e-06, + "loss": 0.6363, + "step": 2397 + }, + { + "epoch": 0.6366653391742998, + "grad_norm": 0.3771814158584119, + "learning_rate": 4.87236531400181e-06, + "loss": 0.6697, + "step": 2398 + }, + { + "epoch": 0.6369308376476835, + "grad_norm": 0.3794289928880538, + "learning_rate": 4.872255162590857e-06, + "loss": 0.6053, + "step": 2399 + }, + { + "epoch": 0.6371963361210673, + "grad_norm": 0.3914351725253561, + "learning_rate": 4.872144964915118e-06, + "loss": 0.6366, + "step": 2400 + }, + { + "epoch": 0.637461834594451, + "grad_norm": 0.3903557114148793, + "learning_rate": 4.872034720976745e-06, + "loss": 0.62, + "step": 2401 + }, + { + "epoch": 0.6377273330678349, + "grad_norm": 0.3947328295929092, + "learning_rate": 4.871924430777888e-06, + "loss": 0.6098, + "step": 2402 + }, + { + "epoch": 0.6379928315412187, + "grad_norm": 0.3850221782244195, + "learning_rate": 4.871814094320696e-06, + "loss": 0.6343, + "step": 2403 + }, + { + "epoch": 0.6382583300146024, + "grad_norm": 0.4028496713885383, + "learning_rate": 4.871703711607322e-06, + "loss": 0.664, + "step": 2404 + }, + { + "epoch": 0.6385238284879862, + "grad_norm": 0.36735043987302124, + "learning_rate": 4.871593282639919e-06, + "loss": 0.6447, + "step": 2405 + }, + { + "epoch": 0.6387893269613699, + "grad_norm": 0.3851391222934083, + "learning_rate": 4.87148280742064e-06, + "loss": 0.5908, + "step": 2406 + }, + { + "epoch": 0.6390548254347538, + "grad_norm": 0.3893816450710175, + "learning_rate": 4.87137228595164e-06, + "loss": 0.6276, + "step": 2407 + }, + { + "epoch": 0.6393203239081375, + "grad_norm": 0.39413184248496463, + "learning_rate": 4.871261718235075e-06, + "loss": 0.6278, + "step": 2408 + }, + { + "epoch": 0.6395858223815213, + "grad_norm": 0.4365868367662229, + "learning_rate": 4.8711511042731e-06, + "loss": 0.6267, + "step": 2409 + }, + { + "epoch": 0.639851320854905, + "grad_norm": 0.3810323792503826, + "learning_rate": 4.871040444067873e-06, + "loss": 0.6209, + "step": 2410 + }, + { + "epoch": 0.6401168193282889, + "grad_norm": 0.39784333252611237, + "learning_rate": 4.870929737621551e-06, + "loss": 0.6409, + "step": 2411 + }, + { + "epoch": 0.6403823178016727, + "grad_norm": 0.38942566771250076, + "learning_rate": 4.870818984936296e-06, + "loss": 0.6378, + "step": 2412 + }, + { + "epoch": 0.6406478162750564, + "grad_norm": 0.4087388819997767, + "learning_rate": 4.8707081860142645e-06, + "loss": 0.6171, + "step": 2413 + }, + { + "epoch": 0.6409133147484402, + "grad_norm": 0.36240731231871115, + "learning_rate": 4.8705973408576205e-06, + "loss": 0.6004, + "step": 2414 + }, + { + "epoch": 0.641178813221824, + "grad_norm": 0.39722425919833493, + "learning_rate": 4.870486449468522e-06, + "loss": 0.6066, + "step": 2415 + }, + { + "epoch": 0.6414443116952078, + "grad_norm": 0.4163633823795304, + "learning_rate": 4.870375511849136e-06, + "loss": 0.6402, + "step": 2416 + }, + { + "epoch": 0.6417098101685915, + "grad_norm": 0.36712661710311617, + "learning_rate": 4.870264528001623e-06, + "loss": 0.6108, + "step": 2417 + }, + { + "epoch": 0.6419753086419753, + "grad_norm": 0.397016529454373, + "learning_rate": 4.870153497928147e-06, + "loss": 0.6484, + "step": 2418 + }, + { + "epoch": 0.6422408071153591, + "grad_norm": 0.38553671497935776, + "learning_rate": 4.8700424216308765e-06, + "loss": 0.5517, + "step": 2419 + }, + { + "epoch": 0.6425063055887429, + "grad_norm": 0.40996167485293356, + "learning_rate": 4.869931299111974e-06, + "loss": 0.6491, + "step": 2420 + }, + { + "epoch": 0.6427718040621266, + "grad_norm": 0.38722113113089673, + "learning_rate": 4.8698201303736105e-06, + "loss": 0.6395, + "step": 2421 + }, + { + "epoch": 0.6430373025355104, + "grad_norm": 0.4000129523578866, + "learning_rate": 4.869708915417951e-06, + "loss": 0.6179, + "step": 2422 + }, + { + "epoch": 0.6433028010088943, + "grad_norm": 0.40020045379966307, + "learning_rate": 4.869597654247166e-06, + "loss": 0.6324, + "step": 2423 + }, + { + "epoch": 0.643568299482278, + "grad_norm": 0.3850289895582323, + "learning_rate": 4.8694863468634255e-06, + "loss": 0.6296, + "step": 2424 + }, + { + "epoch": 0.6438337979556618, + "grad_norm": 0.3797961163216567, + "learning_rate": 4.869374993268898e-06, + "loss": 0.6131, + "step": 2425 + }, + { + "epoch": 0.6440992964290455, + "grad_norm": 0.3965706741086633, + "learning_rate": 4.8692635934657585e-06, + "loss": 0.6176, + "step": 2426 + }, + { + "epoch": 0.6443647949024293, + "grad_norm": 0.4042120244882442, + "learning_rate": 4.869152147456177e-06, + "loss": 0.6599, + "step": 2427 + }, + { + "epoch": 0.6446302933758131, + "grad_norm": 0.381712068532955, + "learning_rate": 4.8690406552423294e-06, + "loss": 0.6246, + "step": 2428 + }, + { + "epoch": 0.6448957918491969, + "grad_norm": 0.3758611799165009, + "learning_rate": 4.868929116826388e-06, + "loss": 0.5742, + "step": 2429 + }, + { + "epoch": 0.6451612903225806, + "grad_norm": 0.39409171170112794, + "learning_rate": 4.868817532210528e-06, + "loss": 0.631, + "step": 2430 + }, + { + "epoch": 0.6454267887959644, + "grad_norm": 0.37446255195683514, + "learning_rate": 4.868705901396926e-06, + "loss": 0.5964, + "step": 2431 + }, + { + "epoch": 0.6456922872693482, + "grad_norm": 0.38023788410347037, + "learning_rate": 4.86859422438776e-06, + "loss": 0.6272, + "step": 2432 + }, + { + "epoch": 0.645957785742732, + "grad_norm": 0.3706862445659322, + "learning_rate": 4.868482501185208e-06, + "loss": 0.622, + "step": 2433 + }, + { + "epoch": 0.6462232842161157, + "grad_norm": 0.39954781948419105, + "learning_rate": 4.868370731791447e-06, + "loss": 0.6237, + "step": 2434 + }, + { + "epoch": 0.6464887826894995, + "grad_norm": 0.3917522697853562, + "learning_rate": 4.868258916208658e-06, + "loss": 0.6468, + "step": 2435 + }, + { + "epoch": 0.6467542811628834, + "grad_norm": 0.3816753365170755, + "learning_rate": 4.868147054439022e-06, + "loss": 0.633, + "step": 2436 + }, + { + "epoch": 0.6470197796362671, + "grad_norm": 0.3877572153449095, + "learning_rate": 4.8680351464847206e-06, + "loss": 0.6265, + "step": 2437 + }, + { + "epoch": 0.6472852781096509, + "grad_norm": 0.3753767609207334, + "learning_rate": 4.867923192347936e-06, + "loss": 0.5868, + "step": 2438 + }, + { + "epoch": 0.6475507765830346, + "grad_norm": 0.40940122388600325, + "learning_rate": 4.86781119203085e-06, + "loss": 0.6212, + "step": 2439 + }, + { + "epoch": 0.6478162750564185, + "grad_norm": 0.38740799178512, + "learning_rate": 4.86769914553565e-06, + "loss": 0.6205, + "step": 2440 + }, + { + "epoch": 0.6480817735298022, + "grad_norm": 0.4109288749924628, + "learning_rate": 4.867587052864519e-06, + "loss": 0.6439, + "step": 2441 + }, + { + "epoch": 0.648347272003186, + "grad_norm": 0.3935390604153536, + "learning_rate": 4.867474914019643e-06, + "loss": 0.6322, + "step": 2442 + }, + { + "epoch": 0.6486127704765697, + "grad_norm": 0.3936567995133444, + "learning_rate": 4.86736272900321e-06, + "loss": 0.65, + "step": 2443 + }, + { + "epoch": 0.6488782689499535, + "grad_norm": 0.38473693777235074, + "learning_rate": 4.867250497817407e-06, + "loss": 0.6052, + "step": 2444 + }, + { + "epoch": 0.6491437674233373, + "grad_norm": 0.40568914204433476, + "learning_rate": 4.8671382204644245e-06, + "loss": 0.6185, + "step": 2445 + }, + { + "epoch": 0.6494092658967211, + "grad_norm": 0.42111090788991906, + "learning_rate": 4.86702589694645e-06, + "loss": 0.5368, + "step": 2446 + }, + { + "epoch": 0.6496747643701049, + "grad_norm": 0.4185580111244889, + "learning_rate": 4.866913527265676e-06, + "loss": 0.6461, + "step": 2447 + }, + { + "epoch": 0.6499402628434886, + "grad_norm": 0.42930425429914, + "learning_rate": 4.8668011114242916e-06, + "loss": 0.621, + "step": 2448 + }, + { + "epoch": 0.6502057613168725, + "grad_norm": 0.4021288392418631, + "learning_rate": 4.8666886494244916e-06, + "loss": 0.6738, + "step": 2449 + }, + { + "epoch": 0.6504712597902562, + "grad_norm": 0.39124077599819274, + "learning_rate": 4.866576141268468e-06, + "loss": 0.6422, + "step": 2450 + }, + { + "epoch": 0.65073675826364, + "grad_norm": 0.40667606076307977, + "learning_rate": 4.866463586958415e-06, + "loss": 0.6491, + "step": 2451 + }, + { + "epoch": 0.6510022567370237, + "grad_norm": 0.38649628067677233, + "learning_rate": 4.8663509864965286e-06, + "loss": 0.6452, + "step": 2452 + }, + { + "epoch": 0.6512677552104076, + "grad_norm": 0.4401364031459709, + "learning_rate": 4.866238339885004e-06, + "loss": 0.6321, + "step": 2453 + }, + { + "epoch": 0.6515332536837913, + "grad_norm": 0.38044106896229735, + "learning_rate": 4.866125647126038e-06, + "loss": 0.6282, + "step": 2454 + }, + { + "epoch": 0.6517987521571751, + "grad_norm": 0.48513941071869, + "learning_rate": 4.866012908221829e-06, + "loss": 0.5852, + "step": 2455 + }, + { + "epoch": 0.6520642506305588, + "grad_norm": 0.4096964037537831, + "learning_rate": 4.865900123174574e-06, + "loss": 0.6311, + "step": 2456 + }, + { + "epoch": 0.6523297491039427, + "grad_norm": 0.4013675193995615, + "learning_rate": 4.8657872919864756e-06, + "loss": 0.6262, + "step": 2457 + }, + { + "epoch": 0.6525952475773265, + "grad_norm": 0.40322131157403657, + "learning_rate": 4.8656744146597315e-06, + "loss": 0.617, + "step": 2458 + }, + { + "epoch": 0.6528607460507102, + "grad_norm": 0.4021442945770837, + "learning_rate": 4.8655614911965445e-06, + "loss": 0.5883, + "step": 2459 + }, + { + "epoch": 0.653126244524094, + "grad_norm": 0.4001518030032774, + "learning_rate": 4.865448521599116e-06, + "loss": 0.64, + "step": 2460 + }, + { + "epoch": 0.6533917429974777, + "grad_norm": 0.392434863142415, + "learning_rate": 4.86533550586965e-06, + "loss": 0.6122, + "step": 2461 + }, + { + "epoch": 0.6536572414708616, + "grad_norm": 0.3944349165766381, + "learning_rate": 4.86522244401035e-06, + "loss": 0.629, + "step": 2462 + }, + { + "epoch": 0.6539227399442453, + "grad_norm": 0.3884606087919453, + "learning_rate": 4.865109336023422e-06, + "loss": 0.538, + "step": 2463 + }, + { + "epoch": 0.6541882384176291, + "grad_norm": 0.402793762278969, + "learning_rate": 4.864996181911072e-06, + "loss": 0.6368, + "step": 2464 + }, + { + "epoch": 0.6544537368910128, + "grad_norm": 0.3866759782295091, + "learning_rate": 4.8648829816755035e-06, + "loss": 0.6094, + "step": 2465 + }, + { + "epoch": 0.6547192353643967, + "grad_norm": 0.38169281248894205, + "learning_rate": 4.864769735318929e-06, + "loss": 0.6391, + "step": 2466 + }, + { + "epoch": 0.6549847338377804, + "grad_norm": 0.3824012639897559, + "learning_rate": 4.864656442843553e-06, + "loss": 0.5972, + "step": 2467 + }, + { + "epoch": 0.6552502323111642, + "grad_norm": 0.3939800833629133, + "learning_rate": 4.864543104251587e-06, + "loss": 0.6094, + "step": 2468 + }, + { + "epoch": 0.655515730784548, + "grad_norm": 0.40144669559925655, + "learning_rate": 4.864429719545241e-06, + "loss": 0.6595, + "step": 2469 + }, + { + "epoch": 0.6557812292579318, + "grad_norm": 0.3853803866206681, + "learning_rate": 4.864316288726727e-06, + "loss": 0.6412, + "step": 2470 + }, + { + "epoch": 0.6560467277313156, + "grad_norm": 0.3883359918727682, + "learning_rate": 4.864202811798257e-06, + "loss": 0.639, + "step": 2471 + }, + { + "epoch": 0.6563122262046993, + "grad_norm": 0.3761311457924535, + "learning_rate": 4.864089288762042e-06, + "loss": 0.5927, + "step": 2472 + }, + { + "epoch": 0.6565777246780831, + "grad_norm": 0.3781772054186187, + "learning_rate": 4.863975719620298e-06, + "loss": 0.6301, + "step": 2473 + }, + { + "epoch": 0.6568432231514669, + "grad_norm": 0.39547772531661374, + "learning_rate": 4.8638621043752406e-06, + "loss": 0.6462, + "step": 2474 + }, + { + "epoch": 0.6571087216248507, + "grad_norm": 0.3772884194350595, + "learning_rate": 4.863748443029084e-06, + "loss": 0.6164, + "step": 2475 + }, + { + "epoch": 0.6573742200982344, + "grad_norm": 0.3939065243459478, + "learning_rate": 4.863634735584045e-06, + "loss": 0.6083, + "step": 2476 + }, + { + "epoch": 0.6576397185716182, + "grad_norm": 0.3892537674893566, + "learning_rate": 4.863520982042341e-06, + "loss": 0.6283, + "step": 2477 + }, + { + "epoch": 0.657905217045002, + "grad_norm": 0.37374868197468986, + "learning_rate": 4.863407182406192e-06, + "loss": 0.6056, + "step": 2478 + }, + { + "epoch": 0.6581707155183858, + "grad_norm": 0.39387086314838793, + "learning_rate": 4.863293336677815e-06, + "loss": 0.6126, + "step": 2479 + }, + { + "epoch": 0.6584362139917695, + "grad_norm": 0.4092623613598788, + "learning_rate": 4.8631794448594325e-06, + "loss": 0.6825, + "step": 2480 + }, + { + "epoch": 0.6587017124651533, + "grad_norm": 0.378173667013933, + "learning_rate": 4.863065506953265e-06, + "loss": 0.6548, + "step": 2481 + }, + { + "epoch": 0.658967210938537, + "grad_norm": 0.37910481845293986, + "learning_rate": 4.862951522961534e-06, + "loss": 0.6229, + "step": 2482 + }, + { + "epoch": 0.6592327094119209, + "grad_norm": 0.3849573192672843, + "learning_rate": 4.862837492886463e-06, + "loss": 0.6437, + "step": 2483 + }, + { + "epoch": 0.6594982078853047, + "grad_norm": 0.38643860083625237, + "learning_rate": 4.862723416730275e-06, + "loss": 0.6016, + "step": 2484 + }, + { + "epoch": 0.6597637063586884, + "grad_norm": 0.38429315220821314, + "learning_rate": 4.8626092944951965e-06, + "loss": 0.6487, + "step": 2485 + }, + { + "epoch": 0.6600292048320722, + "grad_norm": 0.39503680112549455, + "learning_rate": 4.862495126183451e-06, + "loss": 0.6416, + "step": 2486 + }, + { + "epoch": 0.660294703305456, + "grad_norm": 0.3689026852007654, + "learning_rate": 4.8623809117972665e-06, + "loss": 0.619, + "step": 2487 + }, + { + "epoch": 0.6605602017788398, + "grad_norm": 0.3866011676925736, + "learning_rate": 4.86226665133887e-06, + "loss": 0.6535, + "step": 2488 + }, + { + "epoch": 0.6608257002522235, + "grad_norm": 0.38050250486608783, + "learning_rate": 4.862152344810491e-06, + "loss": 0.6343, + "step": 2489 + }, + { + "epoch": 0.6610911987256073, + "grad_norm": 0.38632021616597995, + "learning_rate": 4.862037992214357e-06, + "loss": 0.6215, + "step": 2490 + }, + { + "epoch": 0.6613566971989912, + "grad_norm": 0.38711932869121424, + "learning_rate": 4.861923593552699e-06, + "loss": 0.6334, + "step": 2491 + }, + { + "epoch": 0.6616221956723749, + "grad_norm": 0.37789754896138056, + "learning_rate": 4.861809148827747e-06, + "loss": 0.6601, + "step": 2492 + }, + { + "epoch": 0.6618876941457587, + "grad_norm": 0.38116298789751557, + "learning_rate": 4.8616946580417344e-06, + "loss": 0.6237, + "step": 2493 + }, + { + "epoch": 0.6621531926191424, + "grad_norm": 0.38789035611558487, + "learning_rate": 4.861580121196894e-06, + "loss": 0.6037, + "step": 2494 + }, + { + "epoch": 0.6624186910925263, + "grad_norm": 0.38684696372789545, + "learning_rate": 4.861465538295459e-06, + "loss": 0.6267, + "step": 2495 + }, + { + "epoch": 0.66268418956591, + "grad_norm": 0.38752623864801644, + "learning_rate": 4.861350909339664e-06, + "loss": 0.6366, + "step": 2496 + }, + { + "epoch": 0.6629496880392938, + "grad_norm": 0.4059626560122765, + "learning_rate": 4.861236234331745e-06, + "loss": 0.6642, + "step": 2497 + }, + { + "epoch": 0.6632151865126775, + "grad_norm": 0.3871700790127218, + "learning_rate": 4.861121513273938e-06, + "loss": 0.6233, + "step": 2498 + }, + { + "epoch": 0.6634806849860613, + "grad_norm": 0.39138099142193133, + "learning_rate": 4.861006746168479e-06, + "loss": 0.5794, + "step": 2499 + }, + { + "epoch": 0.6637461834594451, + "grad_norm": 0.38879068707973874, + "learning_rate": 4.86089193301761e-06, + "loss": 0.6356, + "step": 2500 + }, + { + "epoch": 0.6640116819328289, + "grad_norm": 0.37596873234278433, + "learning_rate": 4.860777073823567e-06, + "loss": 0.6215, + "step": 2501 + }, + { + "epoch": 0.6642771804062126, + "grad_norm": 0.383533070368256, + "learning_rate": 4.86066216858859e-06, + "loss": 0.6416, + "step": 2502 + }, + { + "epoch": 0.6645426788795964, + "grad_norm": 0.3982564508673109, + "learning_rate": 4.860547217314922e-06, + "loss": 0.6029, + "step": 2503 + }, + { + "epoch": 0.6648081773529803, + "grad_norm": 0.378744352411347, + "learning_rate": 4.860432220004803e-06, + "loss": 0.606, + "step": 2504 + }, + { + "epoch": 0.665073675826364, + "grad_norm": 0.3732731941170594, + "learning_rate": 4.860317176660477e-06, + "loss": 0.6209, + "step": 2505 + }, + { + "epoch": 0.6653391742997478, + "grad_norm": 0.38207672202431, + "learning_rate": 4.8602020872841865e-06, + "loss": 0.6204, + "step": 2506 + }, + { + "epoch": 0.6656046727731315, + "grad_norm": 0.3994957365386139, + "learning_rate": 4.860086951878177e-06, + "loss": 0.6453, + "step": 2507 + }, + { + "epoch": 0.6658701712465154, + "grad_norm": 0.3856215522084647, + "learning_rate": 4.859971770444692e-06, + "loss": 0.6513, + "step": 2508 + }, + { + "epoch": 0.6661356697198991, + "grad_norm": 0.41097373652956726, + "learning_rate": 4.859856542985981e-06, + "loss": 0.5832, + "step": 2509 + }, + { + "epoch": 0.6664011681932829, + "grad_norm": 0.38595628458347075, + "learning_rate": 4.859741269504289e-06, + "loss": 0.5778, + "step": 2510 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 0.3992401585977327, + "learning_rate": 4.859625950001864e-06, + "loss": 0.619, + "step": 2511 + }, + { + "epoch": 0.6669321651400505, + "grad_norm": 0.39287580036457476, + "learning_rate": 4.859510584480956e-06, + "loss": 0.6259, + "step": 2512 + }, + { + "epoch": 0.6671976636134342, + "grad_norm": 0.38021736579129695, + "learning_rate": 4.859395172943815e-06, + "loss": 0.6263, + "step": 2513 + }, + { + "epoch": 0.667463162086818, + "grad_norm": 0.3838993766133399, + "learning_rate": 4.859279715392691e-06, + "loss": 0.6107, + "step": 2514 + }, + { + "epoch": 0.6677286605602017, + "grad_norm": 0.38181431090714285, + "learning_rate": 4.8591642118298355e-06, + "loss": 0.6186, + "step": 2515 + }, + { + "epoch": 0.6679941590335856, + "grad_norm": 0.37836799081787303, + "learning_rate": 4.8590486622575015e-06, + "loss": 0.6305, + "step": 2516 + }, + { + "epoch": 0.6682596575069694, + "grad_norm": 0.38835025871021733, + "learning_rate": 4.858933066677944e-06, + "loss": 0.6106, + "step": 2517 + }, + { + "epoch": 0.6685251559803531, + "grad_norm": 0.3775874377775798, + "learning_rate": 4.858817425093415e-06, + "loss": 0.6459, + "step": 2518 + }, + { + "epoch": 0.6687906544537369, + "grad_norm": 0.39040841830586814, + "learning_rate": 4.85870173750617e-06, + "loss": 0.6496, + "step": 2519 + }, + { + "epoch": 0.6690561529271206, + "grad_norm": 0.38147813795752034, + "learning_rate": 4.858586003918469e-06, + "loss": 0.6072, + "step": 2520 + }, + { + "epoch": 0.6693216514005045, + "grad_norm": 0.3865106030914664, + "learning_rate": 4.8584702243325635e-06, + "loss": 0.6387, + "step": 2521 + }, + { + "epoch": 0.6695871498738882, + "grad_norm": 0.36842893630068424, + "learning_rate": 4.858354398750715e-06, + "loss": 0.6074, + "step": 2522 + }, + { + "epoch": 0.669852648347272, + "grad_norm": 0.3822321973649649, + "learning_rate": 4.858238527175181e-06, + "loss": 0.6423, + "step": 2523 + }, + { + "epoch": 0.6701181468206557, + "grad_norm": 0.3735881108709939, + "learning_rate": 4.858122609608222e-06, + "loss": 0.6357, + "step": 2524 + }, + { + "epoch": 0.6703836452940396, + "grad_norm": 0.3908279670644259, + "learning_rate": 4.858006646052098e-06, + "loss": 0.6357, + "step": 2525 + }, + { + "epoch": 0.6706491437674234, + "grad_norm": 0.38800731462938065, + "learning_rate": 4.857890636509072e-06, + "loss": 0.6179, + "step": 2526 + }, + { + "epoch": 0.6709146422408071, + "grad_norm": 0.38994344794696584, + "learning_rate": 4.8577745809814046e-06, + "loss": 0.6182, + "step": 2527 + }, + { + "epoch": 0.6711801407141909, + "grad_norm": 0.374108861793892, + "learning_rate": 4.85765847947136e-06, + "loss": 0.5971, + "step": 2528 + }, + { + "epoch": 0.6714456391875747, + "grad_norm": 0.38951660976775165, + "learning_rate": 4.857542331981203e-06, + "loss": 0.6474, + "step": 2529 + }, + { + "epoch": 0.6717111376609585, + "grad_norm": 0.3873093174653202, + "learning_rate": 4.857426138513198e-06, + "loss": 0.5944, + "step": 2530 + }, + { + "epoch": 0.6719766361343422, + "grad_norm": 0.47293909425983943, + "learning_rate": 4.857309899069612e-06, + "loss": 0.5405, + "step": 2531 + }, + { + "epoch": 0.672242134607726, + "grad_norm": 0.416257384381464, + "learning_rate": 4.857193613652711e-06, + "loss": 0.6393, + "step": 2532 + }, + { + "epoch": 0.6725076330811098, + "grad_norm": 0.40259090507413703, + "learning_rate": 4.857077282264763e-06, + "loss": 0.628, + "step": 2533 + }, + { + "epoch": 0.6727731315544936, + "grad_norm": 0.39750122403523114, + "learning_rate": 4.856960904908037e-06, + "loss": 0.6022, + "step": 2534 + }, + { + "epoch": 0.6730386300278773, + "grad_norm": 0.38010415065804315, + "learning_rate": 4.856844481584801e-06, + "loss": 0.6291, + "step": 2535 + }, + { + "epoch": 0.6733041285012611, + "grad_norm": 0.38542806440509086, + "learning_rate": 4.856728012297329e-06, + "loss": 0.6237, + "step": 2536 + }, + { + "epoch": 0.6735696269746448, + "grad_norm": 0.3873558433018967, + "learning_rate": 4.85661149704789e-06, + "loss": 0.6305, + "step": 2537 + }, + { + "epoch": 0.6738351254480287, + "grad_norm": 0.47909573788052434, + "learning_rate": 4.856494935838757e-06, + "loss": 0.6323, + "step": 2538 + }, + { + "epoch": 0.6741006239214125, + "grad_norm": 0.39494063248857897, + "learning_rate": 4.856378328672202e-06, + "loss": 0.579, + "step": 2539 + }, + { + "epoch": 0.6743661223947962, + "grad_norm": 0.38522195779640217, + "learning_rate": 4.856261675550501e-06, + "loss": 0.6539, + "step": 2540 + }, + { + "epoch": 0.67463162086818, + "grad_norm": 0.398179815084743, + "learning_rate": 4.856144976475928e-06, + "loss": 0.6416, + "step": 2541 + }, + { + "epoch": 0.6748971193415638, + "grad_norm": 0.3836072796857305, + "learning_rate": 4.85602823145076e-06, + "loss": 0.6114, + "step": 2542 + }, + { + "epoch": 0.6751626178149476, + "grad_norm": 0.385671427520204, + "learning_rate": 4.855911440477272e-06, + "loss": 0.6208, + "step": 2543 + }, + { + "epoch": 0.6754281162883313, + "grad_norm": 0.3879075507199383, + "learning_rate": 4.8557946035577426e-06, + "loss": 0.6268, + "step": 2544 + }, + { + "epoch": 0.6756936147617151, + "grad_norm": 0.3852121785704072, + "learning_rate": 4.8556777206944506e-06, + "loss": 0.624, + "step": 2545 + }, + { + "epoch": 0.6759591132350989, + "grad_norm": 0.3740846958512857, + "learning_rate": 4.8555607918896755e-06, + "loss": 0.6048, + "step": 2546 + }, + { + "epoch": 0.6762246117084827, + "grad_norm": 0.37518597880135274, + "learning_rate": 4.8554438171456974e-06, + "loss": 0.5879, + "step": 2547 + }, + { + "epoch": 0.6764901101818664, + "grad_norm": 0.38795221060220025, + "learning_rate": 4.855326796464798e-06, + "loss": 0.6211, + "step": 2548 + }, + { + "epoch": 0.6767556086552502, + "grad_norm": 0.39484116607055514, + "learning_rate": 4.855209729849259e-06, + "loss": 0.6239, + "step": 2549 + }, + { + "epoch": 0.6770211071286341, + "grad_norm": 0.3926309328666225, + "learning_rate": 4.8550926173013634e-06, + "loss": 0.6356, + "step": 2550 + }, + { + "epoch": 0.6772866056020178, + "grad_norm": 0.3880984084489336, + "learning_rate": 4.854975458823396e-06, + "loss": 0.6169, + "step": 2551 + }, + { + "epoch": 0.6775521040754016, + "grad_norm": 0.3833201663137394, + "learning_rate": 4.854858254417642e-06, + "loss": 0.6241, + "step": 2552 + }, + { + "epoch": 0.6778176025487853, + "grad_norm": 0.38820548935411137, + "learning_rate": 4.854741004086384e-06, + "loss": 0.62, + "step": 2553 + }, + { + "epoch": 0.6780831010221692, + "grad_norm": 0.38958891269785556, + "learning_rate": 4.854623707831912e-06, + "loss": 0.6407, + "step": 2554 + }, + { + "epoch": 0.6783485994955529, + "grad_norm": 0.3872380183920155, + "learning_rate": 4.854506365656514e-06, + "loss": 0.6505, + "step": 2555 + }, + { + "epoch": 0.6786140979689367, + "grad_norm": 0.3976817989003109, + "learning_rate": 4.854388977562476e-06, + "loss": 0.5915, + "step": 2556 + }, + { + "epoch": 0.6788795964423204, + "grad_norm": 0.38741463647113006, + "learning_rate": 4.8542715435520885e-06, + "loss": 0.6612, + "step": 2557 + }, + { + "epoch": 0.6791450949157042, + "grad_norm": 0.38786176003940787, + "learning_rate": 4.854154063627642e-06, + "loss": 0.6679, + "step": 2558 + }, + { + "epoch": 0.679410593389088, + "grad_norm": 0.3831634590879999, + "learning_rate": 4.854036537791426e-06, + "loss": 0.5804, + "step": 2559 + }, + { + "epoch": 0.6796760918624718, + "grad_norm": 0.4029791035561383, + "learning_rate": 4.853918966045735e-06, + "loss": 0.6045, + "step": 2560 + }, + { + "epoch": 0.6799415903358556, + "grad_norm": 0.3939396834785739, + "learning_rate": 4.85380134839286e-06, + "loss": 0.6094, + "step": 2561 + }, + { + "epoch": 0.6802070888092393, + "grad_norm": 0.3883102941178923, + "learning_rate": 4.853683684835097e-06, + "loss": 0.5842, + "step": 2562 + }, + { + "epoch": 0.6804725872826232, + "grad_norm": 0.39129053138260844, + "learning_rate": 4.853565975374738e-06, + "loss": 0.6411, + "step": 2563 + }, + { + "epoch": 0.6807380857560069, + "grad_norm": 0.374001069584491, + "learning_rate": 4.85344822001408e-06, + "loss": 0.6024, + "step": 2564 + }, + { + "epoch": 0.6810035842293907, + "grad_norm": 0.37645585029846396, + "learning_rate": 4.85333041875542e-06, + "loss": 0.619, + "step": 2565 + }, + { + "epoch": 0.6812690827027744, + "grad_norm": 0.3778577425661308, + "learning_rate": 4.853212571601055e-06, + "loss": 0.6425, + "step": 2566 + }, + { + "epoch": 0.6815345811761583, + "grad_norm": 0.391038252382859, + "learning_rate": 4.8530946785532825e-06, + "loss": 0.6227, + "step": 2567 + }, + { + "epoch": 0.681800079649542, + "grad_norm": 0.3763444120502296, + "learning_rate": 4.852976739614403e-06, + "loss": 0.6154, + "step": 2568 + }, + { + "epoch": 0.6820655781229258, + "grad_norm": 0.3894312190850422, + "learning_rate": 4.852858754786716e-06, + "loss": 0.616, + "step": 2569 + }, + { + "epoch": 0.6823310765963095, + "grad_norm": 0.37619742731414907, + "learning_rate": 4.852740724072523e-06, + "loss": 0.6362, + "step": 2570 + }, + { + "epoch": 0.6825965750696934, + "grad_norm": 0.3811860358247419, + "learning_rate": 4.852622647474125e-06, + "loss": 0.6251, + "step": 2571 + }, + { + "epoch": 0.6828620735430772, + "grad_norm": 0.3827484116118257, + "learning_rate": 4.852504524993824e-06, + "loss": 0.6624, + "step": 2572 + }, + { + "epoch": 0.6831275720164609, + "grad_norm": 0.3909718121010186, + "learning_rate": 4.852386356633926e-06, + "loss": 0.5929, + "step": 2573 + }, + { + "epoch": 0.6833930704898447, + "grad_norm": 0.3914324040933539, + "learning_rate": 4.852268142396734e-06, + "loss": 0.6272, + "step": 2574 + }, + { + "epoch": 0.6836585689632284, + "grad_norm": 0.3852289015281856, + "learning_rate": 4.852149882284554e-06, + "loss": 0.6328, + "step": 2575 + }, + { + "epoch": 0.6839240674366123, + "grad_norm": 0.38668677917707833, + "learning_rate": 4.852031576299693e-06, + "loss": 0.6178, + "step": 2576 + }, + { + "epoch": 0.684189565909996, + "grad_norm": 0.3954760639322168, + "learning_rate": 4.851913224444456e-06, + "loss": 0.658, + "step": 2577 + }, + { + "epoch": 0.6844550643833798, + "grad_norm": 0.4294103355890287, + "learning_rate": 4.851794826721153e-06, + "loss": 0.6365, + "step": 2578 + }, + { + "epoch": 0.6847205628567635, + "grad_norm": 0.3963387839243065, + "learning_rate": 4.851676383132093e-06, + "loss": 0.6214, + "step": 2579 + }, + { + "epoch": 0.6849860613301474, + "grad_norm": 0.4015125487870929, + "learning_rate": 4.851557893679586e-06, + "loss": 0.6058, + "step": 2580 + }, + { + "epoch": 0.6852515598035311, + "grad_norm": 0.45015400148145274, + "learning_rate": 4.8514393583659425e-06, + "loss": 0.5626, + "step": 2581 + }, + { + "epoch": 0.6855170582769149, + "grad_norm": 0.3991252721447173, + "learning_rate": 4.851320777193474e-06, + "loss": 0.592, + "step": 2582 + }, + { + "epoch": 0.6857825567502986, + "grad_norm": 0.3933441331229182, + "learning_rate": 4.851202150164493e-06, + "loss": 0.6199, + "step": 2583 + }, + { + "epoch": 0.6860480552236825, + "grad_norm": 0.44163082552696814, + "learning_rate": 4.851083477281313e-06, + "loss": 0.6181, + "step": 2584 + }, + { + "epoch": 0.6863135536970663, + "grad_norm": 0.41899120795505423, + "learning_rate": 4.85096475854625e-06, + "loss": 0.592, + "step": 2585 + }, + { + "epoch": 0.68657905217045, + "grad_norm": 0.4186786494049867, + "learning_rate": 4.850845993961617e-06, + "loss": 0.6148, + "step": 2586 + }, + { + "epoch": 0.6868445506438338, + "grad_norm": 0.5655822471740156, + "learning_rate": 4.850727183529732e-06, + "loss": 0.5874, + "step": 2587 + }, + { + "epoch": 0.6871100491172176, + "grad_norm": 0.3981542733599476, + "learning_rate": 4.8506083272529105e-06, + "loss": 0.6319, + "step": 2588 + }, + { + "epoch": 0.6873755475906014, + "grad_norm": 0.4261072246034151, + "learning_rate": 4.8504894251334725e-06, + "loss": 0.582, + "step": 2589 + }, + { + "epoch": 0.6876410460639851, + "grad_norm": 0.4008182960922975, + "learning_rate": 4.850370477173735e-06, + "loss": 0.6304, + "step": 2590 + }, + { + "epoch": 0.6879065445373689, + "grad_norm": 0.3826105897046771, + "learning_rate": 4.8502514833760185e-06, + "loss": 0.638, + "step": 2591 + }, + { + "epoch": 0.6881720430107527, + "grad_norm": 0.3908007000737641, + "learning_rate": 4.850132443742643e-06, + "loss": 0.6277, + "step": 2592 + }, + { + "epoch": 0.6884375414841365, + "grad_norm": 0.38731619213074264, + "learning_rate": 4.850013358275931e-06, + "loss": 0.5537, + "step": 2593 + }, + { + "epoch": 0.6887030399575202, + "grad_norm": 0.41059076033361336, + "learning_rate": 4.849894226978205e-06, + "loss": 0.6136, + "step": 2594 + }, + { + "epoch": 0.688968538430904, + "grad_norm": 0.3923793363302241, + "learning_rate": 4.849775049851788e-06, + "loss": 0.6258, + "step": 2595 + }, + { + "epoch": 0.6892340369042878, + "grad_norm": 0.4143458298244493, + "learning_rate": 4.849655826899005e-06, + "loss": 0.6242, + "step": 2596 + }, + { + "epoch": 0.6894995353776716, + "grad_norm": 0.38346814001939306, + "learning_rate": 4.84953655812218e-06, + "loss": 0.6289, + "step": 2597 + }, + { + "epoch": 0.6897650338510554, + "grad_norm": 0.3977828338130154, + "learning_rate": 4.849417243523639e-06, + "loss": 0.61, + "step": 2598 + }, + { + "epoch": 0.6900305323244391, + "grad_norm": 0.38353593247447937, + "learning_rate": 4.84929788310571e-06, + "loss": 0.6037, + "step": 2599 + }, + { + "epoch": 0.6902960307978229, + "grad_norm": 0.40118915263536675, + "learning_rate": 4.84917847687072e-06, + "loss": 0.6471, + "step": 2600 + }, + { + "epoch": 0.6905615292712067, + "grad_norm": 0.4059740296864397, + "learning_rate": 4.849059024820998e-06, + "loss": 0.6632, + "step": 2601 + }, + { + "epoch": 0.6908270277445905, + "grad_norm": 0.41762988105274723, + "learning_rate": 4.8489395269588725e-06, + "loss": 0.6171, + "step": 2602 + }, + { + "epoch": 0.6910925262179742, + "grad_norm": 0.3899823197542196, + "learning_rate": 4.848819983286677e-06, + "loss": 0.6133, + "step": 2603 + }, + { + "epoch": 0.691358024691358, + "grad_norm": 0.3848599299908165, + "learning_rate": 4.848700393806739e-06, + "loss": 0.6239, + "step": 2604 + }, + { + "epoch": 0.6916235231647418, + "grad_norm": 0.38733771313066306, + "learning_rate": 4.848580758521393e-06, + "loss": 0.6302, + "step": 2605 + }, + { + "epoch": 0.6918890216381256, + "grad_norm": 0.39635704743554306, + "learning_rate": 4.848461077432973e-06, + "loss": 0.6182, + "step": 2606 + }, + { + "epoch": 0.6921545201115094, + "grad_norm": 0.39039677922944666, + "learning_rate": 4.848341350543811e-06, + "loss": 0.63, + "step": 2607 + }, + { + "epoch": 0.6924200185848931, + "grad_norm": 0.3852633028118449, + "learning_rate": 4.848221577856244e-06, + "loss": 0.6088, + "step": 2608 + }, + { + "epoch": 0.692685517058277, + "grad_norm": 0.39322254504668075, + "learning_rate": 4.848101759372605e-06, + "loss": 0.609, + "step": 2609 + }, + { + "epoch": 0.6929510155316607, + "grad_norm": 0.3971663647336428, + "learning_rate": 4.847981895095234e-06, + "loss": 0.6183, + "step": 2610 + }, + { + "epoch": 0.6932165140050445, + "grad_norm": 0.3837183314663157, + "learning_rate": 4.8478619850264675e-06, + "loss": 0.6198, + "step": 2611 + }, + { + "epoch": 0.6934820124784282, + "grad_norm": 0.392346323756682, + "learning_rate": 4.847742029168643e-06, + "loss": 0.6466, + "step": 2612 + }, + { + "epoch": 0.693747510951812, + "grad_norm": 0.3698956181295069, + "learning_rate": 4.8476220275241e-06, + "loss": 0.6219, + "step": 2613 + }, + { + "epoch": 0.6940130094251958, + "grad_norm": 0.3773324830349831, + "learning_rate": 4.847501980095181e-06, + "loss": 0.5703, + "step": 2614 + }, + { + "epoch": 0.6942785078985796, + "grad_norm": 0.38836175044527804, + "learning_rate": 4.8473818868842246e-06, + "loss": 0.6545, + "step": 2615 + }, + { + "epoch": 0.6945440063719633, + "grad_norm": 0.4007302920292224, + "learning_rate": 4.8472617478935744e-06, + "loss": 0.6249, + "step": 2616 + }, + { + "epoch": 0.6948095048453471, + "grad_norm": 0.39216784343425254, + "learning_rate": 4.8471415631255725e-06, + "loss": 0.619, + "step": 2617 + }, + { + "epoch": 0.695075003318731, + "grad_norm": 0.38146862062044684, + "learning_rate": 4.847021332582564e-06, + "loss": 0.643, + "step": 2618 + }, + { + "epoch": 0.6953405017921147, + "grad_norm": 0.3844625924455284, + "learning_rate": 4.846901056266893e-06, + "loss": 0.6569, + "step": 2619 + }, + { + "epoch": 0.6956060002654985, + "grad_norm": 0.3870792179509595, + "learning_rate": 4.846780734180905e-06, + "loss": 0.6362, + "step": 2620 + }, + { + "epoch": 0.6958714987388822, + "grad_norm": 0.3878343142627452, + "learning_rate": 4.846660366326947e-06, + "loss": 0.6493, + "step": 2621 + }, + { + "epoch": 0.6961369972122661, + "grad_norm": 0.38824424518080686, + "learning_rate": 4.846539952707366e-06, + "loss": 0.6372, + "step": 2622 + }, + { + "epoch": 0.6964024956856498, + "grad_norm": 0.381097332062954, + "learning_rate": 4.84641949332451e-06, + "loss": 0.6238, + "step": 2623 + }, + { + "epoch": 0.6966679941590336, + "grad_norm": 0.39249737648919003, + "learning_rate": 4.8462989881807295e-06, + "loss": 0.6033, + "step": 2624 + }, + { + "epoch": 0.6969334926324173, + "grad_norm": 0.38492329963418653, + "learning_rate": 4.846178437278375e-06, + "loss": 0.6448, + "step": 2625 + }, + { + "epoch": 0.6971989911058012, + "grad_norm": 0.3803116438683266, + "learning_rate": 4.846057840619795e-06, + "loss": 0.6274, + "step": 2626 + }, + { + "epoch": 0.6974644895791849, + "grad_norm": 0.3875639142413487, + "learning_rate": 4.845937198207344e-06, + "loss": 0.6433, + "step": 2627 + }, + { + "epoch": 0.6977299880525687, + "grad_norm": 0.39390029475279664, + "learning_rate": 4.8458165100433725e-06, + "loss": 0.6401, + "step": 2628 + }, + { + "epoch": 0.6979954865259524, + "grad_norm": 0.3774646506851741, + "learning_rate": 4.845695776130236e-06, + "loss": 0.597, + "step": 2629 + }, + { + "epoch": 0.6982609849993362, + "grad_norm": 0.39145801555943405, + "learning_rate": 4.845574996470289e-06, + "loss": 0.65, + "step": 2630 + }, + { + "epoch": 0.6985264834727201, + "grad_norm": 0.37923453762103676, + "learning_rate": 4.845454171065887e-06, + "loss": 0.6119, + "step": 2631 + }, + { + "epoch": 0.6987919819461038, + "grad_norm": 0.3912627709400642, + "learning_rate": 4.8453332999193854e-06, + "loss": 0.6154, + "step": 2632 + }, + { + "epoch": 0.6990574804194876, + "grad_norm": 0.39647054375916135, + "learning_rate": 4.845212383033142e-06, + "loss": 0.5926, + "step": 2633 + }, + { + "epoch": 0.6993229788928713, + "grad_norm": 0.38543941146569916, + "learning_rate": 4.845091420409515e-06, + "loss": 0.5537, + "step": 2634 + }, + { + "epoch": 0.6995884773662552, + "grad_norm": 0.37478166965820847, + "learning_rate": 4.844970412050864e-06, + "loss": 0.608, + "step": 2635 + }, + { + "epoch": 0.6998539758396389, + "grad_norm": 0.3936287744363812, + "learning_rate": 4.8448493579595485e-06, + "loss": 0.616, + "step": 2636 + }, + { + "epoch": 0.7001194743130227, + "grad_norm": 0.3915730473874248, + "learning_rate": 4.844728258137929e-06, + "loss": 0.6132, + "step": 2637 + }, + { + "epoch": 0.7003849727864064, + "grad_norm": 0.38685701677178286, + "learning_rate": 4.8446071125883674e-06, + "loss": 0.6361, + "step": 2638 + }, + { + "epoch": 0.7006504712597903, + "grad_norm": 0.3790822534511941, + "learning_rate": 4.844485921313227e-06, + "loss": 0.6417, + "step": 2639 + }, + { + "epoch": 0.700915969733174, + "grad_norm": 0.39605015319172515, + "learning_rate": 4.844364684314871e-06, + "loss": 0.6304, + "step": 2640 + }, + { + "epoch": 0.7011814682065578, + "grad_norm": 0.3925159358194844, + "learning_rate": 4.844243401595663e-06, + "loss": 0.6234, + "step": 2641 + }, + { + "epoch": 0.7014469666799416, + "grad_norm": 0.3914855293098387, + "learning_rate": 4.84412207315797e-06, + "loss": 0.622, + "step": 2642 + }, + { + "epoch": 0.7017124651533254, + "grad_norm": 0.38446146141623927, + "learning_rate": 4.844000699004156e-06, + "loss": 0.6129, + "step": 2643 + }, + { + "epoch": 0.7019779636267092, + "grad_norm": 0.3955420711556819, + "learning_rate": 4.843879279136589e-06, + "loss": 0.6085, + "step": 2644 + }, + { + "epoch": 0.7022434621000929, + "grad_norm": 0.3722152369503793, + "learning_rate": 4.843757813557639e-06, + "loss": 0.6188, + "step": 2645 + }, + { + "epoch": 0.7025089605734767, + "grad_norm": 0.39288394958209916, + "learning_rate": 4.843636302269672e-06, + "loss": 0.6134, + "step": 2646 + }, + { + "epoch": 0.7027744590468605, + "grad_norm": 0.3780279065230019, + "learning_rate": 4.843514745275059e-06, + "loss": 0.6159, + "step": 2647 + }, + { + "epoch": 0.7030399575202443, + "grad_norm": 0.40262741603223956, + "learning_rate": 4.84339314257617e-06, + "loss": 0.6396, + "step": 2648 + }, + { + "epoch": 0.703305455993628, + "grad_norm": 0.3997449926090783, + "learning_rate": 4.843271494175378e-06, + "loss": 0.6559, + "step": 2649 + }, + { + "epoch": 0.7035709544670118, + "grad_norm": 0.4129818372307495, + "learning_rate": 4.843149800075054e-06, + "loss": 0.6005, + "step": 2650 + }, + { + "epoch": 0.7038364529403955, + "grad_norm": 0.3859111695168442, + "learning_rate": 4.8430280602775725e-06, + "loss": 0.5729, + "step": 2651 + }, + { + "epoch": 0.7041019514137794, + "grad_norm": 0.3785485467842736, + "learning_rate": 4.842906274785306e-06, + "loss": 0.5954, + "step": 2652 + }, + { + "epoch": 0.7043674498871632, + "grad_norm": 0.38978516180672673, + "learning_rate": 4.842784443600632e-06, + "loss": 0.6203, + "step": 2653 + }, + { + "epoch": 0.7046329483605469, + "grad_norm": 0.3996638681439292, + "learning_rate": 4.842662566725924e-06, + "loss": 0.644, + "step": 2654 + }, + { + "epoch": 0.7048984468339307, + "grad_norm": 0.3821948974207942, + "learning_rate": 4.8425406441635615e-06, + "loss": 0.5864, + "step": 2655 + }, + { + "epoch": 0.7051639453073145, + "grad_norm": 0.37975802340295783, + "learning_rate": 4.84241867591592e-06, + "loss": 0.6202, + "step": 2656 + }, + { + "epoch": 0.7054294437806983, + "grad_norm": 0.3865295040695601, + "learning_rate": 4.8422966619853795e-06, + "loss": 0.6403, + "step": 2657 + }, + { + "epoch": 0.705694942254082, + "grad_norm": 0.38345848440660857, + "learning_rate": 4.842174602374319e-06, + "loss": 0.6259, + "step": 2658 + }, + { + "epoch": 0.7059604407274658, + "grad_norm": 0.38894517327788974, + "learning_rate": 4.842052497085119e-06, + "loss": 0.6221, + "step": 2659 + }, + { + "epoch": 0.7062259392008496, + "grad_norm": 0.4014850168709462, + "learning_rate": 4.841930346120161e-06, + "loss": 0.5752, + "step": 2660 + }, + { + "epoch": 0.7064914376742334, + "grad_norm": 0.3779393000447171, + "learning_rate": 4.841808149481827e-06, + "loss": 0.6369, + "step": 2661 + }, + { + "epoch": 0.7067569361476171, + "grad_norm": 0.3980667358981174, + "learning_rate": 4.841685907172501e-06, + "loss": 0.6006, + "step": 2662 + }, + { + "epoch": 0.7070224346210009, + "grad_norm": 0.3965787010329716, + "learning_rate": 4.841563619194565e-06, + "loss": 0.5865, + "step": 2663 + }, + { + "epoch": 0.7072879330943848, + "grad_norm": 0.4149449600830029, + "learning_rate": 4.841441285550407e-06, + "loss": 0.6416, + "step": 2664 + }, + { + "epoch": 0.7075534315677685, + "grad_norm": 0.3986300544804875, + "learning_rate": 4.84131890624241e-06, + "loss": 0.6033, + "step": 2665 + }, + { + "epoch": 0.7078189300411523, + "grad_norm": 0.40950327115232465, + "learning_rate": 4.841196481272962e-06, + "loss": 0.6375, + "step": 2666 + }, + { + "epoch": 0.708084428514536, + "grad_norm": 0.39344157737685936, + "learning_rate": 4.841074010644451e-06, + "loss": 0.6013, + "step": 2667 + }, + { + "epoch": 0.7083499269879198, + "grad_norm": 0.3885292628877199, + "learning_rate": 4.840951494359264e-06, + "loss": 0.6394, + "step": 2668 + }, + { + "epoch": 0.7086154254613036, + "grad_norm": 0.39807870604794077, + "learning_rate": 4.840828932419792e-06, + "loss": 0.6394, + "step": 2669 + }, + { + "epoch": 0.7088809239346874, + "grad_norm": 0.3940794565330124, + "learning_rate": 4.840706324828423e-06, + "loss": 0.6408, + "step": 2670 + }, + { + "epoch": 0.7091464224080711, + "grad_norm": 0.389525565444789, + "learning_rate": 4.840583671587551e-06, + "loss": 0.6206, + "step": 2671 + }, + { + "epoch": 0.7094119208814549, + "grad_norm": 0.39086680902995535, + "learning_rate": 4.840460972699567e-06, + "loss": 0.63, + "step": 2672 + }, + { + "epoch": 0.7096774193548387, + "grad_norm": 0.38162221825724835, + "learning_rate": 4.840338228166862e-06, + "loss": 0.6207, + "step": 2673 + }, + { + "epoch": 0.7099429178282225, + "grad_norm": 0.38631093654671855, + "learning_rate": 4.840215437991833e-06, + "loss": 0.5902, + "step": 2674 + }, + { + "epoch": 0.7102084163016062, + "grad_norm": 0.40197529788488806, + "learning_rate": 4.840092602176872e-06, + "loss": 0.657, + "step": 2675 + }, + { + "epoch": 0.71047391477499, + "grad_norm": 0.3910634987928013, + "learning_rate": 4.839969720724377e-06, + "loss": 0.6357, + "step": 2676 + }, + { + "epoch": 0.7107394132483739, + "grad_norm": 0.3926464229659296, + "learning_rate": 4.839846793636742e-06, + "loss": 0.6051, + "step": 2677 + }, + { + "epoch": 0.7110049117217576, + "grad_norm": 0.3786085783996354, + "learning_rate": 4.839723820916366e-06, + "loss": 0.609, + "step": 2678 + }, + { + "epoch": 0.7112704101951414, + "grad_norm": 0.37515048093952635, + "learning_rate": 4.839600802565647e-06, + "loss": 0.5779, + "step": 2679 + }, + { + "epoch": 0.7115359086685251, + "grad_norm": 0.4016594721468733, + "learning_rate": 4.839477738586984e-06, + "loss": 0.5829, + "step": 2680 + }, + { + "epoch": 0.711801407141909, + "grad_norm": 0.38247376456700155, + "learning_rate": 4.839354628982777e-06, + "loss": 0.5868, + "step": 2681 + }, + { + "epoch": 0.7120669056152927, + "grad_norm": 0.3904012100676597, + "learning_rate": 4.839231473755427e-06, + "loss": 0.6004, + "step": 2682 + }, + { + "epoch": 0.7123324040886765, + "grad_norm": 0.3939250292403363, + "learning_rate": 4.839108272907336e-06, + "loss": 0.5925, + "step": 2683 + }, + { + "epoch": 0.7125979025620602, + "grad_norm": 0.37986215662453254, + "learning_rate": 4.8389850264409055e-06, + "loss": 0.6062, + "step": 2684 + }, + { + "epoch": 0.7128634010354441, + "grad_norm": 0.39323207411072897, + "learning_rate": 4.838861734358541e-06, + "loss": 0.6057, + "step": 2685 + }, + { + "epoch": 0.7131288995088279, + "grad_norm": 0.38716468749046573, + "learning_rate": 4.8387383966626465e-06, + "loss": 0.5998, + "step": 2686 + }, + { + "epoch": 0.7133943979822116, + "grad_norm": 0.3867185961010731, + "learning_rate": 4.838615013355627e-06, + "loss": 0.64, + "step": 2687 + }, + { + "epoch": 0.7136598964555954, + "grad_norm": 0.4484737722836541, + "learning_rate": 4.838491584439888e-06, + "loss": 0.5948, + "step": 2688 + }, + { + "epoch": 0.7139253949289791, + "grad_norm": 0.3810109151145234, + "learning_rate": 4.8383681099178385e-06, + "loss": 0.6127, + "step": 2689 + }, + { + "epoch": 0.714190893402363, + "grad_norm": 0.3888128776298206, + "learning_rate": 4.838244589791885e-06, + "loss": 0.6102, + "step": 2690 + }, + { + "epoch": 0.7144563918757467, + "grad_norm": 0.3909530990167454, + "learning_rate": 4.8381210240644375e-06, + "loss": 0.63, + "step": 2691 + }, + { + "epoch": 0.7147218903491305, + "grad_norm": 0.38630949045402846, + "learning_rate": 4.837997412737905e-06, + "loss": 0.611, + "step": 2692 + }, + { + "epoch": 0.7149873888225142, + "grad_norm": 0.39549871748641846, + "learning_rate": 4.837873755814698e-06, + "loss": 0.6378, + "step": 2693 + }, + { + "epoch": 0.7152528872958981, + "grad_norm": 0.37527015893172583, + "learning_rate": 4.837750053297229e-06, + "loss": 0.6327, + "step": 2694 + }, + { + "epoch": 0.7155183857692818, + "grad_norm": 0.4213494559443399, + "learning_rate": 4.83762630518791e-06, + "loss": 0.5699, + "step": 2695 + }, + { + "epoch": 0.7157838842426656, + "grad_norm": 0.38760136451103044, + "learning_rate": 4.837502511489155e-06, + "loss": 0.5828, + "step": 2696 + }, + { + "epoch": 0.7160493827160493, + "grad_norm": 0.4034611300192151, + "learning_rate": 4.837378672203378e-06, + "loss": 0.597, + "step": 2697 + }, + { + "epoch": 0.7163148811894332, + "grad_norm": 0.4034669076033467, + "learning_rate": 4.837254787332993e-06, + "loss": 0.6251, + "step": 2698 + }, + { + "epoch": 0.716580379662817, + "grad_norm": 0.4115041210496319, + "learning_rate": 4.837130856880418e-06, + "loss": 0.6053, + "step": 2699 + }, + { + "epoch": 0.7168458781362007, + "grad_norm": 0.4061381852681957, + "learning_rate": 4.837006880848069e-06, + "loss": 0.6161, + "step": 2700 + }, + { + "epoch": 0.7171113766095845, + "grad_norm": 0.3922997461013148, + "learning_rate": 4.8368828592383635e-06, + "loss": 0.6068, + "step": 2701 + }, + { + "epoch": 0.7173768750829683, + "grad_norm": 0.3817760610409912, + "learning_rate": 4.836758792053721e-06, + "loss": 0.596, + "step": 2702 + }, + { + "epoch": 0.7176423735563521, + "grad_norm": 0.3945996356900288, + "learning_rate": 4.83663467929656e-06, + "loss": 0.6483, + "step": 2703 + }, + { + "epoch": 0.7179078720297358, + "grad_norm": 0.39435325139445204, + "learning_rate": 4.8365105209693015e-06, + "loss": 0.6582, + "step": 2704 + }, + { + "epoch": 0.7181733705031196, + "grad_norm": 0.3665830778055999, + "learning_rate": 4.836386317074368e-06, + "loss": 0.6147, + "step": 2705 + }, + { + "epoch": 0.7184388689765033, + "grad_norm": 0.39212906815583454, + "learning_rate": 4.8362620676141805e-06, + "loss": 0.622, + "step": 2706 + }, + { + "epoch": 0.7187043674498872, + "grad_norm": 0.38888920059635756, + "learning_rate": 4.8361377725911625e-06, + "loss": 0.6221, + "step": 2707 + }, + { + "epoch": 0.718969865923271, + "grad_norm": 0.3838288585316257, + "learning_rate": 4.836013432007738e-06, + "loss": 0.609, + "step": 2708 + }, + { + "epoch": 0.7192353643966547, + "grad_norm": 0.3766154712694823, + "learning_rate": 4.835889045866332e-06, + "loss": 0.6175, + "step": 2709 + }, + { + "epoch": 0.7195008628700384, + "grad_norm": 0.3935365236909523, + "learning_rate": 4.83576461416937e-06, + "loss": 0.6386, + "step": 2710 + }, + { + "epoch": 0.7197663613434223, + "grad_norm": 0.3746494541735743, + "learning_rate": 4.83564013691928e-06, + "loss": 0.6039, + "step": 2711 + }, + { + "epoch": 0.7200318598168061, + "grad_norm": 0.3958855733104673, + "learning_rate": 4.835515614118488e-06, + "loss": 0.6206, + "step": 2712 + }, + { + "epoch": 0.7202973582901898, + "grad_norm": 0.3990273793543752, + "learning_rate": 4.8353910457694235e-06, + "loss": 0.5893, + "step": 2713 + }, + { + "epoch": 0.7205628567635736, + "grad_norm": 0.3825074603959132, + "learning_rate": 4.835266431874516e-06, + "loss": 0.632, + "step": 2714 + }, + { + "epoch": 0.7208283552369574, + "grad_norm": 0.38485740587732975, + "learning_rate": 4.835141772436194e-06, + "loss": 0.6317, + "step": 2715 + }, + { + "epoch": 0.7210938537103412, + "grad_norm": 0.4067854496688009, + "learning_rate": 4.835017067456892e-06, + "loss": 0.6057, + "step": 2716 + }, + { + "epoch": 0.7213593521837249, + "grad_norm": 0.38906319773523507, + "learning_rate": 4.834892316939038e-06, + "loss": 0.6125, + "step": 2717 + }, + { + "epoch": 0.7216248506571087, + "grad_norm": 0.40826904243452194, + "learning_rate": 4.834767520885069e-06, + "loss": 0.6612, + "step": 2718 + }, + { + "epoch": 0.7218903491304925, + "grad_norm": 0.3927443248306872, + "learning_rate": 4.834642679297415e-06, + "loss": 0.6076, + "step": 2719 + }, + { + "epoch": 0.7221558476038763, + "grad_norm": 0.47996324107007193, + "learning_rate": 4.834517792178513e-06, + "loss": 0.5771, + "step": 2720 + }, + { + "epoch": 0.72242134607726, + "grad_norm": 0.37420339276518444, + "learning_rate": 4.8343928595307986e-06, + "loss": 0.6255, + "step": 2721 + }, + { + "epoch": 0.7226868445506438, + "grad_norm": 0.4346393954653237, + "learning_rate": 4.834267881356708e-06, + "loss": 0.6164, + "step": 2722 + }, + { + "epoch": 0.7229523430240277, + "grad_norm": 0.598646683483842, + "learning_rate": 4.834142857658678e-06, + "loss": 0.6173, + "step": 2723 + }, + { + "epoch": 0.7232178414974114, + "grad_norm": 0.379375202482494, + "learning_rate": 4.834017788439148e-06, + "loss": 0.6317, + "step": 2724 + }, + { + "epoch": 0.7234833399707952, + "grad_norm": 0.4396162719192362, + "learning_rate": 4.833892673700556e-06, + "loss": 0.6448, + "step": 2725 + }, + { + "epoch": 0.7237488384441789, + "grad_norm": 0.46814974281808003, + "learning_rate": 4.833767513445342e-06, + "loss": 0.5925, + "step": 2726 + }, + { + "epoch": 0.7240143369175627, + "grad_norm": 0.6267303996799323, + "learning_rate": 4.833642307675948e-06, + "loss": 0.598, + "step": 2727 + }, + { + "epoch": 0.7242798353909465, + "grad_norm": 0.41578917951553057, + "learning_rate": 4.833517056394816e-06, + "loss": 0.6344, + "step": 2728 + }, + { + "epoch": 0.7245453338643303, + "grad_norm": 0.5404076902512648, + "learning_rate": 4.833391759604386e-06, + "loss": 0.6511, + "step": 2729 + }, + { + "epoch": 0.724810832337714, + "grad_norm": 0.5384994277964352, + "learning_rate": 4.833266417307105e-06, + "loss": 0.6271, + "step": 2730 + }, + { + "epoch": 0.7250763308110978, + "grad_norm": 0.4374745481304502, + "learning_rate": 4.833141029505417e-06, + "loss": 0.5907, + "step": 2731 + }, + { + "epoch": 0.7253418292844817, + "grad_norm": 0.4138675087051396, + "learning_rate": 4.8330155962017645e-06, + "loss": 0.6143, + "step": 2732 + }, + { + "epoch": 0.7256073277578654, + "grad_norm": 0.4640266889654643, + "learning_rate": 4.832890117398596e-06, + "loss": 0.597, + "step": 2733 + }, + { + "epoch": 0.7258728262312492, + "grad_norm": 0.4441762398041202, + "learning_rate": 4.8327645930983595e-06, + "loss": 0.651, + "step": 2734 + }, + { + "epoch": 0.7261383247046329, + "grad_norm": 0.47555521385135624, + "learning_rate": 4.832639023303501e-06, + "loss": 0.6317, + "step": 2735 + }, + { + "epoch": 0.7264038231780168, + "grad_norm": 0.4006585151692151, + "learning_rate": 4.832513408016471e-06, + "loss": 0.5759, + "step": 2736 + }, + { + "epoch": 0.7266693216514005, + "grad_norm": 0.4202307412152153, + "learning_rate": 4.832387747239717e-06, + "loss": 0.5972, + "step": 2737 + }, + { + "epoch": 0.7269348201247843, + "grad_norm": 0.4216843139057117, + "learning_rate": 4.832262040975692e-06, + "loss": 0.6194, + "step": 2738 + }, + { + "epoch": 0.727200318598168, + "grad_norm": 0.5441528042431764, + "learning_rate": 4.832136289226847e-06, + "loss": 0.595, + "step": 2739 + }, + { + "epoch": 0.7274658170715519, + "grad_norm": 0.3886247515099562, + "learning_rate": 4.832010491995634e-06, + "loss": 0.6416, + "step": 2740 + }, + { + "epoch": 0.7277313155449356, + "grad_norm": 0.39390311568534647, + "learning_rate": 4.831884649284507e-06, + "loss": 0.6182, + "step": 2741 + }, + { + "epoch": 0.7279968140183194, + "grad_norm": 0.38143248397905566, + "learning_rate": 4.83175876109592e-06, + "loss": 0.5992, + "step": 2742 + }, + { + "epoch": 0.7282623124917031, + "grad_norm": 0.3990434085906802, + "learning_rate": 4.8316328274323275e-06, + "loss": 0.6382, + "step": 2743 + }, + { + "epoch": 0.7285278109650869, + "grad_norm": 0.4152186638628756, + "learning_rate": 4.831506848296187e-06, + "loss": 0.625, + "step": 2744 + }, + { + "epoch": 0.7287933094384708, + "grad_norm": 0.39717738614322656, + "learning_rate": 4.831380823689954e-06, + "loss": 0.6854, + "step": 2745 + }, + { + "epoch": 0.7290588079118545, + "grad_norm": 0.4064110160412851, + "learning_rate": 4.8312547536160855e-06, + "loss": 0.6421, + "step": 2746 + }, + { + "epoch": 0.7293243063852383, + "grad_norm": 0.394455839798419, + "learning_rate": 4.831128638077044e-06, + "loss": 0.6162, + "step": 2747 + }, + { + "epoch": 0.729589804858622, + "grad_norm": 0.37564715334029924, + "learning_rate": 4.831002477075284e-06, + "loss": 0.6246, + "step": 2748 + }, + { + "epoch": 0.7298553033320059, + "grad_norm": 0.397268055489803, + "learning_rate": 4.830876270613269e-06, + "loss": 0.6154, + "step": 2749 + }, + { + "epoch": 0.7301208018053896, + "grad_norm": 0.39290058760195923, + "learning_rate": 4.830750018693461e-06, + "loss": 0.635, + "step": 2750 + }, + { + "epoch": 0.7303863002787734, + "grad_norm": 0.417533315899868, + "learning_rate": 4.830623721318319e-06, + "loss": 0.5752, + "step": 2751 + }, + { + "epoch": 0.7306517987521571, + "grad_norm": 0.3816486465129925, + "learning_rate": 4.8304973784903094e-06, + "loss": 0.595, + "step": 2752 + }, + { + "epoch": 0.730917297225541, + "grad_norm": 0.3919818387608073, + "learning_rate": 4.830370990211896e-06, + "loss": 0.6128, + "step": 2753 + }, + { + "epoch": 0.7311827956989247, + "grad_norm": 0.39466680240663665, + "learning_rate": 4.830244556485541e-06, + "loss": 0.641, + "step": 2754 + }, + { + "epoch": 0.7314482941723085, + "grad_norm": 0.40178825699033777, + "learning_rate": 4.830118077313711e-06, + "loss": 0.5925, + "step": 2755 + }, + { + "epoch": 0.7317137926456923, + "grad_norm": 0.3917799700294571, + "learning_rate": 4.829991552698875e-06, + "loss": 0.6573, + "step": 2756 + }, + { + "epoch": 0.7319792911190761, + "grad_norm": 0.3963976162008505, + "learning_rate": 4.829864982643498e-06, + "loss": 0.6097, + "step": 2757 + }, + { + "epoch": 0.7322447895924599, + "grad_norm": 0.3834633489838258, + "learning_rate": 4.82973836715005e-06, + "loss": 0.6348, + "step": 2758 + }, + { + "epoch": 0.7325102880658436, + "grad_norm": 0.3858137908361253, + "learning_rate": 4.829611706220999e-06, + "loss": 0.6223, + "step": 2759 + }, + { + "epoch": 0.7327757865392274, + "grad_norm": 0.402427907671124, + "learning_rate": 4.829484999858816e-06, + "loss": 0.59, + "step": 2760 + }, + { + "epoch": 0.7330412850126112, + "grad_norm": 0.39852846856995233, + "learning_rate": 4.829358248065972e-06, + "loss": 0.6413, + "step": 2761 + }, + { + "epoch": 0.733306783485995, + "grad_norm": 0.3968234651476136, + "learning_rate": 4.829231450844939e-06, + "loss": 0.6253, + "step": 2762 + }, + { + "epoch": 0.7335722819593787, + "grad_norm": 0.3878410739598387, + "learning_rate": 4.8291046081981895e-06, + "loss": 0.6117, + "step": 2763 + }, + { + "epoch": 0.7338377804327625, + "grad_norm": 0.3832351555693445, + "learning_rate": 4.828977720128198e-06, + "loss": 0.6264, + "step": 2764 + }, + { + "epoch": 0.7341032789061462, + "grad_norm": 0.3887570327875792, + "learning_rate": 4.828850786637438e-06, + "loss": 0.602, + "step": 2765 + }, + { + "epoch": 0.7343687773795301, + "grad_norm": 0.39188880916219765, + "learning_rate": 4.828723807728386e-06, + "loss": 0.6316, + "step": 2766 + }, + { + "epoch": 0.7346342758529139, + "grad_norm": 0.4009026649474955, + "learning_rate": 4.8285967834035186e-06, + "loss": 0.5947, + "step": 2767 + }, + { + "epoch": 0.7348997743262976, + "grad_norm": 0.394562762404538, + "learning_rate": 4.8284697136653115e-06, + "loss": 0.6423, + "step": 2768 + }, + { + "epoch": 0.7351652727996814, + "grad_norm": 0.3911591541800308, + "learning_rate": 4.828342598516244e-06, + "loss": 0.6325, + "step": 2769 + }, + { + "epoch": 0.7354307712730652, + "grad_norm": 0.3805036859948306, + "learning_rate": 4.828215437958796e-06, + "loss": 0.5986, + "step": 2770 + }, + { + "epoch": 0.735696269746449, + "grad_norm": 0.3946856110402343, + "learning_rate": 4.828088231995446e-06, + "loss": 0.6102, + "step": 2771 + }, + { + "epoch": 0.7359617682198327, + "grad_norm": 0.40555422649461653, + "learning_rate": 4.827960980628675e-06, + "loss": 0.6308, + "step": 2772 + }, + { + "epoch": 0.7362272666932165, + "grad_norm": 0.3987021479530642, + "learning_rate": 4.827833683860966e-06, + "loss": 0.6459, + "step": 2773 + }, + { + "epoch": 0.7364927651666003, + "grad_norm": 0.3902535314183136, + "learning_rate": 4.8277063416948e-06, + "loss": 0.6567, + "step": 2774 + }, + { + "epoch": 0.7367582636399841, + "grad_norm": 0.38156721487418765, + "learning_rate": 4.827578954132661e-06, + "loss": 0.6234, + "step": 2775 + }, + { + "epoch": 0.7370237621133678, + "grad_norm": 0.395287256562999, + "learning_rate": 4.827451521177033e-06, + "loss": 0.6515, + "step": 2776 + }, + { + "epoch": 0.7372892605867516, + "grad_norm": 0.3850843482038379, + "learning_rate": 4.827324042830403e-06, + "loss": 0.6254, + "step": 2777 + }, + { + "epoch": 0.7375547590601355, + "grad_norm": 0.37816520076211557, + "learning_rate": 4.8271965190952555e-06, + "loss": 0.5893, + "step": 2778 + }, + { + "epoch": 0.7378202575335192, + "grad_norm": 0.38186737655543224, + "learning_rate": 4.8270689499740776e-06, + "loss": 0.6036, + "step": 2779 + }, + { + "epoch": 0.738085756006903, + "grad_norm": 0.384296951487842, + "learning_rate": 4.826941335469357e-06, + "loss": 0.6115, + "step": 2780 + }, + { + "epoch": 0.7383512544802867, + "grad_norm": 0.40834758981838704, + "learning_rate": 4.8268136755835835e-06, + "loss": 0.6286, + "step": 2781 + }, + { + "epoch": 0.7386167529536705, + "grad_norm": 0.3877035623227378, + "learning_rate": 4.826685970319247e-06, + "loss": 0.6293, + "step": 2782 + }, + { + "epoch": 0.7388822514270543, + "grad_norm": 0.39568721704849596, + "learning_rate": 4.826558219678837e-06, + "loss": 0.6583, + "step": 2783 + }, + { + "epoch": 0.7391477499004381, + "grad_norm": 0.377479796213409, + "learning_rate": 4.8264304236648455e-06, + "loss": 0.6137, + "step": 2784 + }, + { + "epoch": 0.7394132483738218, + "grad_norm": 0.3965867465751187, + "learning_rate": 4.826302582279764e-06, + "loss": 0.6053, + "step": 2785 + }, + { + "epoch": 0.7396787468472056, + "grad_norm": 0.39723326621145977, + "learning_rate": 4.826174695526086e-06, + "loss": 0.6367, + "step": 2786 + }, + { + "epoch": 0.7399442453205894, + "grad_norm": 0.39248336432761927, + "learning_rate": 4.826046763406307e-06, + "loss": 0.6014, + "step": 2787 + }, + { + "epoch": 0.7402097437939732, + "grad_norm": 0.3804681684215838, + "learning_rate": 4.825918785922921e-06, + "loss": 0.6133, + "step": 2788 + }, + { + "epoch": 0.740475242267357, + "grad_norm": 0.38887933143631737, + "learning_rate": 4.825790763078423e-06, + "loss": 0.5993, + "step": 2789 + }, + { + "epoch": 0.7407407407407407, + "grad_norm": 0.38083737374050763, + "learning_rate": 4.825662694875312e-06, + "loss": 0.5987, + "step": 2790 + }, + { + "epoch": 0.7410062392141246, + "grad_norm": 0.3675860024020678, + "learning_rate": 4.825534581316083e-06, + "loss": 0.6026, + "step": 2791 + }, + { + "epoch": 0.7412717376875083, + "grad_norm": 0.37996911363114316, + "learning_rate": 4.825406422403235e-06, + "loss": 0.6402, + "step": 2792 + }, + { + "epoch": 0.7415372361608921, + "grad_norm": 0.3976011733980962, + "learning_rate": 4.82527821813927e-06, + "loss": 0.651, + "step": 2793 + }, + { + "epoch": 0.7418027346342758, + "grad_norm": 0.42514282046473584, + "learning_rate": 4.825149968526686e-06, + "loss": 0.6119, + "step": 2794 + }, + { + "epoch": 0.7420682331076597, + "grad_norm": 0.3837758468126048, + "learning_rate": 4.8250216735679844e-06, + "loss": 0.6138, + "step": 2795 + }, + { + "epoch": 0.7423337315810434, + "grad_norm": 0.3797692586416595, + "learning_rate": 4.824893333265667e-06, + "loss": 0.6113, + "step": 2796 + }, + { + "epoch": 0.7425992300544272, + "grad_norm": 0.38163235535132634, + "learning_rate": 4.824764947622238e-06, + "loss": 0.6073, + "step": 2797 + }, + { + "epoch": 0.7428647285278109, + "grad_norm": 0.37031221252924607, + "learning_rate": 4.8246365166402005e-06, + "loss": 0.5934, + "step": 2798 + }, + { + "epoch": 0.7431302270011948, + "grad_norm": 0.3933555108740605, + "learning_rate": 4.824508040322059e-06, + "loss": 0.6373, + "step": 2799 + }, + { + "epoch": 0.7433957254745786, + "grad_norm": 0.3919329621284459, + "learning_rate": 4.824379518670319e-06, + "loss": 0.6325, + "step": 2800 + }, + { + "epoch": 0.7436612239479623, + "grad_norm": 0.38737802704491076, + "learning_rate": 4.824250951687488e-06, + "loss": 0.6119, + "step": 2801 + }, + { + "epoch": 0.743926722421346, + "grad_norm": 0.3892074664217247, + "learning_rate": 4.824122339376072e-06, + "loss": 0.6687, + "step": 2802 + }, + { + "epoch": 0.7441922208947298, + "grad_norm": 0.3848385033170652, + "learning_rate": 4.8239936817385815e-06, + "loss": 0.5734, + "step": 2803 + }, + { + "epoch": 0.7444577193681137, + "grad_norm": 0.38198065376022733, + "learning_rate": 4.823864978777522e-06, + "loss": 0.5928, + "step": 2804 + }, + { + "epoch": 0.7447232178414974, + "grad_norm": 0.375840257615797, + "learning_rate": 4.823736230495407e-06, + "loss": 0.6078, + "step": 2805 + }, + { + "epoch": 0.7449887163148812, + "grad_norm": 0.38617472866324026, + "learning_rate": 4.823607436894746e-06, + "loss": 0.6557, + "step": 2806 + }, + { + "epoch": 0.7452542147882649, + "grad_norm": 0.3964718064426852, + "learning_rate": 4.823478597978049e-06, + "loss": 0.6415, + "step": 2807 + }, + { + "epoch": 0.7455197132616488, + "grad_norm": 0.38162919943729473, + "learning_rate": 4.823349713747832e-06, + "loss": 0.6236, + "step": 2808 + }, + { + "epoch": 0.7457852117350325, + "grad_norm": 0.39130304025533047, + "learning_rate": 4.823220784206608e-06, + "loss": 0.6366, + "step": 2809 + }, + { + "epoch": 0.7460507102084163, + "grad_norm": 0.37975547292941775, + "learning_rate": 4.823091809356889e-06, + "loss": 0.5996, + "step": 2810 + }, + { + "epoch": 0.7463162086818, + "grad_norm": 0.3899161626263363, + "learning_rate": 4.822962789201192e-06, + "loss": 0.5977, + "step": 2811 + }, + { + "epoch": 0.7465817071551839, + "grad_norm": 0.3813481635025675, + "learning_rate": 4.822833723742033e-06, + "loss": 0.5883, + "step": 2812 + }, + { + "epoch": 0.7468472056285677, + "grad_norm": 0.40208793653077435, + "learning_rate": 4.8227046129819295e-06, + "loss": 0.6243, + "step": 2813 + }, + { + "epoch": 0.7471127041019514, + "grad_norm": 0.40462108997191293, + "learning_rate": 4.8225754569234e-06, + "loss": 0.6369, + "step": 2814 + }, + { + "epoch": 0.7473782025753352, + "grad_norm": 0.39903228538964913, + "learning_rate": 4.822446255568961e-06, + "loss": 0.6009, + "step": 2815 + }, + { + "epoch": 0.747643701048719, + "grad_norm": 0.38714760650706337, + "learning_rate": 4.822317008921133e-06, + "loss": 0.6141, + "step": 2816 + }, + { + "epoch": 0.7479091995221028, + "grad_norm": 0.3896480079381763, + "learning_rate": 4.82218771698244e-06, + "loss": 0.6239, + "step": 2817 + }, + { + "epoch": 0.7481746979954865, + "grad_norm": 0.38876274722907134, + "learning_rate": 4.822058379755399e-06, + "loss": 0.5795, + "step": 2818 + }, + { + "epoch": 0.7484401964688703, + "grad_norm": 0.38595288152480245, + "learning_rate": 4.821928997242534e-06, + "loss": 0.5969, + "step": 2819 + }, + { + "epoch": 0.748705694942254, + "grad_norm": 0.3877007577720642, + "learning_rate": 4.821799569446368e-06, + "loss": 0.6484, + "step": 2820 + }, + { + "epoch": 0.7489711934156379, + "grad_norm": 0.37798400269211985, + "learning_rate": 4.821670096369427e-06, + "loss": 0.6363, + "step": 2821 + }, + { + "epoch": 0.7492366918890216, + "grad_norm": 0.3849291661633291, + "learning_rate": 4.821540578014234e-06, + "loss": 0.6673, + "step": 2822 + }, + { + "epoch": 0.7495021903624054, + "grad_norm": 0.40099800685563236, + "learning_rate": 4.821411014383317e-06, + "loss": 0.6039, + "step": 2823 + }, + { + "epoch": 0.7497676888357891, + "grad_norm": 0.3833907503353607, + "learning_rate": 4.8212814054792e-06, + "loss": 0.6504, + "step": 2824 + }, + { + "epoch": 0.750033187309173, + "grad_norm": 0.38224448187116683, + "learning_rate": 4.821151751304412e-06, + "loss": 0.6218, + "step": 2825 + }, + { + "epoch": 0.7502986857825568, + "grad_norm": 0.38988209626322307, + "learning_rate": 4.821022051861482e-06, + "loss": 0.5858, + "step": 2826 + }, + { + "epoch": 0.7505641842559405, + "grad_norm": 0.3754834798575984, + "learning_rate": 4.8208923071529394e-06, + "loss": 0.6076, + "step": 2827 + }, + { + "epoch": 0.7508296827293243, + "grad_norm": 0.39057354478944306, + "learning_rate": 4.820762517181314e-06, + "loss": 0.5906, + "step": 2828 + }, + { + "epoch": 0.7510951812027081, + "grad_norm": 0.3871768544576605, + "learning_rate": 4.820632681949138e-06, + "loss": 0.6351, + "step": 2829 + }, + { + "epoch": 0.7513606796760919, + "grad_norm": 0.3948262560785759, + "learning_rate": 4.820502801458942e-06, + "loss": 0.6198, + "step": 2830 + }, + { + "epoch": 0.7516261781494756, + "grad_norm": 0.38015628806143265, + "learning_rate": 4.82037287571326e-06, + "loss": 0.6152, + "step": 2831 + }, + { + "epoch": 0.7518916766228594, + "grad_norm": 0.38438109287268973, + "learning_rate": 4.820242904714626e-06, + "loss": 0.5904, + "step": 2832 + }, + { + "epoch": 0.7521571750962432, + "grad_norm": 0.4081007225432161, + "learning_rate": 4.820112888465574e-06, + "loss": 0.6372, + "step": 2833 + }, + { + "epoch": 0.752422673569627, + "grad_norm": 0.3821812290906295, + "learning_rate": 4.8199828269686395e-06, + "loss": 0.639, + "step": 2834 + }, + { + "epoch": 0.7526881720430108, + "grad_norm": 0.381324449389942, + "learning_rate": 4.819852720226361e-06, + "loss": 0.6761, + "step": 2835 + }, + { + "epoch": 0.7529536705163945, + "grad_norm": 0.3730766585898743, + "learning_rate": 4.819722568241274e-06, + "loss": 0.5566, + "step": 2836 + }, + { + "epoch": 0.7532191689897783, + "grad_norm": 0.3901910047325791, + "learning_rate": 4.8195923710159165e-06, + "loss": 0.5964, + "step": 2837 + }, + { + "epoch": 0.7534846674631621, + "grad_norm": 0.3888273246622894, + "learning_rate": 4.81946212855283e-06, + "loss": 0.6175, + "step": 2838 + }, + { + "epoch": 0.7537501659365459, + "grad_norm": 0.39070985990796303, + "learning_rate": 4.819331840854551e-06, + "loss": 0.6407, + "step": 2839 + }, + { + "epoch": 0.7540156644099296, + "grad_norm": 0.37407735286231825, + "learning_rate": 4.819201507923623e-06, + "loss": 0.6268, + "step": 2840 + }, + { + "epoch": 0.7542811628833134, + "grad_norm": 0.39490825929126455, + "learning_rate": 4.819071129762588e-06, + "loss": 0.6218, + "step": 2841 + }, + { + "epoch": 0.7545466613566972, + "grad_norm": 0.3904636117438276, + "learning_rate": 4.818940706373988e-06, + "loss": 0.6264, + "step": 2842 + }, + { + "epoch": 0.754812159830081, + "grad_norm": 0.3758749674362929, + "learning_rate": 4.8188102377603655e-06, + "loss": 0.664, + "step": 2843 + }, + { + "epoch": 0.7550776583034647, + "grad_norm": 0.39353049150138575, + "learning_rate": 4.818679723924266e-06, + "loss": 0.6318, + "step": 2844 + }, + { + "epoch": 0.7553431567768485, + "grad_norm": 0.38782710599549575, + "learning_rate": 4.818549164868236e-06, + "loss": 0.6206, + "step": 2845 + }, + { + "epoch": 0.7556086552502324, + "grad_norm": 0.38531966142503277, + "learning_rate": 4.818418560594819e-06, + "loss": 0.6318, + "step": 2846 + }, + { + "epoch": 0.7558741537236161, + "grad_norm": 0.3796290069349175, + "learning_rate": 4.8182879111065635e-06, + "loss": 0.6292, + "step": 2847 + }, + { + "epoch": 0.7561396521969999, + "grad_norm": 0.39733783086606916, + "learning_rate": 4.818157216406019e-06, + "loss": 0.6142, + "step": 2848 + }, + { + "epoch": 0.7564051506703836, + "grad_norm": 0.41336730869352195, + "learning_rate": 4.818026476495732e-06, + "loss": 0.6049, + "step": 2849 + }, + { + "epoch": 0.7566706491437675, + "grad_norm": 0.39396539375502837, + "learning_rate": 4.817895691378253e-06, + "loss": 0.6301, + "step": 2850 + }, + { + "epoch": 0.7569361476171512, + "grad_norm": 0.402771897323766, + "learning_rate": 4.817764861056134e-06, + "loss": 0.6421, + "step": 2851 + }, + { + "epoch": 0.757201646090535, + "grad_norm": 0.4261722074461022, + "learning_rate": 4.817633985531924e-06, + "loss": 0.6338, + "step": 2852 + }, + { + "epoch": 0.7574671445639187, + "grad_norm": 0.39311857332916716, + "learning_rate": 4.817503064808178e-06, + "loss": 0.6399, + "step": 2853 + }, + { + "epoch": 0.7577326430373026, + "grad_norm": 0.37774626411213613, + "learning_rate": 4.8173720988874475e-06, + "loss": 0.6141, + "step": 2854 + }, + { + "epoch": 0.7579981415106863, + "grad_norm": 0.4015965677893637, + "learning_rate": 4.817241087772287e-06, + "loss": 0.6217, + "step": 2855 + }, + { + "epoch": 0.7582636399840701, + "grad_norm": 0.397607753910022, + "learning_rate": 4.817110031465252e-06, + "loss": 0.6223, + "step": 2856 + }, + { + "epoch": 0.7585291384574538, + "grad_norm": 0.38283352590978686, + "learning_rate": 4.816978929968897e-06, + "loss": 0.6668, + "step": 2857 + }, + { + "epoch": 0.7587946369308376, + "grad_norm": 0.3908908754959361, + "learning_rate": 4.816847783285782e-06, + "loss": 0.6343, + "step": 2858 + }, + { + "epoch": 0.7590601354042215, + "grad_norm": 0.39230612259317565, + "learning_rate": 4.816716591418461e-06, + "loss": 0.6514, + "step": 2859 + }, + { + "epoch": 0.7593256338776052, + "grad_norm": 0.4327028388240448, + "learning_rate": 4.816585354369494e-06, + "loss": 0.5903, + "step": 2860 + }, + { + "epoch": 0.759591132350989, + "grad_norm": 0.37954733886715813, + "learning_rate": 4.816454072141442e-06, + "loss": 0.6224, + "step": 2861 + }, + { + "epoch": 0.7598566308243727, + "grad_norm": 0.40265519749244033, + "learning_rate": 4.816322744736863e-06, + "loss": 0.6228, + "step": 2862 + }, + { + "epoch": 0.7601221292977566, + "grad_norm": 0.4099725995712369, + "learning_rate": 4.816191372158318e-06, + "loss": 0.6268, + "step": 2863 + }, + { + "epoch": 0.7603876277711403, + "grad_norm": 0.4045979862349543, + "learning_rate": 4.816059954408372e-06, + "loss": 0.6428, + "step": 2864 + }, + { + "epoch": 0.7606531262445241, + "grad_norm": 0.41086632410810203, + "learning_rate": 4.815928491489587e-06, + "loss": 0.6136, + "step": 2865 + }, + { + "epoch": 0.7609186247179078, + "grad_norm": 0.3972164821152107, + "learning_rate": 4.815796983404524e-06, + "loss": 0.6267, + "step": 2866 + }, + { + "epoch": 0.7611841231912917, + "grad_norm": 0.3902024330743728, + "learning_rate": 4.81566543015575e-06, + "loss": 0.5963, + "step": 2867 + }, + { + "epoch": 0.7614496216646754, + "grad_norm": 0.40330659855741036, + "learning_rate": 4.8155338317458315e-06, + "loss": 0.5927, + "step": 2868 + }, + { + "epoch": 0.7617151201380592, + "grad_norm": 0.38084131951564604, + "learning_rate": 4.815402188177333e-06, + "loss": 0.6365, + "step": 2869 + }, + { + "epoch": 0.761980618611443, + "grad_norm": 0.4221286842267405, + "learning_rate": 4.815270499452823e-06, + "loss": 0.6455, + "step": 2870 + }, + { + "epoch": 0.7622461170848268, + "grad_norm": 0.4028283526570253, + "learning_rate": 4.815138765574869e-06, + "loss": 0.6418, + "step": 2871 + }, + { + "epoch": 0.7625116155582106, + "grad_norm": 0.4105999221222162, + "learning_rate": 4.815006986546042e-06, + "loss": 0.6315, + "step": 2872 + }, + { + "epoch": 0.7627771140315943, + "grad_norm": 0.3939269638636593, + "learning_rate": 4.81487516236891e-06, + "loss": 0.5824, + "step": 2873 + }, + { + "epoch": 0.7630426125049781, + "grad_norm": 0.38876964883627213, + "learning_rate": 4.814743293046044e-06, + "loss": 0.6286, + "step": 2874 + }, + { + "epoch": 0.7633081109783618, + "grad_norm": 0.3913028918686446, + "learning_rate": 4.814611378580016e-06, + "loss": 0.6384, + "step": 2875 + }, + { + "epoch": 0.7635736094517457, + "grad_norm": 0.3811802259981648, + "learning_rate": 4.8144794189734e-06, + "loss": 0.6157, + "step": 2876 + }, + { + "epoch": 0.7638391079251294, + "grad_norm": 0.39224969117165753, + "learning_rate": 4.814347414228768e-06, + "loss": 0.6242, + "step": 2877 + }, + { + "epoch": 0.7641046063985132, + "grad_norm": 0.38710563455825564, + "learning_rate": 4.814215364348695e-06, + "loss": 0.6321, + "step": 2878 + }, + { + "epoch": 0.7643701048718969, + "grad_norm": 0.40701079071417173, + "learning_rate": 4.814083269335757e-06, + "loss": 0.6321, + "step": 2879 + }, + { + "epoch": 0.7646356033452808, + "grad_norm": 0.4154013311391526, + "learning_rate": 4.813951129192528e-06, + "loss": 0.6331, + "step": 2880 + }, + { + "epoch": 0.7649011018186646, + "grad_norm": 0.3926129204062203, + "learning_rate": 4.813818943921587e-06, + "loss": 0.5864, + "step": 2881 + }, + { + "epoch": 0.7651666002920483, + "grad_norm": 0.378144922737672, + "learning_rate": 4.813686713525512e-06, + "loss": 0.6198, + "step": 2882 + }, + { + "epoch": 0.7654320987654321, + "grad_norm": 0.3874636487519552, + "learning_rate": 4.813554438006881e-06, + "loss": 0.6232, + "step": 2883 + }, + { + "epoch": 0.7656975972388159, + "grad_norm": 0.3997421492654902, + "learning_rate": 4.813422117368273e-06, + "loss": 0.6425, + "step": 2884 + }, + { + "epoch": 0.7659630957121997, + "grad_norm": 0.43148803941850866, + "learning_rate": 4.81328975161227e-06, + "loss": 0.6392, + "step": 2885 + }, + { + "epoch": 0.7662285941855834, + "grad_norm": 0.39429568287485456, + "learning_rate": 4.8131573407414534e-06, + "loss": 0.6, + "step": 2886 + }, + { + "epoch": 0.7664940926589672, + "grad_norm": 0.39363939075189897, + "learning_rate": 4.813024884758405e-06, + "loss": 0.594, + "step": 2887 + }, + { + "epoch": 0.766759591132351, + "grad_norm": 0.40246296057980246, + "learning_rate": 4.812892383665708e-06, + "loss": 0.6163, + "step": 2888 + }, + { + "epoch": 0.7670250896057348, + "grad_norm": 0.41114581068698275, + "learning_rate": 4.812759837465946e-06, + "loss": 0.6502, + "step": 2889 + }, + { + "epoch": 0.7672905880791185, + "grad_norm": 0.4113119839828481, + "learning_rate": 4.812627246161705e-06, + "loss": 0.6266, + "step": 2890 + }, + { + "epoch": 0.7675560865525023, + "grad_norm": 0.3899321213433095, + "learning_rate": 4.812494609755571e-06, + "loss": 0.6638, + "step": 2891 + }, + { + "epoch": 0.7678215850258862, + "grad_norm": 0.38660059296230714, + "learning_rate": 4.812361928250129e-06, + "loss": 0.5976, + "step": 2892 + }, + { + "epoch": 0.7680870834992699, + "grad_norm": 0.38584939706221605, + "learning_rate": 4.812229201647967e-06, + "loss": 0.6211, + "step": 2893 + }, + { + "epoch": 0.7683525819726537, + "grad_norm": 0.42629618970007627, + "learning_rate": 4.812096429951677e-06, + "loss": 0.5858, + "step": 2894 + }, + { + "epoch": 0.7686180804460374, + "grad_norm": 0.3880003573730656, + "learning_rate": 4.811963613163844e-06, + "loss": 0.6134, + "step": 2895 + }, + { + "epoch": 0.7688835789194212, + "grad_norm": 0.37682593883713944, + "learning_rate": 4.81183075128706e-06, + "loss": 0.6453, + "step": 2896 + }, + { + "epoch": 0.769149077392805, + "grad_norm": 0.39006645981172405, + "learning_rate": 4.811697844323916e-06, + "loss": 0.6055, + "step": 2897 + }, + { + "epoch": 0.7694145758661888, + "grad_norm": 0.3858947263046375, + "learning_rate": 4.811564892277003e-06, + "loss": 0.6142, + "step": 2898 + }, + { + "epoch": 0.7696800743395725, + "grad_norm": 0.4054651077962247, + "learning_rate": 4.811431895148917e-06, + "loss": 0.6412, + "step": 2899 + }, + { + "epoch": 0.7699455728129563, + "grad_norm": 0.39216972756435564, + "learning_rate": 4.811298852942248e-06, + "loss": 0.6252, + "step": 2900 + }, + { + "epoch": 0.7702110712863401, + "grad_norm": 0.39892334901550697, + "learning_rate": 4.811165765659593e-06, + "loss": 0.6281, + "step": 2901 + }, + { + "epoch": 0.7704765697597239, + "grad_norm": 0.38647458628122094, + "learning_rate": 4.811032633303547e-06, + "loss": 0.6486, + "step": 2902 + }, + { + "epoch": 0.7707420682331076, + "grad_norm": 0.388502486175942, + "learning_rate": 4.810899455876706e-06, + "loss": 0.5853, + "step": 2903 + }, + { + "epoch": 0.7710075667064914, + "grad_norm": 0.38794512919805835, + "learning_rate": 4.8107662333816675e-06, + "loss": 0.6283, + "step": 2904 + }, + { + "epoch": 0.7712730651798753, + "grad_norm": 0.378637211836451, + "learning_rate": 4.81063296582103e-06, + "loss": 0.5796, + "step": 2905 + }, + { + "epoch": 0.771538563653259, + "grad_norm": 0.3895036340486329, + "learning_rate": 4.810499653197393e-06, + "loss": 0.5938, + "step": 2906 + }, + { + "epoch": 0.7718040621266428, + "grad_norm": 0.39562898790582673, + "learning_rate": 4.8103662955133544e-06, + "loss": 0.6298, + "step": 2907 + }, + { + "epoch": 0.7720695606000265, + "grad_norm": 0.3873624183380144, + "learning_rate": 4.810232892771516e-06, + "loss": 0.6062, + "step": 2908 + }, + { + "epoch": 0.7723350590734104, + "grad_norm": 0.3930627184834584, + "learning_rate": 4.810099444974481e-06, + "loss": 0.6229, + "step": 2909 + }, + { + "epoch": 0.7726005575467941, + "grad_norm": 0.38845285356878767, + "learning_rate": 4.80996595212485e-06, + "loss": 0.616, + "step": 2910 + }, + { + "epoch": 0.7728660560201779, + "grad_norm": 0.4057450413210804, + "learning_rate": 4.809832414225227e-06, + "loss": 0.6203, + "step": 2911 + }, + { + "epoch": 0.7731315544935616, + "grad_norm": 0.39552285158857, + "learning_rate": 4.809698831278217e-06, + "loss": 0.6509, + "step": 2912 + }, + { + "epoch": 0.7733970529669454, + "grad_norm": 0.39503753899891747, + "learning_rate": 4.809565203286425e-06, + "loss": 0.6496, + "step": 2913 + }, + { + "epoch": 0.7736625514403292, + "grad_norm": 0.3881227733150097, + "learning_rate": 4.809431530252456e-06, + "loss": 0.6337, + "step": 2914 + }, + { + "epoch": 0.773928049913713, + "grad_norm": 0.39464067899109084, + "learning_rate": 4.809297812178918e-06, + "loss": 0.5801, + "step": 2915 + }, + { + "epoch": 0.7741935483870968, + "grad_norm": 0.4063830918132964, + "learning_rate": 4.809164049068419e-06, + "loss": 0.5893, + "step": 2916 + }, + { + "epoch": 0.7744590468604805, + "grad_norm": 0.4114952777520471, + "learning_rate": 4.809030240923566e-06, + "loss": 0.6089, + "step": 2917 + }, + { + "epoch": 0.7747245453338644, + "grad_norm": 0.385084434659045, + "learning_rate": 4.808896387746971e-06, + "loss": 0.619, + "step": 2918 + }, + { + "epoch": 0.7749900438072481, + "grad_norm": 0.38481500748273706, + "learning_rate": 4.808762489541242e-06, + "loss": 0.6394, + "step": 2919 + }, + { + "epoch": 0.7752555422806319, + "grad_norm": 0.3945167554852316, + "learning_rate": 4.808628546308993e-06, + "loss": 0.6578, + "step": 2920 + }, + { + "epoch": 0.7755210407540156, + "grad_norm": 0.3866927988729723, + "learning_rate": 4.808494558052834e-06, + "loss": 0.5707, + "step": 2921 + }, + { + "epoch": 0.7757865392273995, + "grad_norm": 0.3927322070675447, + "learning_rate": 4.80836052477538e-06, + "loss": 0.615, + "step": 2922 + }, + { + "epoch": 0.7760520377007832, + "grad_norm": 0.3790505710213684, + "learning_rate": 4.808226446479242e-06, + "loss": 0.573, + "step": 2923 + }, + { + "epoch": 0.776317536174167, + "grad_norm": 0.3855940690441728, + "learning_rate": 4.808092323167038e-06, + "loss": 0.6013, + "step": 2924 + }, + { + "epoch": 0.7765830346475507, + "grad_norm": 0.38730582839123984, + "learning_rate": 4.807958154841383e-06, + "loss": 0.6087, + "step": 2925 + }, + { + "epoch": 0.7768485331209346, + "grad_norm": 0.3768157541034743, + "learning_rate": 4.807823941504893e-06, + "loss": 0.6005, + "step": 2926 + }, + { + "epoch": 0.7771140315943184, + "grad_norm": 0.3878411660495298, + "learning_rate": 4.807689683160185e-06, + "loss": 0.6224, + "step": 2927 + }, + { + "epoch": 0.7773795300677021, + "grad_norm": 0.3811076776944274, + "learning_rate": 4.807555379809878e-06, + "loss": 0.5965, + "step": 2928 + }, + { + "epoch": 0.7776450285410859, + "grad_norm": 0.39920255595996806, + "learning_rate": 4.807421031456592e-06, + "loss": 0.6436, + "step": 2929 + }, + { + "epoch": 0.7779105270144697, + "grad_norm": 0.3815713523887147, + "learning_rate": 4.807286638102945e-06, + "loss": 0.5779, + "step": 2930 + }, + { + "epoch": 0.7781760254878535, + "grad_norm": 0.39670412837126307, + "learning_rate": 4.80715219975156e-06, + "loss": 0.6039, + "step": 2931 + }, + { + "epoch": 0.7784415239612372, + "grad_norm": 0.3994763151178164, + "learning_rate": 4.807017716405058e-06, + "loss": 0.633, + "step": 2932 + }, + { + "epoch": 0.778707022434621, + "grad_norm": 0.3816882153271918, + "learning_rate": 4.806883188066063e-06, + "loss": 0.6364, + "step": 2933 + }, + { + "epoch": 0.7789725209080047, + "grad_norm": 0.3994844839005148, + "learning_rate": 4.806748614737197e-06, + "loss": 0.6462, + "step": 2934 + }, + { + "epoch": 0.7792380193813886, + "grad_norm": 0.3899630272401776, + "learning_rate": 4.806613996421085e-06, + "loss": 0.6274, + "step": 2935 + }, + { + "epoch": 0.7795035178547723, + "grad_norm": 0.39350037441691904, + "learning_rate": 4.806479333120352e-06, + "loss": 0.6488, + "step": 2936 + }, + { + "epoch": 0.7797690163281561, + "grad_norm": 0.39549162194149007, + "learning_rate": 4.806344624837626e-06, + "loss": 0.5834, + "step": 2937 + }, + { + "epoch": 0.7800345148015398, + "grad_norm": 0.3902836543896936, + "learning_rate": 4.8062098715755315e-06, + "loss": 0.6165, + "step": 2938 + }, + { + "epoch": 0.7803000132749237, + "grad_norm": 0.37882458615267006, + "learning_rate": 4.806075073336699e-06, + "loss": 0.6119, + "step": 2939 + }, + { + "epoch": 0.7805655117483075, + "grad_norm": 0.3945470338531427, + "learning_rate": 4.805940230123755e-06, + "loss": 0.5608, + "step": 2940 + }, + { + "epoch": 0.7808310102216912, + "grad_norm": 0.38574600185236063, + "learning_rate": 4.805805341939333e-06, + "loss": 0.6063, + "step": 2941 + }, + { + "epoch": 0.781096508695075, + "grad_norm": 0.3932120520679073, + "learning_rate": 4.805670408786059e-06, + "loss": 0.6494, + "step": 2942 + }, + { + "epoch": 0.7813620071684588, + "grad_norm": 0.387579262601144, + "learning_rate": 4.805535430666568e-06, + "loss": 0.6258, + "step": 2943 + }, + { + "epoch": 0.7816275056418426, + "grad_norm": 0.38110120339450104, + "learning_rate": 4.805400407583491e-06, + "loss": 0.6331, + "step": 2944 + }, + { + "epoch": 0.7818930041152263, + "grad_norm": 0.39119423003978865, + "learning_rate": 4.805265339539461e-06, + "loss": 0.6603, + "step": 2945 + }, + { + "epoch": 0.7821585025886101, + "grad_norm": 0.3810595461997376, + "learning_rate": 4.805130226537112e-06, + "loss": 0.6058, + "step": 2946 + }, + { + "epoch": 0.782424001061994, + "grad_norm": 0.37985340674377616, + "learning_rate": 4.804995068579082e-06, + "loss": 0.6855, + "step": 2947 + }, + { + "epoch": 0.7826894995353777, + "grad_norm": 0.3964583029794383, + "learning_rate": 4.804859865668002e-06, + "loss": 0.6404, + "step": 2948 + }, + { + "epoch": 0.7829549980087614, + "grad_norm": 0.3906598558363303, + "learning_rate": 4.804724617806512e-06, + "loss": 0.6937, + "step": 2949 + }, + { + "epoch": 0.7832204964821452, + "grad_norm": 0.3827250015372093, + "learning_rate": 4.8045893249972495e-06, + "loss": 0.6098, + "step": 2950 + }, + { + "epoch": 0.783485994955529, + "grad_norm": 0.3957584088405087, + "learning_rate": 4.804453987242853e-06, + "loss": 0.6129, + "step": 2951 + }, + { + "epoch": 0.7837514934289128, + "grad_norm": 0.37950326853201494, + "learning_rate": 4.804318604545961e-06, + "loss": 0.641, + "step": 2952 + }, + { + "epoch": 0.7840169919022966, + "grad_norm": 0.3822250393835054, + "learning_rate": 4.804183176909214e-06, + "loss": 0.6068, + "step": 2953 + }, + { + "epoch": 0.7842824903756803, + "grad_norm": 0.3848591378719966, + "learning_rate": 4.804047704335254e-06, + "loss": 0.6412, + "step": 2954 + }, + { + "epoch": 0.7845479888490641, + "grad_norm": 0.3881268395726478, + "learning_rate": 4.803912186826722e-06, + "loss": 0.6312, + "step": 2955 + }, + { + "epoch": 0.7848134873224479, + "grad_norm": 0.39554484008479746, + "learning_rate": 4.803776624386262e-06, + "loss": 0.6383, + "step": 2956 + }, + { + "epoch": 0.7850789857958317, + "grad_norm": 0.3763153558720562, + "learning_rate": 4.803641017016516e-06, + "loss": 0.6031, + "step": 2957 + }, + { + "epoch": 0.7853444842692154, + "grad_norm": 0.39052628837475367, + "learning_rate": 4.803505364720131e-06, + "loss": 0.63, + "step": 2958 + }, + { + "epoch": 0.7856099827425992, + "grad_norm": 0.36963789589009133, + "learning_rate": 4.803369667499751e-06, + "loss": 0.6048, + "step": 2959 + }, + { + "epoch": 0.785875481215983, + "grad_norm": 0.3815686570433188, + "learning_rate": 4.803233925358023e-06, + "loss": 0.6339, + "step": 2960 + }, + { + "epoch": 0.7861409796893668, + "grad_norm": 0.46078915740846943, + "learning_rate": 4.803098138297594e-06, + "loss": 0.576, + "step": 2961 + }, + { + "epoch": 0.7864064781627506, + "grad_norm": 0.39342741939552883, + "learning_rate": 4.802962306321113e-06, + "loss": 0.6011, + "step": 2962 + }, + { + "epoch": 0.7866719766361343, + "grad_norm": 0.39106682727992903, + "learning_rate": 4.802826429431227e-06, + "loss": 0.6083, + "step": 2963 + }, + { + "epoch": 0.7869374751095182, + "grad_norm": 0.40584316504966633, + "learning_rate": 4.802690507630588e-06, + "loss": 0.6072, + "step": 2964 + }, + { + "epoch": 0.7872029735829019, + "grad_norm": 0.3804267793705631, + "learning_rate": 4.8025545409218465e-06, + "loss": 0.6144, + "step": 2965 + }, + { + "epoch": 0.7874684720562857, + "grad_norm": 0.4229561074639001, + "learning_rate": 4.802418529307654e-06, + "loss": 0.6032, + "step": 2966 + }, + { + "epoch": 0.7877339705296694, + "grad_norm": 0.41496474949956363, + "learning_rate": 4.802282472790663e-06, + "loss": 0.6078, + "step": 2967 + }, + { + "epoch": 0.7879994690030533, + "grad_norm": 0.4162004586289964, + "learning_rate": 4.802146371373525e-06, + "loss": 0.6331, + "step": 2968 + }, + { + "epoch": 0.788264967476437, + "grad_norm": 0.4515606887735363, + "learning_rate": 4.802010225058898e-06, + "loss": 0.5855, + "step": 2969 + }, + { + "epoch": 0.7885304659498208, + "grad_norm": 0.3960843628246435, + "learning_rate": 4.801874033849435e-06, + "loss": 0.619, + "step": 2970 + }, + { + "epoch": 0.7887959644232045, + "grad_norm": 0.4092054627378958, + "learning_rate": 4.801737797747792e-06, + "loss": 0.6023, + "step": 2971 + }, + { + "epoch": 0.7890614628965883, + "grad_norm": 0.41951841062801887, + "learning_rate": 4.8016015167566265e-06, + "loss": 0.6289, + "step": 2972 + }, + { + "epoch": 0.7893269613699722, + "grad_norm": 0.4559277053067011, + "learning_rate": 4.801465190878596e-06, + "loss": 0.6386, + "step": 2973 + }, + { + "epoch": 0.7895924598433559, + "grad_norm": 0.38011225556715383, + "learning_rate": 4.80132882011636e-06, + "loss": 0.6047, + "step": 2974 + }, + { + "epoch": 0.7898579583167397, + "grad_norm": 0.3932016851471219, + "learning_rate": 4.8011924044725765e-06, + "loss": 0.6126, + "step": 2975 + }, + { + "epoch": 0.7901234567901234, + "grad_norm": 0.38368297969397236, + "learning_rate": 4.801055943949907e-06, + "loss": 0.624, + "step": 2976 + }, + { + "epoch": 0.7903889552635073, + "grad_norm": 0.39175187781741116, + "learning_rate": 4.800919438551012e-06, + "loss": 0.6221, + "step": 2977 + }, + { + "epoch": 0.790654453736891, + "grad_norm": 0.3904633241427886, + "learning_rate": 4.800782888278556e-06, + "loss": 0.6207, + "step": 2978 + }, + { + "epoch": 0.7909199522102748, + "grad_norm": 0.38512573140653733, + "learning_rate": 4.8006462931351995e-06, + "loss": 0.591, + "step": 2979 + }, + { + "epoch": 0.7911854506836585, + "grad_norm": 0.40702726558338137, + "learning_rate": 4.800509653123607e-06, + "loss": 0.6233, + "step": 2980 + }, + { + "epoch": 0.7914509491570424, + "grad_norm": 0.3901216916858801, + "learning_rate": 4.800372968246444e-06, + "loss": 0.6272, + "step": 2981 + }, + { + "epoch": 0.7917164476304261, + "grad_norm": 0.38754953990316465, + "learning_rate": 4.800236238506376e-06, + "loss": 0.6047, + "step": 2982 + }, + { + "epoch": 0.7919819461038099, + "grad_norm": 0.40160248185350533, + "learning_rate": 4.80009946390607e-06, + "loss": 0.6607, + "step": 2983 + }, + { + "epoch": 0.7922474445771936, + "grad_norm": 0.3824913621186493, + "learning_rate": 4.799962644448191e-06, + "loss": 0.6089, + "step": 2984 + }, + { + "epoch": 0.7925129430505775, + "grad_norm": 0.38890634812615466, + "learning_rate": 4.799825780135411e-06, + "loss": 0.593, + "step": 2985 + }, + { + "epoch": 0.7927784415239613, + "grad_norm": 0.38539566727529206, + "learning_rate": 4.799688870970396e-06, + "loss": 0.6189, + "step": 2986 + }, + { + "epoch": 0.793043939997345, + "grad_norm": 0.38677878228576895, + "learning_rate": 4.799551916955818e-06, + "loss": 0.6331, + "step": 2987 + }, + { + "epoch": 0.7933094384707288, + "grad_norm": 0.40154225892810613, + "learning_rate": 4.799414918094347e-06, + "loss": 0.6414, + "step": 2988 + }, + { + "epoch": 0.7935749369441125, + "grad_norm": 0.4186541670024924, + "learning_rate": 4.799277874388656e-06, + "loss": 0.6111, + "step": 2989 + }, + { + "epoch": 0.7938404354174964, + "grad_norm": 0.37130180790833417, + "learning_rate": 4.799140785841417e-06, + "loss": 0.6013, + "step": 2990 + }, + { + "epoch": 0.7941059338908801, + "grad_norm": 0.3745858589150191, + "learning_rate": 4.799003652455302e-06, + "loss": 0.5941, + "step": 2991 + }, + { + "epoch": 0.7943714323642639, + "grad_norm": 0.38701578660458447, + "learning_rate": 4.7988664742329875e-06, + "loss": 0.6069, + "step": 2992 + }, + { + "epoch": 0.7946369308376476, + "grad_norm": 0.3941868397833378, + "learning_rate": 4.7987292511771474e-06, + "loss": 0.6286, + "step": 2993 + }, + { + "epoch": 0.7949024293110315, + "grad_norm": 0.38929898414732367, + "learning_rate": 4.798591983290461e-06, + "loss": 0.6376, + "step": 2994 + }, + { + "epoch": 0.7951679277844153, + "grad_norm": 0.39449662455560097, + "learning_rate": 4.7984546705756e-06, + "loss": 0.6434, + "step": 2995 + }, + { + "epoch": 0.795433426257799, + "grad_norm": 0.3778611954502537, + "learning_rate": 4.7983173130352475e-06, + "loss": 0.5915, + "step": 2996 + }, + { + "epoch": 0.7956989247311828, + "grad_norm": 0.4011326295665181, + "learning_rate": 4.798179910672079e-06, + "loss": 0.6001, + "step": 2997 + }, + { + "epoch": 0.7959644232045666, + "grad_norm": 0.382780846431489, + "learning_rate": 4.798042463488775e-06, + "loss": 0.6339, + "step": 2998 + }, + { + "epoch": 0.7962299216779504, + "grad_norm": 0.38543224192435804, + "learning_rate": 4.797904971488019e-06, + "loss": 0.6133, + "step": 2999 + }, + { + "epoch": 0.7964954201513341, + "grad_norm": 0.37802472578784047, + "learning_rate": 4.797767434672488e-06, + "loss": 0.6252, + "step": 3000 + }, + { + "epoch": 0.7967609186247179, + "grad_norm": 0.38678043336886053, + "learning_rate": 4.797629853044865e-06, + "loss": 0.6455, + "step": 3001 + }, + { + "epoch": 0.7970264170981017, + "grad_norm": 0.40459052522651207, + "learning_rate": 4.797492226607836e-06, + "loss": 0.6175, + "step": 3002 + }, + { + "epoch": 0.7972919155714855, + "grad_norm": 0.37964100233076326, + "learning_rate": 4.797354555364082e-06, + "loss": 0.6287, + "step": 3003 + }, + { + "epoch": 0.7975574140448692, + "grad_norm": 0.38341809554558787, + "learning_rate": 4.797216839316291e-06, + "loss": 0.6112, + "step": 3004 + }, + { + "epoch": 0.797822912518253, + "grad_norm": 0.3923328729463677, + "learning_rate": 4.797079078467145e-06, + "loss": 0.6249, + "step": 3005 + }, + { + "epoch": 0.7980884109916367, + "grad_norm": 0.4243101334360585, + "learning_rate": 4.7969412728193345e-06, + "loss": 0.6099, + "step": 3006 + }, + { + "epoch": 0.7983539094650206, + "grad_norm": 0.39538420941156294, + "learning_rate": 4.796803422375544e-06, + "loss": 0.5722, + "step": 3007 + }, + { + "epoch": 0.7986194079384044, + "grad_norm": 0.3942206599636837, + "learning_rate": 4.796665527138465e-06, + "loss": 0.6254, + "step": 3008 + }, + { + "epoch": 0.7988849064117881, + "grad_norm": 0.38938382369631763, + "learning_rate": 4.796527587110783e-06, + "loss": 0.6455, + "step": 3009 + }, + { + "epoch": 0.7991504048851719, + "grad_norm": 0.41905754132214856, + "learning_rate": 4.7963896022951915e-06, + "loss": 0.6177, + "step": 3010 + }, + { + "epoch": 0.7994159033585557, + "grad_norm": 0.4065932144561058, + "learning_rate": 4.796251572694379e-06, + "loss": 0.6143, + "step": 3011 + }, + { + "epoch": 0.7996814018319395, + "grad_norm": 0.37800178823170866, + "learning_rate": 4.79611349831104e-06, + "loss": 0.605, + "step": 3012 + }, + { + "epoch": 0.7999469003053232, + "grad_norm": 0.40352141037193645, + "learning_rate": 4.795975379147865e-06, + "loss": 0.6152, + "step": 3013 + }, + { + "epoch": 0.800212398778707, + "grad_norm": 0.4275551389495503, + "learning_rate": 4.795837215207549e-06, + "loss": 0.5919, + "step": 3014 + }, + { + "epoch": 0.8004778972520908, + "grad_norm": 0.3963273680155388, + "learning_rate": 4.7956990064927865e-06, + "loss": 0.6397, + "step": 3015 + }, + { + "epoch": 0.8007433957254746, + "grad_norm": 0.37794494686677665, + "learning_rate": 4.795560753006272e-06, + "loss": 0.5899, + "step": 3016 + }, + { + "epoch": 0.8010088941988583, + "grad_norm": 0.3839474054456509, + "learning_rate": 4.795422454750703e-06, + "loss": 0.6336, + "step": 3017 + }, + { + "epoch": 0.8012743926722421, + "grad_norm": 0.4085013943268837, + "learning_rate": 4.7952841117287746e-06, + "loss": 0.6243, + "step": 3018 + }, + { + "epoch": 0.801539891145626, + "grad_norm": 0.4033356716760012, + "learning_rate": 4.795145723943187e-06, + "loss": 0.601, + "step": 3019 + }, + { + "epoch": 0.8018053896190097, + "grad_norm": 0.3801297064742477, + "learning_rate": 4.7950072913966386e-06, + "loss": 0.6081, + "step": 3020 + }, + { + "epoch": 0.8020708880923935, + "grad_norm": 0.40534575296039577, + "learning_rate": 4.794868814091828e-06, + "loss": 0.627, + "step": 3021 + }, + { + "epoch": 0.8023363865657772, + "grad_norm": 0.4170024787657662, + "learning_rate": 4.794730292031457e-06, + "loss": 0.6199, + "step": 3022 + }, + { + "epoch": 0.8026018850391611, + "grad_norm": 0.3877307879508813, + "learning_rate": 4.794591725218227e-06, + "loss": 0.5959, + "step": 3023 + }, + { + "epoch": 0.8028673835125448, + "grad_norm": 0.385260307210276, + "learning_rate": 4.794453113654841e-06, + "loss": 0.6484, + "step": 3024 + }, + { + "epoch": 0.8031328819859286, + "grad_norm": 0.3958538084046821, + "learning_rate": 4.794314457344001e-06, + "loss": 0.6455, + "step": 3025 + }, + { + "epoch": 0.8033983804593123, + "grad_norm": 0.49005574904683974, + "learning_rate": 4.794175756288411e-06, + "loss": 0.6302, + "step": 3026 + }, + { + "epoch": 0.8036638789326961, + "grad_norm": 0.38787925883372026, + "learning_rate": 4.794037010490777e-06, + "loss": 0.6124, + "step": 3027 + }, + { + "epoch": 0.80392937740608, + "grad_norm": 0.41030266965415774, + "learning_rate": 4.793898219953804e-06, + "loss": 0.6451, + "step": 3028 + }, + { + "epoch": 0.8041948758794637, + "grad_norm": 0.4338130007013215, + "learning_rate": 4.7937593846802e-06, + "loss": 0.6171, + "step": 3029 + }, + { + "epoch": 0.8044603743528475, + "grad_norm": 0.4287021615609794, + "learning_rate": 4.793620504672673e-06, + "loss": 0.5859, + "step": 3030 + }, + { + "epoch": 0.8047258728262312, + "grad_norm": 0.3792493028037663, + "learning_rate": 4.7934815799339285e-06, + "loss": 0.6322, + "step": 3031 + }, + { + "epoch": 0.8049913712996151, + "grad_norm": 0.44063062639660205, + "learning_rate": 4.793342610466678e-06, + "loss": 0.5823, + "step": 3032 + }, + { + "epoch": 0.8052568697729988, + "grad_norm": 0.44977973699506585, + "learning_rate": 4.793203596273632e-06, + "loss": 0.5843, + "step": 3033 + }, + { + "epoch": 0.8055223682463826, + "grad_norm": 0.39183705702570304, + "learning_rate": 4.793064537357502e-06, + "loss": 0.6619, + "step": 3034 + }, + { + "epoch": 0.8057878667197663, + "grad_norm": 0.3853020531081753, + "learning_rate": 4.792925433720998e-06, + "loss": 0.6281, + "step": 3035 + }, + { + "epoch": 0.8060533651931502, + "grad_norm": 0.40941377101087645, + "learning_rate": 4.792786285366834e-06, + "loss": 0.6207, + "step": 3036 + }, + { + "epoch": 0.8063188636665339, + "grad_norm": 0.39984320559078634, + "learning_rate": 4.792647092297724e-06, + "loss": 0.6414, + "step": 3037 + }, + { + "epoch": 0.8065843621399177, + "grad_norm": 0.38670405384388634, + "learning_rate": 4.792507854516383e-06, + "loss": 0.6016, + "step": 3038 + }, + { + "epoch": 0.8068498606133014, + "grad_norm": 0.38147800309502916, + "learning_rate": 4.792368572025525e-06, + "loss": 0.6208, + "step": 3039 + }, + { + "epoch": 0.8071153590866853, + "grad_norm": 0.386319652032236, + "learning_rate": 4.792229244827867e-06, + "loss": 0.5851, + "step": 3040 + }, + { + "epoch": 0.807380857560069, + "grad_norm": 0.4053514067391913, + "learning_rate": 4.792089872926127e-06, + "loss": 0.5931, + "step": 3041 + }, + { + "epoch": 0.8076463560334528, + "grad_norm": 0.3863874156057099, + "learning_rate": 4.791950456323023e-06, + "loss": 0.6266, + "step": 3042 + }, + { + "epoch": 0.8079118545068366, + "grad_norm": 0.394406759418258, + "learning_rate": 4.791810995021271e-06, + "loss": 0.6011, + "step": 3043 + }, + { + "epoch": 0.8081773529802203, + "grad_norm": 0.3934387726763986, + "learning_rate": 4.791671489023595e-06, + "loss": 0.6039, + "step": 3044 + }, + { + "epoch": 0.8084428514536042, + "grad_norm": 0.39613581161129774, + "learning_rate": 4.791531938332714e-06, + "loss": 0.6487, + "step": 3045 + }, + { + "epoch": 0.8087083499269879, + "grad_norm": 0.3826439695550616, + "learning_rate": 4.79139234295135e-06, + "loss": 0.6022, + "step": 3046 + }, + { + "epoch": 0.8089738484003717, + "grad_norm": 0.38489715956190756, + "learning_rate": 4.7912527028822246e-06, + "loss": 0.6346, + "step": 3047 + }, + { + "epoch": 0.8092393468737554, + "grad_norm": 0.40076861754465976, + "learning_rate": 4.7911130181280605e-06, + "loss": 0.6041, + "step": 3048 + }, + { + "epoch": 0.8095048453471393, + "grad_norm": 0.38486072268242794, + "learning_rate": 4.790973288691585e-06, + "loss": 0.6315, + "step": 3049 + }, + { + "epoch": 0.809770343820523, + "grad_norm": 0.3890072312613263, + "learning_rate": 4.790833514575519e-06, + "loss": 0.6338, + "step": 3050 + }, + { + "epoch": 0.8100358422939068, + "grad_norm": 0.40063802865243636, + "learning_rate": 4.790693695782592e-06, + "loss": 0.6212, + "step": 3051 + }, + { + "epoch": 0.8103013407672905, + "grad_norm": 0.3805255829024729, + "learning_rate": 4.790553832315529e-06, + "loss": 0.6088, + "step": 3052 + }, + { + "epoch": 0.8105668392406744, + "grad_norm": 0.38052266688049624, + "learning_rate": 4.790413924177058e-06, + "loss": 0.6271, + "step": 3053 + }, + { + "epoch": 0.8108323377140582, + "grad_norm": 0.3891470925112324, + "learning_rate": 4.790273971369906e-06, + "loss": 0.6033, + "step": 3054 + }, + { + "epoch": 0.8110978361874419, + "grad_norm": 0.39702011828000344, + "learning_rate": 4.790133973896806e-06, + "loss": 0.6173, + "step": 3055 + }, + { + "epoch": 0.8113633346608257, + "grad_norm": 0.3871313317783108, + "learning_rate": 4.789993931760486e-06, + "loss": 0.6416, + "step": 3056 + }, + { + "epoch": 0.8116288331342095, + "grad_norm": 0.3881859027432485, + "learning_rate": 4.789853844963677e-06, + "loss": 0.6218, + "step": 3057 + }, + { + "epoch": 0.8118943316075933, + "grad_norm": 0.3868948385668171, + "learning_rate": 4.789713713509112e-06, + "loss": 0.6059, + "step": 3058 + }, + { + "epoch": 0.812159830080977, + "grad_norm": 0.40151321349355845, + "learning_rate": 4.7895735373995225e-06, + "loss": 0.5989, + "step": 3059 + }, + { + "epoch": 0.8124253285543608, + "grad_norm": 0.37704399463899213, + "learning_rate": 4.789433316637644e-06, + "loss": 0.6125, + "step": 3060 + }, + { + "epoch": 0.8126908270277446, + "grad_norm": 0.3837903681264029, + "learning_rate": 4.789293051226211e-06, + "loss": 0.6498, + "step": 3061 + }, + { + "epoch": 0.8129563255011284, + "grad_norm": 0.4075595595495999, + "learning_rate": 4.789152741167956e-06, + "loss": 0.5957, + "step": 3062 + }, + { + "epoch": 0.8132218239745121, + "grad_norm": 0.40736896177681364, + "learning_rate": 4.78901238646562e-06, + "loss": 0.6468, + "step": 3063 + }, + { + "epoch": 0.8134873224478959, + "grad_norm": 0.3881459920702017, + "learning_rate": 4.788871987121937e-06, + "loss": 0.598, + "step": 3064 + }, + { + "epoch": 0.8137528209212797, + "grad_norm": 0.38756965470984517, + "learning_rate": 4.788731543139646e-06, + "loss": 0.6308, + "step": 3065 + }, + { + "epoch": 0.8140183193946635, + "grad_norm": 0.38766933747438825, + "learning_rate": 4.788591054521486e-06, + "loss": 0.6319, + "step": 3066 + }, + { + "epoch": 0.8142838178680473, + "grad_norm": 0.4253101243468969, + "learning_rate": 4.788450521270198e-06, + "loss": 0.5777, + "step": 3067 + }, + { + "epoch": 0.814549316341431, + "grad_norm": 0.3811032855153689, + "learning_rate": 4.78830994338852e-06, + "loss": 0.6002, + "step": 3068 + }, + { + "epoch": 0.8148148148148148, + "grad_norm": 0.387554693583662, + "learning_rate": 4.788169320879197e-06, + "loss": 0.6345, + "step": 3069 + }, + { + "epoch": 0.8150803132881986, + "grad_norm": 0.40243570860162275, + "learning_rate": 4.788028653744969e-06, + "loss": 0.613, + "step": 3070 + }, + { + "epoch": 0.8153458117615824, + "grad_norm": 0.3985264563294363, + "learning_rate": 4.787887941988581e-06, + "loss": 0.6387, + "step": 3071 + }, + { + "epoch": 0.8156113102349661, + "grad_norm": 0.3927562343922202, + "learning_rate": 4.787747185612776e-06, + "loss": 0.6154, + "step": 3072 + }, + { + "epoch": 0.8158768087083499, + "grad_norm": 0.38124945600392735, + "learning_rate": 4.7876063846203e-06, + "loss": 0.6446, + "step": 3073 + }, + { + "epoch": 0.8161423071817338, + "grad_norm": 0.40006847227750797, + "learning_rate": 4.787465539013897e-06, + "loss": 0.6038, + "step": 3074 + }, + { + "epoch": 0.8164078056551175, + "grad_norm": 0.39533456184373295, + "learning_rate": 4.787324648796316e-06, + "loss": 0.6187, + "step": 3075 + }, + { + "epoch": 0.8166733041285013, + "grad_norm": 0.4061119856435276, + "learning_rate": 4.787183713970304e-06, + "loss": 0.5932, + "step": 3076 + }, + { + "epoch": 0.816938802601885, + "grad_norm": 0.38498866901121387, + "learning_rate": 4.787042734538611e-06, + "loss": 0.586, + "step": 3077 + }, + { + "epoch": 0.8172043010752689, + "grad_norm": 0.38604882667142726, + "learning_rate": 4.786901710503984e-06, + "loss": 0.623, + "step": 3078 + }, + { + "epoch": 0.8174697995486526, + "grad_norm": 0.39481088801397074, + "learning_rate": 4.786760641869174e-06, + "loss": 0.6211, + "step": 3079 + }, + { + "epoch": 0.8177352980220364, + "grad_norm": 0.38895916363318683, + "learning_rate": 4.786619528636933e-06, + "loss": 0.633, + "step": 3080 + }, + { + "epoch": 0.8180007964954201, + "grad_norm": 0.390630967657958, + "learning_rate": 4.786478370810012e-06, + "loss": 0.6079, + "step": 3081 + }, + { + "epoch": 0.8182662949688039, + "grad_norm": 0.3998832046417734, + "learning_rate": 4.7863371683911644e-06, + "loss": 0.5956, + "step": 3082 + }, + { + "epoch": 0.8185317934421877, + "grad_norm": 0.4311476377342335, + "learning_rate": 4.786195921383145e-06, + "loss": 0.5949, + "step": 3083 + }, + { + "epoch": 0.8187972919155715, + "grad_norm": 0.3999998209449262, + "learning_rate": 4.7860546297887065e-06, + "loss": 0.6349, + "step": 3084 + }, + { + "epoch": 0.8190627903889552, + "grad_norm": 0.38991995662173307, + "learning_rate": 4.785913293610607e-06, + "loss": 0.564, + "step": 3085 + }, + { + "epoch": 0.819328288862339, + "grad_norm": 0.4045768827897694, + "learning_rate": 4.7857719128516e-06, + "loss": 0.6189, + "step": 3086 + }, + { + "epoch": 0.8195937873357229, + "grad_norm": 0.39984430576344626, + "learning_rate": 4.785630487514444e-06, + "loss": 0.5802, + "step": 3087 + }, + { + "epoch": 0.8198592858091066, + "grad_norm": 0.3852432445950339, + "learning_rate": 4.785489017601899e-06, + "loss": 0.6612, + "step": 3088 + }, + { + "epoch": 0.8201247842824904, + "grad_norm": 0.3818704358266958, + "learning_rate": 4.785347503116721e-06, + "loss": 0.5754, + "step": 3089 + }, + { + "epoch": 0.8203902827558741, + "grad_norm": 0.3681576688362433, + "learning_rate": 4.7852059440616715e-06, + "loss": 0.627, + "step": 3090 + }, + { + "epoch": 0.820655781229258, + "grad_norm": 0.393464260171437, + "learning_rate": 4.78506434043951e-06, + "loss": 0.6264, + "step": 3091 + }, + { + "epoch": 0.8209212797026417, + "grad_norm": 0.378244620539641, + "learning_rate": 4.7849226922530006e-06, + "loss": 0.6095, + "step": 3092 + }, + { + "epoch": 0.8211867781760255, + "grad_norm": 0.38019313440902974, + "learning_rate": 4.784780999504903e-06, + "loss": 0.6331, + "step": 3093 + }, + { + "epoch": 0.8214522766494092, + "grad_norm": 0.4074252468749414, + "learning_rate": 4.784639262197983e-06, + "loss": 0.6605, + "step": 3094 + }, + { + "epoch": 0.8217177751227931, + "grad_norm": 0.38585300792180804, + "learning_rate": 4.784497480335003e-06, + "loss": 0.6291, + "step": 3095 + }, + { + "epoch": 0.8219832735961768, + "grad_norm": 0.3876931187738847, + "learning_rate": 4.784355653918729e-06, + "loss": 0.6453, + "step": 3096 + }, + { + "epoch": 0.8222487720695606, + "grad_norm": 0.36332652523024367, + "learning_rate": 4.784213782951926e-06, + "loss": 0.5722, + "step": 3097 + }, + { + "epoch": 0.8225142705429443, + "grad_norm": 0.3887959834336381, + "learning_rate": 4.784071867437362e-06, + "loss": 0.5863, + "step": 3098 + }, + { + "epoch": 0.8227797690163282, + "grad_norm": 0.38212226745974903, + "learning_rate": 4.783929907377805e-06, + "loss": 0.5543, + "step": 3099 + }, + { + "epoch": 0.823045267489712, + "grad_norm": 0.38555522311114687, + "learning_rate": 4.783787902776021e-06, + "loss": 0.6055, + "step": 3100 + }, + { + "epoch": 0.8233107659630957, + "grad_norm": 0.38390738971620475, + "learning_rate": 4.783645853634782e-06, + "loss": 0.6126, + "step": 3101 + }, + { + "epoch": 0.8235762644364795, + "grad_norm": 0.390567561772916, + "learning_rate": 4.783503759956858e-06, + "loss": 0.6273, + "step": 3102 + }, + { + "epoch": 0.8238417629098632, + "grad_norm": 0.38487795571260974, + "learning_rate": 4.78336162174502e-06, + "loss": 0.6588, + "step": 3103 + }, + { + "epoch": 0.8241072613832471, + "grad_norm": 0.38965158774468656, + "learning_rate": 4.783219439002038e-06, + "loss": 0.5916, + "step": 3104 + }, + { + "epoch": 0.8243727598566308, + "grad_norm": 0.37582919918188046, + "learning_rate": 4.7830772117306875e-06, + "loss": 0.5936, + "step": 3105 + }, + { + "epoch": 0.8246382583300146, + "grad_norm": 0.38482312988662143, + "learning_rate": 4.782934939933742e-06, + "loss": 0.6116, + "step": 3106 + }, + { + "epoch": 0.8249037568033983, + "grad_norm": 0.4039339620970389, + "learning_rate": 4.7827926236139754e-06, + "loss": 0.5906, + "step": 3107 + }, + { + "epoch": 0.8251692552767822, + "grad_norm": 0.40081933198032615, + "learning_rate": 4.782650262774164e-06, + "loss": 0.6003, + "step": 3108 + }, + { + "epoch": 0.825434753750166, + "grad_norm": 0.3776685090144088, + "learning_rate": 4.782507857417083e-06, + "loss": 0.5958, + "step": 3109 + }, + { + "epoch": 0.8257002522235497, + "grad_norm": 0.38276870586488, + "learning_rate": 4.782365407545509e-06, + "loss": 0.6117, + "step": 3110 + }, + { + "epoch": 0.8259657506969335, + "grad_norm": 0.3884070994966659, + "learning_rate": 4.782222913162223e-06, + "loss": 0.5878, + "step": 3111 + }, + { + "epoch": 0.8262312491703173, + "grad_norm": 0.39344114122789803, + "learning_rate": 4.782080374270002e-06, + "loss": 0.612, + "step": 3112 + }, + { + "epoch": 0.8264967476437011, + "grad_norm": 0.3851587563270667, + "learning_rate": 4.781937790871626e-06, + "loss": 0.5895, + "step": 3113 + }, + { + "epoch": 0.8267622461170848, + "grad_norm": 0.39150494638776406, + "learning_rate": 4.781795162969875e-06, + "loss": 0.593, + "step": 3114 + }, + { + "epoch": 0.8270277445904686, + "grad_norm": 0.3872467177140655, + "learning_rate": 4.781652490567533e-06, + "loss": 0.6437, + "step": 3115 + }, + { + "epoch": 0.8272932430638524, + "grad_norm": 0.3865888914631022, + "learning_rate": 4.78150977366738e-06, + "loss": 0.6057, + "step": 3116 + }, + { + "epoch": 0.8275587415372362, + "grad_norm": 0.3980511947596024, + "learning_rate": 4.7813670122722015e-06, + "loss": 0.6116, + "step": 3117 + }, + { + "epoch": 0.8278242400106199, + "grad_norm": 0.37673976668592946, + "learning_rate": 4.781224206384779e-06, + "loss": 0.608, + "step": 3118 + }, + { + "epoch": 0.8280897384840037, + "grad_norm": 0.38914364170088517, + "learning_rate": 4.781081356007901e-06, + "loss": 0.6177, + "step": 3119 + }, + { + "epoch": 0.8283552369573874, + "grad_norm": 0.3962658556356825, + "learning_rate": 4.7809384611443496e-06, + "loss": 0.6527, + "step": 3120 + }, + { + "epoch": 0.8286207354307713, + "grad_norm": 0.38940717741034075, + "learning_rate": 4.780795521796914e-06, + "loss": 0.6262, + "step": 3121 + }, + { + "epoch": 0.8288862339041551, + "grad_norm": 0.40627445492556524, + "learning_rate": 4.780652537968382e-06, + "loss": 0.6396, + "step": 3122 + }, + { + "epoch": 0.8291517323775388, + "grad_norm": 0.3889625376418175, + "learning_rate": 4.780509509661541e-06, + "loss": 0.6138, + "step": 3123 + }, + { + "epoch": 0.8294172308509226, + "grad_norm": 0.394228670778925, + "learning_rate": 4.780366436879181e-06, + "loss": 0.6125, + "step": 3124 + }, + { + "epoch": 0.8296827293243064, + "grad_norm": 0.38510912767694816, + "learning_rate": 4.780223319624092e-06, + "loss": 0.6233, + "step": 3125 + }, + { + "epoch": 0.8299482277976902, + "grad_norm": 0.3905933874343222, + "learning_rate": 4.780080157899066e-06, + "loss": 0.629, + "step": 3126 + }, + { + "epoch": 0.8302137262710739, + "grad_norm": 0.40519786211821524, + "learning_rate": 4.7799369517068935e-06, + "loss": 0.6182, + "step": 3127 + }, + { + "epoch": 0.8304792247444577, + "grad_norm": 0.4109984779764674, + "learning_rate": 4.779793701050369e-06, + "loss": 0.6109, + "step": 3128 + }, + { + "epoch": 0.8307447232178415, + "grad_norm": 0.3799831099592614, + "learning_rate": 4.779650405932285e-06, + "loss": 0.6174, + "step": 3129 + }, + { + "epoch": 0.8310102216912253, + "grad_norm": 0.3932220390496914, + "learning_rate": 4.779507066355437e-06, + "loss": 0.6151, + "step": 3130 + }, + { + "epoch": 0.831275720164609, + "grad_norm": 0.3866879722158227, + "learning_rate": 4.779363682322619e-06, + "loss": 0.6373, + "step": 3131 + }, + { + "epoch": 0.8315412186379928, + "grad_norm": 0.3809366409326176, + "learning_rate": 4.779220253836629e-06, + "loss": 0.598, + "step": 3132 + }, + { + "epoch": 0.8318067171113767, + "grad_norm": 0.3849597152962915, + "learning_rate": 4.779076780900264e-06, + "loss": 0.6174, + "step": 3133 + }, + { + "epoch": 0.8320722155847604, + "grad_norm": 0.3905174099243532, + "learning_rate": 4.77893326351632e-06, + "loss": 0.6035, + "step": 3134 + }, + { + "epoch": 0.8323377140581442, + "grad_norm": 0.3917365719394999, + "learning_rate": 4.7787897016876e-06, + "loss": 0.6056, + "step": 3135 + }, + { + "epoch": 0.8326032125315279, + "grad_norm": 0.37880885390080776, + "learning_rate": 4.778646095416899e-06, + "loss": 0.5776, + "step": 3136 + }, + { + "epoch": 0.8328687110049118, + "grad_norm": 0.3848780838439989, + "learning_rate": 4.778502444707022e-06, + "loss": 0.5827, + "step": 3137 + }, + { + "epoch": 0.8331342094782955, + "grad_norm": 0.40166021223746573, + "learning_rate": 4.778358749560767e-06, + "loss": 0.5953, + "step": 3138 + }, + { + "epoch": 0.8333997079516793, + "grad_norm": 0.38804772882322447, + "learning_rate": 4.778215009980939e-06, + "loss": 0.6234, + "step": 3139 + }, + { + "epoch": 0.833665206425063, + "grad_norm": 0.39274118005395675, + "learning_rate": 4.77807122597034e-06, + "loss": 0.6135, + "step": 3140 + }, + { + "epoch": 0.8339307048984468, + "grad_norm": 0.3835964630609637, + "learning_rate": 4.777927397531774e-06, + "loss": 0.6446, + "step": 3141 + }, + { + "epoch": 0.8341962033718306, + "grad_norm": 0.39095336947029047, + "learning_rate": 4.7777835246680474e-06, + "loss": 0.6388, + "step": 3142 + }, + { + "epoch": 0.8344617018452144, + "grad_norm": 0.3880369156107484, + "learning_rate": 4.7776396073819644e-06, + "loss": 0.6282, + "step": 3143 + }, + { + "epoch": 0.8347272003185982, + "grad_norm": 0.3938324515828304, + "learning_rate": 4.777495645676332e-06, + "loss": 0.6383, + "step": 3144 + }, + { + "epoch": 0.8349926987919819, + "grad_norm": 0.390770988372887, + "learning_rate": 4.7773516395539585e-06, + "loss": 0.5779, + "step": 3145 + }, + { + "epoch": 0.8352581972653658, + "grad_norm": 0.3868153973519988, + "learning_rate": 4.777207589017653e-06, + "loss": 0.5991, + "step": 3146 + }, + { + "epoch": 0.8355236957387495, + "grad_norm": 0.39428931749911, + "learning_rate": 4.777063494070224e-06, + "loss": 0.6196, + "step": 3147 + }, + { + "epoch": 0.8357891942121333, + "grad_norm": 0.386015801009133, + "learning_rate": 4.7769193547144796e-06, + "loss": 0.6096, + "step": 3148 + }, + { + "epoch": 0.836054692685517, + "grad_norm": 0.38137073746130273, + "learning_rate": 4.776775170953235e-06, + "loss": 0.5956, + "step": 3149 + }, + { + "epoch": 0.8363201911589009, + "grad_norm": 0.37427746563359643, + "learning_rate": 4.776630942789299e-06, + "loss": 0.5862, + "step": 3150 + }, + { + "epoch": 0.8365856896322846, + "grad_norm": 0.3666149995254103, + "learning_rate": 4.776486670225486e-06, + "loss": 0.6312, + "step": 3151 + }, + { + "epoch": 0.8368511881056684, + "grad_norm": 0.40489049933255256, + "learning_rate": 4.776342353264609e-06, + "loss": 0.6218, + "step": 3152 + }, + { + "epoch": 0.8371166865790521, + "grad_norm": 0.40845708285976867, + "learning_rate": 4.776197991909482e-06, + "loss": 0.6024, + "step": 3153 + }, + { + "epoch": 0.837382185052436, + "grad_norm": 0.3818684657208028, + "learning_rate": 4.7760535861629224e-06, + "loss": 0.6286, + "step": 3154 + }, + { + "epoch": 0.8376476835258198, + "grad_norm": 0.395799031268732, + "learning_rate": 4.775909136027744e-06, + "loss": 0.6188, + "step": 3155 + }, + { + "epoch": 0.8379131819992035, + "grad_norm": 0.3887917180500662, + "learning_rate": 4.7757646415067655e-06, + "loss": 0.605, + "step": 3156 + }, + { + "epoch": 0.8381786804725873, + "grad_norm": 0.3691750625253155, + "learning_rate": 4.7756201026028045e-06, + "loss": 0.5931, + "step": 3157 + }, + { + "epoch": 0.838444178945971, + "grad_norm": 0.39371689343704186, + "learning_rate": 4.775475519318679e-06, + "loss": 0.6165, + "step": 3158 + }, + { + "epoch": 0.8387096774193549, + "grad_norm": 0.39668220311237906, + "learning_rate": 4.7753308916572105e-06, + "loss": 0.6363, + "step": 3159 + }, + { + "epoch": 0.8389751758927386, + "grad_norm": 0.4355400970748323, + "learning_rate": 4.775186219621219e-06, + "loss": 0.5781, + "step": 3160 + }, + { + "epoch": 0.8392406743661224, + "grad_norm": 0.3943198735658743, + "learning_rate": 4.775041503213525e-06, + "loss": 0.6506, + "step": 3161 + }, + { + "epoch": 0.8395061728395061, + "grad_norm": 0.4012652048985837, + "learning_rate": 4.774896742436951e-06, + "loss": 0.6253, + "step": 3162 + }, + { + "epoch": 0.83977167131289, + "grad_norm": 0.4292686082847599, + "learning_rate": 4.774751937294321e-06, + "loss": 0.6335, + "step": 3163 + }, + { + "epoch": 0.8400371697862737, + "grad_norm": 0.4069525415717263, + "learning_rate": 4.774607087788459e-06, + "loss": 0.6076, + "step": 3164 + }, + { + "epoch": 0.8403026682596575, + "grad_norm": 0.38379359031761656, + "learning_rate": 4.7744621939221895e-06, + "loss": 0.599, + "step": 3165 + }, + { + "epoch": 0.8405681667330412, + "grad_norm": 0.398286940665663, + "learning_rate": 4.774317255698339e-06, + "loss": 0.6181, + "step": 3166 + }, + { + "epoch": 0.8408336652064251, + "grad_norm": 0.3917339772562374, + "learning_rate": 4.774172273119732e-06, + "loss": 0.6525, + "step": 3167 + }, + { + "epoch": 0.8410991636798089, + "grad_norm": 0.4021805520194308, + "learning_rate": 4.774027246189198e-06, + "loss": 0.5924, + "step": 3168 + }, + { + "epoch": 0.8413646621531926, + "grad_norm": 0.394687312393228, + "learning_rate": 4.773882174909565e-06, + "loss": 0.6059, + "step": 3169 + }, + { + "epoch": 0.8416301606265764, + "grad_norm": 0.393902015013949, + "learning_rate": 4.773737059283663e-06, + "loss": 0.6391, + "step": 3170 + }, + { + "epoch": 0.8418956590999602, + "grad_norm": 0.3926874567469572, + "learning_rate": 4.77359189931432e-06, + "loss": 0.6377, + "step": 3171 + }, + { + "epoch": 0.842161157573344, + "grad_norm": 0.38647410589710635, + "learning_rate": 4.7734466950043686e-06, + "loss": 0.6461, + "step": 3172 + }, + { + "epoch": 0.8424266560467277, + "grad_norm": 0.39205605490187023, + "learning_rate": 4.773301446356642e-06, + "loss": 0.5811, + "step": 3173 + }, + { + "epoch": 0.8426921545201115, + "grad_norm": 0.39220506787691517, + "learning_rate": 4.773156153373969e-06, + "loss": 0.6032, + "step": 3174 + }, + { + "epoch": 0.8429576529934952, + "grad_norm": 0.3900570347682403, + "learning_rate": 4.773010816059186e-06, + "loss": 0.6157, + "step": 3175 + }, + { + "epoch": 0.8432231514668791, + "grad_norm": 0.38721075626040485, + "learning_rate": 4.772865434415127e-06, + "loss": 0.6215, + "step": 3176 + }, + { + "epoch": 0.8434886499402628, + "grad_norm": 0.37977109955804594, + "learning_rate": 4.772720008444627e-06, + "loss": 0.6251, + "step": 3177 + }, + { + "epoch": 0.8437541484136466, + "grad_norm": 0.37568541974395564, + "learning_rate": 4.7725745381505224e-06, + "loss": 0.6136, + "step": 3178 + }, + { + "epoch": 0.8440196468870304, + "grad_norm": 0.38619740189382623, + "learning_rate": 4.772429023535649e-06, + "loss": 0.6187, + "step": 3179 + }, + { + "epoch": 0.8442851453604142, + "grad_norm": 0.4080744813654574, + "learning_rate": 4.772283464602848e-06, + "loss": 0.615, + "step": 3180 + }, + { + "epoch": 0.844550643833798, + "grad_norm": 0.39936099484356835, + "learning_rate": 4.772137861354954e-06, + "loss": 0.638, + "step": 3181 + }, + { + "epoch": 0.8448161423071817, + "grad_norm": 0.3864016093507137, + "learning_rate": 4.771992213794809e-06, + "loss": 0.6085, + "step": 3182 + }, + { + "epoch": 0.8450816407805655, + "grad_norm": 0.40017897090729204, + "learning_rate": 4.771846521925254e-06, + "loss": 0.5825, + "step": 3183 + }, + { + "epoch": 0.8453471392539493, + "grad_norm": 0.4003068420345821, + "learning_rate": 4.771700785749128e-06, + "loss": 0.653, + "step": 3184 + }, + { + "epoch": 0.8456126377273331, + "grad_norm": 0.3769586394165981, + "learning_rate": 4.771555005269276e-06, + "loss": 0.6215, + "step": 3185 + }, + { + "epoch": 0.8458781362007168, + "grad_norm": 0.38817502344943944, + "learning_rate": 4.7714091804885386e-06, + "loss": 0.6301, + "step": 3186 + }, + { + "epoch": 0.8461436346741006, + "grad_norm": 0.3942822918160568, + "learning_rate": 4.7712633114097615e-06, + "loss": 0.6495, + "step": 3187 + }, + { + "epoch": 0.8464091331474844, + "grad_norm": 0.38853609691553753, + "learning_rate": 4.7711173980357886e-06, + "loss": 0.65, + "step": 3188 + }, + { + "epoch": 0.8466746316208682, + "grad_norm": 0.39466017653366076, + "learning_rate": 4.770971440369465e-06, + "loss": 0.6282, + "step": 3189 + }, + { + "epoch": 0.846940130094252, + "grad_norm": 0.37338410635228586, + "learning_rate": 4.770825438413639e-06, + "loss": 0.5808, + "step": 3190 + }, + { + "epoch": 0.8472056285676357, + "grad_norm": 0.3784067564181116, + "learning_rate": 4.770679392171156e-06, + "loss": 0.6213, + "step": 3191 + }, + { + "epoch": 0.8474711270410196, + "grad_norm": 0.43627897176446817, + "learning_rate": 4.770533301644866e-06, + "loss": 0.5834, + "step": 3192 + }, + { + "epoch": 0.8477366255144033, + "grad_norm": 0.3889817148927191, + "learning_rate": 4.770387166837618e-06, + "loss": 0.5672, + "step": 3193 + }, + { + "epoch": 0.8480021239877871, + "grad_norm": 0.3792133359844391, + "learning_rate": 4.77024098775226e-06, + "loss": 0.6271, + "step": 3194 + }, + { + "epoch": 0.8482676224611708, + "grad_norm": 0.3888465738791731, + "learning_rate": 4.770094764391645e-06, + "loss": 0.564, + "step": 3195 + }, + { + "epoch": 0.8485331209345546, + "grad_norm": 0.3920221713124069, + "learning_rate": 4.769948496758624e-06, + "loss": 0.6149, + "step": 3196 + }, + { + "epoch": 0.8487986194079384, + "grad_norm": 0.42496371198852184, + "learning_rate": 4.7698021848560495e-06, + "loss": 0.6031, + "step": 3197 + }, + { + "epoch": 0.8490641178813222, + "grad_norm": 0.38540140272960993, + "learning_rate": 4.769655828686775e-06, + "loss": 0.6429, + "step": 3198 + }, + { + "epoch": 0.8493296163547059, + "grad_norm": 0.4314829857035156, + "learning_rate": 4.769509428253655e-06, + "loss": 0.6424, + "step": 3199 + }, + { + "epoch": 0.8495951148280897, + "grad_norm": 0.39222652379073375, + "learning_rate": 4.769362983559544e-06, + "loss": 0.5902, + "step": 3200 + }, + { + "epoch": 0.8498606133014736, + "grad_norm": 0.4012979678910393, + "learning_rate": 4.769216494607298e-06, + "loss": 0.6151, + "step": 3201 + }, + { + "epoch": 0.8501261117748573, + "grad_norm": 0.38349179695624713, + "learning_rate": 4.7690699613997745e-06, + "loss": 0.6028, + "step": 3202 + }, + { + "epoch": 0.8503916102482411, + "grad_norm": 0.41836772041991915, + "learning_rate": 4.768923383939832e-06, + "loss": 0.5919, + "step": 3203 + }, + { + "epoch": 0.8506571087216248, + "grad_norm": 0.39372308711913767, + "learning_rate": 4.768776762230328e-06, + "loss": 0.6112, + "step": 3204 + }, + { + "epoch": 0.8509226071950087, + "grad_norm": 0.3951894127325376, + "learning_rate": 4.768630096274121e-06, + "loss": 0.6077, + "step": 3205 + }, + { + "epoch": 0.8511881056683924, + "grad_norm": 0.39088160061797267, + "learning_rate": 4.7684833860740746e-06, + "loss": 0.6161, + "step": 3206 + }, + { + "epoch": 0.8514536041417762, + "grad_norm": 0.38585445036304644, + "learning_rate": 4.768336631633046e-06, + "loss": 0.6118, + "step": 3207 + }, + { + "epoch": 0.8517191026151599, + "grad_norm": 0.38547069026437336, + "learning_rate": 4.7681898329539004e-06, + "loss": 0.6158, + "step": 3208 + }, + { + "epoch": 0.8519846010885438, + "grad_norm": 0.39136239017861296, + "learning_rate": 4.768042990039499e-06, + "loss": 0.6758, + "step": 3209 + }, + { + "epoch": 0.8522500995619275, + "grad_norm": 0.3811011756459646, + "learning_rate": 4.767896102892706e-06, + "loss": 0.6718, + "step": 3210 + }, + { + "epoch": 0.8525155980353113, + "grad_norm": 0.38935438062194233, + "learning_rate": 4.767749171516387e-06, + "loss": 0.6134, + "step": 3211 + }, + { + "epoch": 0.852781096508695, + "grad_norm": 0.3931317395063905, + "learning_rate": 4.767602195913407e-06, + "loss": 0.6247, + "step": 3212 + }, + { + "epoch": 0.8530465949820788, + "grad_norm": 0.40413026725746803, + "learning_rate": 4.767455176086631e-06, + "loss": 0.617, + "step": 3213 + }, + { + "epoch": 0.8533120934554627, + "grad_norm": 0.3768135899049395, + "learning_rate": 4.767308112038928e-06, + "loss": 0.6218, + "step": 3214 + }, + { + "epoch": 0.8535775919288464, + "grad_norm": 0.4069314027079967, + "learning_rate": 4.767161003773165e-06, + "loss": 0.6224, + "step": 3215 + }, + { + "epoch": 0.8538430904022302, + "grad_norm": 0.39850720930153566, + "learning_rate": 4.767013851292213e-06, + "loss": 0.6111, + "step": 3216 + }, + { + "epoch": 0.8541085888756139, + "grad_norm": 0.4005002568480204, + "learning_rate": 4.766866654598938e-06, + "loss": 0.6544, + "step": 3217 + }, + { + "epoch": 0.8543740873489978, + "grad_norm": 0.44980922914111277, + "learning_rate": 4.766719413696215e-06, + "loss": 0.6142, + "step": 3218 + }, + { + "epoch": 0.8546395858223815, + "grad_norm": 0.38353505496742896, + "learning_rate": 4.7665721285869124e-06, + "loss": 0.5871, + "step": 3219 + }, + { + "epoch": 0.8549050842957653, + "grad_norm": 0.3764916703833999, + "learning_rate": 4.766424799273905e-06, + "loss": 0.5943, + "step": 3220 + }, + { + "epoch": 0.855170582769149, + "grad_norm": 0.38866086443464615, + "learning_rate": 4.766277425760063e-06, + "loss": 0.6284, + "step": 3221 + }, + { + "epoch": 0.8554360812425329, + "grad_norm": 0.392596700224273, + "learning_rate": 4.7661300080482635e-06, + "loss": 0.6329, + "step": 3222 + }, + { + "epoch": 0.8557015797159166, + "grad_norm": 0.4082030501237004, + "learning_rate": 4.76598254614138e-06, + "loss": 0.5855, + "step": 3223 + }, + { + "epoch": 0.8559670781893004, + "grad_norm": 0.41373048475368934, + "learning_rate": 4.765835040042288e-06, + "loss": 0.5874, + "step": 3224 + }, + { + "epoch": 0.8562325766626842, + "grad_norm": 0.38405175079068105, + "learning_rate": 4.765687489753866e-06, + "loss": 0.6002, + "step": 3225 + }, + { + "epoch": 0.856498075136068, + "grad_norm": 0.3854283651083716, + "learning_rate": 4.765539895278991e-06, + "loss": 0.6149, + "step": 3226 + }, + { + "epoch": 0.8567635736094518, + "grad_norm": 0.4031568729841069, + "learning_rate": 4.7653922566205414e-06, + "loss": 0.5757, + "step": 3227 + }, + { + "epoch": 0.8570290720828355, + "grad_norm": 0.3836051323887502, + "learning_rate": 4.765244573781394e-06, + "loss": 0.6389, + "step": 3228 + }, + { + "epoch": 0.8572945705562193, + "grad_norm": 0.3867823167628796, + "learning_rate": 4.765096846764433e-06, + "loss": 0.6013, + "step": 3229 + }, + { + "epoch": 0.8575600690296031, + "grad_norm": 0.4094187066410639, + "learning_rate": 4.764949075572537e-06, + "loss": 0.6035, + "step": 3230 + }, + { + "epoch": 0.8578255675029869, + "grad_norm": 0.4031456084406773, + "learning_rate": 4.764801260208589e-06, + "loss": 0.6202, + "step": 3231 + }, + { + "epoch": 0.8580910659763706, + "grad_norm": 0.39175898887486005, + "learning_rate": 4.7646534006754705e-06, + "loss": 0.6309, + "step": 3232 + }, + { + "epoch": 0.8583565644497544, + "grad_norm": 0.3905216451561184, + "learning_rate": 4.764505496976066e-06, + "loss": 0.6104, + "step": 3233 + }, + { + "epoch": 0.8586220629231381, + "grad_norm": 0.4059328610186368, + "learning_rate": 4.764357549113261e-06, + "loss": 0.621, + "step": 3234 + }, + { + "epoch": 0.858887561396522, + "grad_norm": 0.3959113807772086, + "learning_rate": 4.764209557089938e-06, + "loss": 0.6541, + "step": 3235 + }, + { + "epoch": 0.8591530598699058, + "grad_norm": 0.37569948041243706, + "learning_rate": 4.764061520908986e-06, + "loss": 0.6351, + "step": 3236 + }, + { + "epoch": 0.8594185583432895, + "grad_norm": 0.3933463708060713, + "learning_rate": 4.76391344057329e-06, + "loss": 0.5878, + "step": 3237 + }, + { + "epoch": 0.8596840568166733, + "grad_norm": 0.3967413311947283, + "learning_rate": 4.763765316085739e-06, + "loss": 0.6263, + "step": 3238 + }, + { + "epoch": 0.8599495552900571, + "grad_norm": 0.38374454956475107, + "learning_rate": 4.763617147449222e-06, + "loss": 0.6187, + "step": 3239 + }, + { + "epoch": 0.8602150537634409, + "grad_norm": 0.39735130900058785, + "learning_rate": 4.763468934666629e-06, + "loss": 0.6155, + "step": 3240 + }, + { + "epoch": 0.8604805522368246, + "grad_norm": 0.3858536539765493, + "learning_rate": 4.763320677740849e-06, + "loss": 0.6341, + "step": 3241 + }, + { + "epoch": 0.8607460507102084, + "grad_norm": 0.37599079648790223, + "learning_rate": 4.763172376674775e-06, + "loss": 0.6122, + "step": 3242 + }, + { + "epoch": 0.8610115491835922, + "grad_norm": 0.38521902853267986, + "learning_rate": 4.763024031471298e-06, + "loss": 0.5676, + "step": 3243 + }, + { + "epoch": 0.861277047656976, + "grad_norm": 0.39153599125625715, + "learning_rate": 4.762875642133311e-06, + "loss": 0.5694, + "step": 3244 + }, + { + "epoch": 0.8615425461303597, + "grad_norm": 0.3823507852988624, + "learning_rate": 4.7627272086637085e-06, + "loss": 0.5846, + "step": 3245 + }, + { + "epoch": 0.8618080446037435, + "grad_norm": 0.40803509453668707, + "learning_rate": 4.762578731065387e-06, + "loss": 0.6291, + "step": 3246 + }, + { + "epoch": 0.8620735430771274, + "grad_norm": 0.40715214908263025, + "learning_rate": 4.762430209341239e-06, + "loss": 0.6618, + "step": 3247 + }, + { + "epoch": 0.8623390415505111, + "grad_norm": 0.3945799129136733, + "learning_rate": 4.762281643494163e-06, + "loss": 0.6323, + "step": 3248 + }, + { + "epoch": 0.8626045400238949, + "grad_norm": 0.3882279641623884, + "learning_rate": 4.762133033527056e-06, + "loss": 0.6015, + "step": 3249 + }, + { + "epoch": 0.8628700384972786, + "grad_norm": 0.38923969820469684, + "learning_rate": 4.761984379442816e-06, + "loss": 0.6155, + "step": 3250 + }, + { + "epoch": 0.8631355369706624, + "grad_norm": 0.38452506604652875, + "learning_rate": 4.761835681244342e-06, + "loss": 0.6291, + "step": 3251 + }, + { + "epoch": 0.8634010354440462, + "grad_norm": 0.39722003852402576, + "learning_rate": 4.761686938934534e-06, + "loss": 0.6235, + "step": 3252 + }, + { + "epoch": 0.86366653391743, + "grad_norm": 0.39554783917357644, + "learning_rate": 4.761538152516294e-06, + "loss": 0.6092, + "step": 3253 + }, + { + "epoch": 0.8639320323908137, + "grad_norm": 0.4056793370090963, + "learning_rate": 4.761389321992522e-06, + "loss": 0.6271, + "step": 3254 + }, + { + "epoch": 0.8641975308641975, + "grad_norm": 0.39004068585232793, + "learning_rate": 4.7612404473661225e-06, + "loss": 0.6239, + "step": 3255 + }, + { + "epoch": 0.8644630293375813, + "grad_norm": 0.37458859211804146, + "learning_rate": 4.7610915286399976e-06, + "loss": 0.6052, + "step": 3256 + }, + { + "epoch": 0.8647285278109651, + "grad_norm": 0.3854551154278075, + "learning_rate": 4.760942565817051e-06, + "loss": 0.5855, + "step": 3257 + }, + { + "epoch": 0.8649940262843488, + "grad_norm": 0.39172330553477347, + "learning_rate": 4.7607935589001884e-06, + "loss": 0.5993, + "step": 3258 + }, + { + "epoch": 0.8652595247577326, + "grad_norm": 0.38955921923922726, + "learning_rate": 4.760644507892317e-06, + "loss": 0.6043, + "step": 3259 + }, + { + "epoch": 0.8655250232311165, + "grad_norm": 0.38910605908125334, + "learning_rate": 4.7604954127963425e-06, + "loss": 0.6279, + "step": 3260 + }, + { + "epoch": 0.8657905217045002, + "grad_norm": 0.38661001591718847, + "learning_rate": 4.7603462736151725e-06, + "loss": 0.6092, + "step": 3261 + }, + { + "epoch": 0.866056020177884, + "grad_norm": 0.3775927695644392, + "learning_rate": 4.760197090351716e-06, + "loss": 0.6236, + "step": 3262 + }, + { + "epoch": 0.8663215186512677, + "grad_norm": 0.38930640013674983, + "learning_rate": 4.760047863008883e-06, + "loss": 0.5734, + "step": 3263 + }, + { + "epoch": 0.8665870171246516, + "grad_norm": 0.39780941997169983, + "learning_rate": 4.759898591589582e-06, + "loss": 0.5673, + "step": 3264 + }, + { + "epoch": 0.8668525155980353, + "grad_norm": 0.383324042561951, + "learning_rate": 4.759749276096726e-06, + "loss": 0.6068, + "step": 3265 + }, + { + "epoch": 0.8671180140714191, + "grad_norm": 0.3968484934702401, + "learning_rate": 4.759599916533226e-06, + "loss": 0.6243, + "step": 3266 + }, + { + "epoch": 0.8673835125448028, + "grad_norm": 0.37545830597412905, + "learning_rate": 4.759450512901995e-06, + "loss": 0.5757, + "step": 3267 + }, + { + "epoch": 0.8676490110181867, + "grad_norm": 0.4104189431074705, + "learning_rate": 4.759301065205947e-06, + "loss": 0.6375, + "step": 3268 + }, + { + "epoch": 0.8679145094915705, + "grad_norm": 0.3834421916652933, + "learning_rate": 4.759151573447996e-06, + "loss": 0.6068, + "step": 3269 + }, + { + "epoch": 0.8681800079649542, + "grad_norm": 0.393487425142642, + "learning_rate": 4.75900203763106e-06, + "loss": 0.6218, + "step": 3270 + }, + { + "epoch": 0.868445506438338, + "grad_norm": 0.3808668639964955, + "learning_rate": 4.7588524577580505e-06, + "loss": 0.6066, + "step": 3271 + }, + { + "epoch": 0.8687110049117217, + "grad_norm": 0.38748737323108323, + "learning_rate": 4.75870283383189e-06, + "loss": 0.5993, + "step": 3272 + }, + { + "epoch": 0.8689765033851056, + "grad_norm": 0.3936400929341232, + "learning_rate": 4.758553165855492e-06, + "loss": 0.6576, + "step": 3273 + }, + { + "epoch": 0.8692420018584893, + "grad_norm": 0.40152461480424106, + "learning_rate": 4.758403453831778e-06, + "loss": 0.6477, + "step": 3274 + }, + { + "epoch": 0.8695075003318731, + "grad_norm": 0.3761131885945365, + "learning_rate": 4.758253697763668e-06, + "loss": 0.5842, + "step": 3275 + }, + { + "epoch": 0.8697729988052568, + "grad_norm": 0.3909990154254638, + "learning_rate": 4.758103897654081e-06, + "loss": 0.625, + "step": 3276 + }, + { + "epoch": 0.8700384972786407, + "grad_norm": 0.38964355475994894, + "learning_rate": 4.757954053505939e-06, + "loss": 0.5957, + "step": 3277 + }, + { + "epoch": 0.8703039957520244, + "grad_norm": 0.3880481326523315, + "learning_rate": 4.757804165322165e-06, + "loss": 0.6075, + "step": 3278 + }, + { + "epoch": 0.8705694942254082, + "grad_norm": 0.3894186819851033, + "learning_rate": 4.7576542331056814e-06, + "loss": 0.6033, + "step": 3279 + }, + { + "epoch": 0.8708349926987919, + "grad_norm": 0.38956140968114766, + "learning_rate": 4.757504256859412e-06, + "loss": 0.5867, + "step": 3280 + }, + { + "epoch": 0.8711004911721758, + "grad_norm": 0.39246319443338373, + "learning_rate": 4.757354236586283e-06, + "loss": 0.6222, + "step": 3281 + }, + { + "epoch": 0.8713659896455596, + "grad_norm": 0.39588894736088304, + "learning_rate": 4.75720417228922e-06, + "loss": 0.6361, + "step": 3282 + }, + { + "epoch": 0.8716314881189433, + "grad_norm": 0.3897914130351932, + "learning_rate": 4.757054063971148e-06, + "loss": 0.6113, + "step": 3283 + }, + { + "epoch": 0.8718969865923271, + "grad_norm": 0.3939227850561614, + "learning_rate": 4.756903911634996e-06, + "loss": 0.6071, + "step": 3284 + }, + { + "epoch": 0.8721624850657109, + "grad_norm": 0.4076397357244799, + "learning_rate": 4.7567537152836915e-06, + "loss": 0.6469, + "step": 3285 + }, + { + "epoch": 0.8724279835390947, + "grad_norm": 0.4119187367460943, + "learning_rate": 4.756603474920165e-06, + "loss": 0.5813, + "step": 3286 + }, + { + "epoch": 0.8726934820124784, + "grad_norm": 0.39519397283206353, + "learning_rate": 4.756453190547344e-06, + "loss": 0.6334, + "step": 3287 + }, + { + "epoch": 0.8729589804858622, + "grad_norm": 0.3871710551227271, + "learning_rate": 4.756302862168163e-06, + "loss": 0.6347, + "step": 3288 + }, + { + "epoch": 0.8732244789592459, + "grad_norm": 0.38669326400346216, + "learning_rate": 4.7561524897855504e-06, + "loss": 0.6172, + "step": 3289 + }, + { + "epoch": 0.8734899774326298, + "grad_norm": 0.39639256465687545, + "learning_rate": 4.756002073402441e-06, + "loss": 0.6305, + "step": 3290 + }, + { + "epoch": 0.8737554759060135, + "grad_norm": 0.39930090714223104, + "learning_rate": 4.755851613021767e-06, + "loss": 0.6039, + "step": 3291 + }, + { + "epoch": 0.8740209743793973, + "grad_norm": 0.3699246895708019, + "learning_rate": 4.7557011086464634e-06, + "loss": 0.5902, + "step": 3292 + }, + { + "epoch": 0.874286472852781, + "grad_norm": 0.3915798091559982, + "learning_rate": 4.755550560279465e-06, + "loss": 0.6154, + "step": 3293 + }, + { + "epoch": 0.8745519713261649, + "grad_norm": 0.41677536311013336, + "learning_rate": 4.755399967923709e-06, + "loss": 0.6049, + "step": 3294 + }, + { + "epoch": 0.8748174697995487, + "grad_norm": 0.3949964007316087, + "learning_rate": 4.755249331582131e-06, + "loss": 0.6511, + "step": 3295 + }, + { + "epoch": 0.8750829682729324, + "grad_norm": 0.39726565234475836, + "learning_rate": 4.755098651257669e-06, + "loss": 0.5981, + "step": 3296 + }, + { + "epoch": 0.8753484667463162, + "grad_norm": 0.4127253070033059, + "learning_rate": 4.7549479269532615e-06, + "loss": 0.608, + "step": 3297 + }, + { + "epoch": 0.8756139652197, + "grad_norm": 0.38828082155939475, + "learning_rate": 4.754797158671849e-06, + "loss": 0.6173, + "step": 3298 + }, + { + "epoch": 0.8758794636930838, + "grad_norm": 0.38841361287621073, + "learning_rate": 4.754646346416371e-06, + "loss": 0.5959, + "step": 3299 + }, + { + "epoch": 0.8761449621664675, + "grad_norm": 0.38566806534210335, + "learning_rate": 4.754495490189769e-06, + "loss": 0.6195, + "step": 3300 + }, + { + "epoch": 0.8764104606398513, + "grad_norm": 0.3907819137410776, + "learning_rate": 4.754344589994985e-06, + "loss": 0.6131, + "step": 3301 + }, + { + "epoch": 0.8766759591132351, + "grad_norm": 0.38308247558070607, + "learning_rate": 4.754193645834963e-06, + "loss": 0.598, + "step": 3302 + }, + { + "epoch": 0.8769414575866189, + "grad_norm": 0.40440391073340404, + "learning_rate": 4.754042657712643e-06, + "loss": 0.5883, + "step": 3303 + }, + { + "epoch": 0.8772069560600027, + "grad_norm": 0.39233713811219384, + "learning_rate": 4.753891625630975e-06, + "loss": 0.6387, + "step": 3304 + }, + { + "epoch": 0.8774724545333864, + "grad_norm": 0.39130254882310145, + "learning_rate": 4.7537405495929e-06, + "loss": 0.6092, + "step": 3305 + }, + { + "epoch": 0.8777379530067703, + "grad_norm": 0.39412519967687676, + "learning_rate": 4.753589429601367e-06, + "loss": 0.5972, + "step": 3306 + }, + { + "epoch": 0.878003451480154, + "grad_norm": 0.40428041354450067, + "learning_rate": 4.753438265659322e-06, + "loss": 0.5991, + "step": 3307 + }, + { + "epoch": 0.8782689499535378, + "grad_norm": 0.37504294957402573, + "learning_rate": 4.753287057769714e-06, + "loss": 0.6327, + "step": 3308 + }, + { + "epoch": 0.8785344484269215, + "grad_norm": 0.3839356147912363, + "learning_rate": 4.753135805935492e-06, + "loss": 0.6204, + "step": 3309 + }, + { + "epoch": 0.8787999469003053, + "grad_norm": 0.40923237791155004, + "learning_rate": 4.752984510159604e-06, + "loss": 0.592, + "step": 3310 + }, + { + "epoch": 0.8790654453736891, + "grad_norm": 0.39458197109693294, + "learning_rate": 4.752833170445002e-06, + "loss": 0.6846, + "step": 3311 + }, + { + "epoch": 0.8793309438470729, + "grad_norm": 0.3956103479023119, + "learning_rate": 4.752681786794637e-06, + "loss": 0.6084, + "step": 3312 + }, + { + "epoch": 0.8795964423204566, + "grad_norm": 0.3800192075546216, + "learning_rate": 4.752530359211462e-06, + "loss": 0.601, + "step": 3313 + }, + { + "epoch": 0.8798619407938404, + "grad_norm": 0.38879181208594316, + "learning_rate": 4.75237888769843e-06, + "loss": 0.616, + "step": 3314 + }, + { + "epoch": 0.8801274392672243, + "grad_norm": 0.388194199082558, + "learning_rate": 4.752227372258495e-06, + "loss": 0.6089, + "step": 3315 + }, + { + "epoch": 0.880392937740608, + "grad_norm": 0.3725905196810677, + "learning_rate": 4.752075812894611e-06, + "loss": 0.5965, + "step": 3316 + }, + { + "epoch": 0.8806584362139918, + "grad_norm": 0.39300519418278473, + "learning_rate": 4.751924209609735e-06, + "loss": 0.6254, + "step": 3317 + }, + { + "epoch": 0.8809239346873755, + "grad_norm": 0.4064612647247102, + "learning_rate": 4.751772562406824e-06, + "loss": 0.6028, + "step": 3318 + }, + { + "epoch": 0.8811894331607594, + "grad_norm": 0.38658665977171214, + "learning_rate": 4.751620871288834e-06, + "loss": 0.6104, + "step": 3319 + }, + { + "epoch": 0.8814549316341431, + "grad_norm": 0.39343046388128616, + "learning_rate": 4.7514691362587245e-06, + "loss": 0.6232, + "step": 3320 + }, + { + "epoch": 0.8817204301075269, + "grad_norm": 0.4038024334947061, + "learning_rate": 4.751317357319454e-06, + "loss": 0.6324, + "step": 3321 + }, + { + "epoch": 0.8819859285809106, + "grad_norm": 0.4241980139379008, + "learning_rate": 4.751165534473983e-06, + "loss": 0.6009, + "step": 3322 + }, + { + "epoch": 0.8822514270542945, + "grad_norm": 0.3860880812837124, + "learning_rate": 4.7510136677252724e-06, + "loss": 0.5575, + "step": 3323 + }, + { + "epoch": 0.8825169255276782, + "grad_norm": 0.39684502196861987, + "learning_rate": 4.7508617570762836e-06, + "loss": 0.6574, + "step": 3324 + }, + { + "epoch": 0.882782424001062, + "grad_norm": 0.4076841287759156, + "learning_rate": 4.750709802529979e-06, + "loss": 0.6226, + "step": 3325 + }, + { + "epoch": 0.8830479224744457, + "grad_norm": 0.49457788706324485, + "learning_rate": 4.7505578040893236e-06, + "loss": 0.5857, + "step": 3326 + }, + { + "epoch": 0.8833134209478295, + "grad_norm": 0.3664050501234694, + "learning_rate": 4.75040576175728e-06, + "loss": 0.6097, + "step": 3327 + }, + { + "epoch": 0.8835789194212134, + "grad_norm": 0.3796588890867391, + "learning_rate": 4.750253675536815e-06, + "loss": 0.6101, + "step": 3328 + }, + { + "epoch": 0.8838444178945971, + "grad_norm": 0.4578942762009108, + "learning_rate": 4.750101545430893e-06, + "loss": 0.6186, + "step": 3329 + }, + { + "epoch": 0.8841099163679809, + "grad_norm": 0.44615251759776303, + "learning_rate": 4.749949371442481e-06, + "loss": 0.6068, + "step": 3330 + }, + { + "epoch": 0.8843754148413646, + "grad_norm": 0.38671244008324474, + "learning_rate": 4.749797153574548e-06, + "loss": 0.6161, + "step": 3331 + }, + { + "epoch": 0.8846409133147485, + "grad_norm": 0.4137124586297112, + "learning_rate": 4.749644891830063e-06, + "loss": 0.6128, + "step": 3332 + }, + { + "epoch": 0.8849064117881322, + "grad_norm": 0.47396625417379273, + "learning_rate": 4.749492586211994e-06, + "loss": 0.6194, + "step": 3333 + }, + { + "epoch": 0.885171910261516, + "grad_norm": 0.4136404060666191, + "learning_rate": 4.749340236723311e-06, + "loss": 0.6364, + "step": 3334 + }, + { + "epoch": 0.8854374087348997, + "grad_norm": 0.39737095985235527, + "learning_rate": 4.749187843366987e-06, + "loss": 0.575, + "step": 3335 + }, + { + "epoch": 0.8857029072082836, + "grad_norm": 0.4016915774106672, + "learning_rate": 4.7490354061459926e-06, + "loss": 0.6214, + "step": 3336 + }, + { + "epoch": 0.8859684056816673, + "grad_norm": 0.3923140730107079, + "learning_rate": 4.7488829250633015e-06, + "loss": 0.6213, + "step": 3337 + }, + { + "epoch": 0.8862339041550511, + "grad_norm": 0.3950996626156589, + "learning_rate": 4.748730400121887e-06, + "loss": 0.6503, + "step": 3338 + }, + { + "epoch": 0.8864994026284349, + "grad_norm": 0.3865926499544235, + "learning_rate": 4.748577831324723e-06, + "loss": 0.6139, + "step": 3339 + }, + { + "epoch": 0.8867649011018187, + "grad_norm": 0.3876237606170458, + "learning_rate": 4.748425218674787e-06, + "loss": 0.6486, + "step": 3340 + }, + { + "epoch": 0.8870303995752025, + "grad_norm": 0.39219201650548463, + "learning_rate": 4.748272562175054e-06, + "loss": 0.6845, + "step": 3341 + }, + { + "epoch": 0.8872958980485862, + "grad_norm": 0.4229854213010335, + "learning_rate": 4.748119861828501e-06, + "loss": 0.5981, + "step": 3342 + }, + { + "epoch": 0.88756139652197, + "grad_norm": 0.3867845738529824, + "learning_rate": 4.747967117638107e-06, + "loss": 0.6451, + "step": 3343 + }, + { + "epoch": 0.8878268949953537, + "grad_norm": 0.4103182291847476, + "learning_rate": 4.7478143296068494e-06, + "loss": 0.6363, + "step": 3344 + }, + { + "epoch": 0.8880923934687376, + "grad_norm": 0.3923960939700569, + "learning_rate": 4.747661497737709e-06, + "loss": 0.5832, + "step": 3345 + }, + { + "epoch": 0.8883578919421213, + "grad_norm": 0.39655784312828, + "learning_rate": 4.747508622033666e-06, + "loss": 0.629, + "step": 3346 + }, + { + "epoch": 0.8886233904155051, + "grad_norm": 0.39563218181553156, + "learning_rate": 4.747355702497702e-06, + "loss": 0.6217, + "step": 3347 + }, + { + "epoch": 0.8888888888888888, + "grad_norm": 0.3894663617075761, + "learning_rate": 4.7472027391328e-06, + "loss": 0.6236, + "step": 3348 + }, + { + "epoch": 0.8891543873622727, + "grad_norm": 0.3794840489106913, + "learning_rate": 4.747049731941943e-06, + "loss": 0.6108, + "step": 3349 + }, + { + "epoch": 0.8894198858356565, + "grad_norm": 0.406072941096683, + "learning_rate": 4.746896680928113e-06, + "loss": 0.6348, + "step": 3350 + }, + { + "epoch": 0.8896853843090402, + "grad_norm": 0.3906669065421325, + "learning_rate": 4.746743586094297e-06, + "loss": 0.5816, + "step": 3351 + }, + { + "epoch": 0.889950882782424, + "grad_norm": 0.4005635751735138, + "learning_rate": 4.7465904474434795e-06, + "loss": 0.6032, + "step": 3352 + }, + { + "epoch": 0.8902163812558078, + "grad_norm": 0.40568533560202646, + "learning_rate": 4.746437264978649e-06, + "loss": 0.6506, + "step": 3353 + }, + { + "epoch": 0.8904818797291916, + "grad_norm": 0.3867746098250967, + "learning_rate": 4.7462840387027905e-06, + "loss": 0.6336, + "step": 3354 + }, + { + "epoch": 0.8907473782025753, + "grad_norm": 0.4134811903339993, + "learning_rate": 4.746130768618894e-06, + "loss": 0.5994, + "step": 3355 + }, + { + "epoch": 0.8910128766759591, + "grad_norm": 0.3959572858923188, + "learning_rate": 4.745977454729947e-06, + "loss": 0.5902, + "step": 3356 + }, + { + "epoch": 0.8912783751493429, + "grad_norm": 0.36989418847389577, + "learning_rate": 4.7458240970389415e-06, + "loss": 0.5872, + "step": 3357 + }, + { + "epoch": 0.8915438736227267, + "grad_norm": 0.39633422708202654, + "learning_rate": 4.7456706955488674e-06, + "loss": 0.6096, + "step": 3358 + }, + { + "epoch": 0.8918093720961104, + "grad_norm": 0.40682819594508984, + "learning_rate": 4.745517250262717e-06, + "loss": 0.6215, + "step": 3359 + }, + { + "epoch": 0.8920748705694942, + "grad_norm": 0.39040005074295636, + "learning_rate": 4.7453637611834805e-06, + "loss": 0.5893, + "step": 3360 + }, + { + "epoch": 0.8923403690428781, + "grad_norm": 0.38470472898196983, + "learning_rate": 4.7452102283141545e-06, + "loss": 0.5662, + "step": 3361 + }, + { + "epoch": 0.8926058675162618, + "grad_norm": 0.38319021046409607, + "learning_rate": 4.745056651657731e-06, + "loss": 0.584, + "step": 3362 + }, + { + "epoch": 0.8928713659896456, + "grad_norm": 0.3924579676587756, + "learning_rate": 4.744903031217206e-06, + "loss": 0.6517, + "step": 3363 + }, + { + "epoch": 0.8931368644630293, + "grad_norm": 0.3940493281798428, + "learning_rate": 4.744749366995576e-06, + "loss": 0.5955, + "step": 3364 + }, + { + "epoch": 0.8934023629364131, + "grad_norm": 0.39069459108657206, + "learning_rate": 4.744595658995837e-06, + "loss": 0.6208, + "step": 3365 + }, + { + "epoch": 0.8936678614097969, + "grad_norm": 0.41369043512590986, + "learning_rate": 4.744441907220987e-06, + "loss": 0.641, + "step": 3366 + }, + { + "epoch": 0.8939333598831807, + "grad_norm": 0.3848688934330599, + "learning_rate": 4.744288111674023e-06, + "loss": 0.5991, + "step": 3367 + }, + { + "epoch": 0.8941988583565644, + "grad_norm": 0.39001922379455345, + "learning_rate": 4.744134272357948e-06, + "loss": 0.624, + "step": 3368 + }, + { + "epoch": 0.8944643568299482, + "grad_norm": 0.3940172810090595, + "learning_rate": 4.74398038927576e-06, + "loss": 0.6004, + "step": 3369 + }, + { + "epoch": 0.894729855303332, + "grad_norm": 0.37253415916519955, + "learning_rate": 4.743826462430459e-06, + "loss": 0.5825, + "step": 3370 + }, + { + "epoch": 0.8949953537767158, + "grad_norm": 0.3808815856012623, + "learning_rate": 4.7436724918250485e-06, + "loss": 0.6093, + "step": 3371 + }, + { + "epoch": 0.8952608522500995, + "grad_norm": 0.39620878451794717, + "learning_rate": 4.743518477462531e-06, + "loss": 0.6272, + "step": 3372 + }, + { + "epoch": 0.8955263507234833, + "grad_norm": 0.39922070142204164, + "learning_rate": 4.7433644193459104e-06, + "loss": 0.6125, + "step": 3373 + }, + { + "epoch": 0.8957918491968672, + "grad_norm": 0.3908614419935643, + "learning_rate": 4.7432103174781906e-06, + "loss": 0.6639, + "step": 3374 + }, + { + "epoch": 0.8960573476702509, + "grad_norm": 0.38921458118132785, + "learning_rate": 4.743056171862378e-06, + "loss": 0.6039, + "step": 3375 + }, + { + "epoch": 0.8963228461436347, + "grad_norm": 0.3945253800621873, + "learning_rate": 4.742901982501476e-06, + "loss": 0.6185, + "step": 3376 + }, + { + "epoch": 0.8965883446170184, + "grad_norm": 0.3828735579396043, + "learning_rate": 4.742747749398497e-06, + "loss": 0.6274, + "step": 3377 + }, + { + "epoch": 0.8968538430904023, + "grad_norm": 0.3895632511687009, + "learning_rate": 4.742593472556442e-06, + "loss": 0.6353, + "step": 3378 + }, + { + "epoch": 0.897119341563786, + "grad_norm": 0.3901858908952081, + "learning_rate": 4.742439151978326e-06, + "loss": 0.605, + "step": 3379 + }, + { + "epoch": 0.8973848400371698, + "grad_norm": 0.40535561330640746, + "learning_rate": 4.7422847876671565e-06, + "loss": 0.6002, + "step": 3380 + }, + { + "epoch": 0.8976503385105535, + "grad_norm": 0.3874398179605058, + "learning_rate": 4.742130379625942e-06, + "loss": 0.5923, + "step": 3381 + }, + { + "epoch": 0.8979158369839373, + "grad_norm": 0.3998501987431258, + "learning_rate": 4.741975927857696e-06, + "loss": 0.601, + "step": 3382 + }, + { + "epoch": 0.8981813354573212, + "grad_norm": 0.37363983255522465, + "learning_rate": 4.74182143236543e-06, + "loss": 0.6063, + "step": 3383 + }, + { + "epoch": 0.8984468339307049, + "grad_norm": 0.3855054525446527, + "learning_rate": 4.741666893152157e-06, + "loss": 0.6124, + "step": 3384 + }, + { + "epoch": 0.8987123324040887, + "grad_norm": 0.3905471841090725, + "learning_rate": 4.741512310220891e-06, + "loss": 0.6232, + "step": 3385 + }, + { + "epoch": 0.8989778308774724, + "grad_norm": 0.3794340488347295, + "learning_rate": 4.7413576835746474e-06, + "loss": 0.5728, + "step": 3386 + }, + { + "epoch": 0.8992433293508563, + "grad_norm": 0.39805627838134156, + "learning_rate": 4.741203013216441e-06, + "loss": 0.6178, + "step": 3387 + }, + { + "epoch": 0.89950882782424, + "grad_norm": 0.38134711964687046, + "learning_rate": 4.741048299149289e-06, + "loss": 0.5873, + "step": 3388 + }, + { + "epoch": 0.8997743262976238, + "grad_norm": 0.3874373189477116, + "learning_rate": 4.740893541376207e-06, + "loss": 0.5988, + "step": 3389 + }, + { + "epoch": 0.9000398247710075, + "grad_norm": 0.39219368851332287, + "learning_rate": 4.740738739900215e-06, + "loss": 0.6251, + "step": 3390 + }, + { + "epoch": 0.9003053232443914, + "grad_norm": 0.37806396193996067, + "learning_rate": 4.740583894724331e-06, + "loss": 0.5789, + "step": 3391 + }, + { + "epoch": 0.9005708217177751, + "grad_norm": 0.38366356775642907, + "learning_rate": 4.740429005851574e-06, + "loss": 0.6234, + "step": 3392 + }, + { + "epoch": 0.9008363201911589, + "grad_norm": 0.3809104078142495, + "learning_rate": 4.740274073284968e-06, + "loss": 0.5959, + "step": 3393 + }, + { + "epoch": 0.9011018186645426, + "grad_norm": 0.37604806239771565, + "learning_rate": 4.740119097027531e-06, + "loss": 0.5905, + "step": 3394 + }, + { + "epoch": 0.9013673171379265, + "grad_norm": 0.38587494226103675, + "learning_rate": 4.739964077082287e-06, + "loss": 0.6012, + "step": 3395 + }, + { + "epoch": 0.9016328156113103, + "grad_norm": 0.3753780403412113, + "learning_rate": 4.73980901345226e-06, + "loss": 0.6302, + "step": 3396 + }, + { + "epoch": 0.901898314084694, + "grad_norm": 0.3802983510698151, + "learning_rate": 4.739653906140473e-06, + "loss": 0.6305, + "step": 3397 + }, + { + "epoch": 0.9021638125580778, + "grad_norm": 0.3881969020196794, + "learning_rate": 4.73949875514995e-06, + "loss": 0.6142, + "step": 3398 + }, + { + "epoch": 0.9024293110314616, + "grad_norm": 0.40957065563154593, + "learning_rate": 4.7393435604837195e-06, + "loss": 0.6184, + "step": 3399 + }, + { + "epoch": 0.9026948095048454, + "grad_norm": 0.3796227686844207, + "learning_rate": 4.739188322144806e-06, + "loss": 0.5881, + "step": 3400 + }, + { + "epoch": 0.9029603079782291, + "grad_norm": 0.3912403746909842, + "learning_rate": 4.739033040136238e-06, + "loss": 0.6012, + "step": 3401 + }, + { + "epoch": 0.9032258064516129, + "grad_norm": 0.3953662526039944, + "learning_rate": 4.738877714461043e-06, + "loss": 0.6003, + "step": 3402 + }, + { + "epoch": 0.9034913049249966, + "grad_norm": 0.3906456290994063, + "learning_rate": 4.7387223451222515e-06, + "loss": 0.604, + "step": 3403 + }, + { + "epoch": 0.9037568033983805, + "grad_norm": 0.37655622906030306, + "learning_rate": 4.738566932122893e-06, + "loss": 0.5846, + "step": 3404 + }, + { + "epoch": 0.9040223018717642, + "grad_norm": 0.39770680349420695, + "learning_rate": 4.738411475465998e-06, + "loss": 0.6327, + "step": 3405 + }, + { + "epoch": 0.904287800345148, + "grad_norm": 0.3878671438040319, + "learning_rate": 4.738255975154599e-06, + "loss": 0.6029, + "step": 3406 + }, + { + "epoch": 0.9045532988185317, + "grad_norm": 0.4007513607231948, + "learning_rate": 4.7381004311917284e-06, + "loss": 0.5809, + "step": 3407 + }, + { + "epoch": 0.9048187972919156, + "grad_norm": 0.37434710592084747, + "learning_rate": 4.7379448435804184e-06, + "loss": 0.5807, + "step": 3408 + }, + { + "epoch": 0.9050842957652994, + "grad_norm": 0.40287565951271437, + "learning_rate": 4.7377892123237055e-06, + "loss": 0.6223, + "step": 3409 + }, + { + "epoch": 0.9053497942386831, + "grad_norm": 0.4027691649707259, + "learning_rate": 4.737633537424624e-06, + "loss": 0.6194, + "step": 3410 + }, + { + "epoch": 0.9056152927120669, + "grad_norm": 0.383371165090648, + "learning_rate": 4.7374778188862105e-06, + "loss": 0.6313, + "step": 3411 + }, + { + "epoch": 0.9058807911854507, + "grad_norm": 0.414209314183217, + "learning_rate": 4.7373220567115e-06, + "loss": 0.6125, + "step": 3412 + }, + { + "epoch": 0.9061462896588345, + "grad_norm": 0.38664384582759515, + "learning_rate": 4.737166250903532e-06, + "loss": 0.6299, + "step": 3413 + }, + { + "epoch": 0.9064117881322182, + "grad_norm": 0.4126284430607286, + "learning_rate": 4.737010401465344e-06, + "loss": 0.6024, + "step": 3414 + }, + { + "epoch": 0.906677286605602, + "grad_norm": 0.39055349680356477, + "learning_rate": 4.736854508399977e-06, + "loss": 0.6471, + "step": 3415 + }, + { + "epoch": 0.9069427850789858, + "grad_norm": 0.38250749709782844, + "learning_rate": 4.73669857171047e-06, + "loss": 0.6004, + "step": 3416 + }, + { + "epoch": 0.9072082835523696, + "grad_norm": 0.4311919139039721, + "learning_rate": 4.736542591399864e-06, + "loss": 0.5911, + "step": 3417 + }, + { + "epoch": 0.9074737820257534, + "grad_norm": 0.389457151028843, + "learning_rate": 4.736386567471202e-06, + "loss": 0.6431, + "step": 3418 + }, + { + "epoch": 0.9077392804991371, + "grad_norm": 0.37651842436689276, + "learning_rate": 4.736230499927525e-06, + "loss": 0.618, + "step": 3419 + }, + { + "epoch": 0.9080047789725209, + "grad_norm": 0.38980235627361093, + "learning_rate": 4.736074388771879e-06, + "loss": 0.5787, + "step": 3420 + }, + { + "epoch": 0.9082702774459047, + "grad_norm": 0.3935241800958016, + "learning_rate": 4.735918234007309e-06, + "loss": 0.6011, + "step": 3421 + }, + { + "epoch": 0.9085357759192885, + "grad_norm": 0.37928593590440823, + "learning_rate": 4.735762035636857e-06, + "loss": 0.5975, + "step": 3422 + }, + { + "epoch": 0.9088012743926722, + "grad_norm": 0.39108513017901275, + "learning_rate": 4.735605793663572e-06, + "loss": 0.6064, + "step": 3423 + }, + { + "epoch": 0.909066772866056, + "grad_norm": 0.39422797484336725, + "learning_rate": 4.7354495080904996e-06, + "loss": 0.6124, + "step": 3424 + }, + { + "epoch": 0.9093322713394398, + "grad_norm": 0.38992106166335516, + "learning_rate": 4.735293178920689e-06, + "loss": 0.6122, + "step": 3425 + }, + { + "epoch": 0.9095977698128236, + "grad_norm": 0.3811299990487383, + "learning_rate": 4.735136806157189e-06, + "loss": 0.6011, + "step": 3426 + }, + { + "epoch": 0.9098632682862073, + "grad_norm": 0.3985867372643103, + "learning_rate": 4.734980389803048e-06, + "loss": 0.6066, + "step": 3427 + }, + { + "epoch": 0.9101287667595911, + "grad_norm": 0.3906465069650725, + "learning_rate": 4.734823929861317e-06, + "loss": 0.5955, + "step": 3428 + }, + { + "epoch": 0.910394265232975, + "grad_norm": 0.3959664146770712, + "learning_rate": 4.734667426335048e-06, + "loss": 0.6121, + "step": 3429 + }, + { + "epoch": 0.9106597637063587, + "grad_norm": 0.40644770166392485, + "learning_rate": 4.734510879227293e-06, + "loss": 0.5867, + "step": 3430 + }, + { + "epoch": 0.9109252621797425, + "grad_norm": 0.40123775455126887, + "learning_rate": 4.734354288541104e-06, + "loss": 0.637, + "step": 3431 + }, + { + "epoch": 0.9111907606531262, + "grad_norm": 0.3817635893801134, + "learning_rate": 4.734197654279537e-06, + "loss": 0.6204, + "step": 3432 + }, + { + "epoch": 0.9114562591265101, + "grad_norm": 0.38367590525080303, + "learning_rate": 4.734040976445645e-06, + "loss": 0.5631, + "step": 3433 + }, + { + "epoch": 0.9117217575998938, + "grad_norm": 0.3918928428740401, + "learning_rate": 4.733884255042484e-06, + "loss": 0.5998, + "step": 3434 + }, + { + "epoch": 0.9119872560732776, + "grad_norm": 0.3838734025692161, + "learning_rate": 4.7337274900731105e-06, + "loss": 0.6316, + "step": 3435 + }, + { + "epoch": 0.9122527545466613, + "grad_norm": 0.3960968956208628, + "learning_rate": 4.733570681540581e-06, + "loss": 0.6327, + "step": 3436 + }, + { + "epoch": 0.9125182530200452, + "grad_norm": 0.3898099642594238, + "learning_rate": 4.7334138294479554e-06, + "loss": 0.5759, + "step": 3437 + }, + { + "epoch": 0.9127837514934289, + "grad_norm": 0.379897145027227, + "learning_rate": 4.733256933798292e-06, + "loss": 0.5834, + "step": 3438 + }, + { + "epoch": 0.9130492499668127, + "grad_norm": 0.3919447951120241, + "learning_rate": 4.7330999945946496e-06, + "loss": 0.6302, + "step": 3439 + }, + { + "epoch": 0.9133147484401964, + "grad_norm": 0.40362527250079516, + "learning_rate": 4.73294301184009e-06, + "loss": 0.5834, + "step": 3440 + }, + { + "epoch": 0.9135802469135802, + "grad_norm": 0.38717567999686087, + "learning_rate": 4.732785985537675e-06, + "loss": 0.6519, + "step": 3441 + }, + { + "epoch": 0.9138457453869641, + "grad_norm": 0.39825444352731054, + "learning_rate": 4.732628915690466e-06, + "loss": 0.6372, + "step": 3442 + }, + { + "epoch": 0.9141112438603478, + "grad_norm": 0.40355891459094184, + "learning_rate": 4.732471802301527e-06, + "loss": 0.6332, + "step": 3443 + }, + { + "epoch": 0.9143767423337316, + "grad_norm": 0.38584603358776676, + "learning_rate": 4.732314645373922e-06, + "loss": 0.5981, + "step": 3444 + }, + { + "epoch": 0.9146422408071153, + "grad_norm": 0.39800617543877354, + "learning_rate": 4.732157444910716e-06, + "loss": 0.6018, + "step": 3445 + }, + { + "epoch": 0.9149077392804992, + "grad_norm": 0.4027673253626878, + "learning_rate": 4.732000200914973e-06, + "loss": 0.5593, + "step": 3446 + }, + { + "epoch": 0.9151732377538829, + "grad_norm": 0.39155546175687, + "learning_rate": 4.7318429133897625e-06, + "loss": 0.6538, + "step": 3447 + }, + { + "epoch": 0.9154387362272667, + "grad_norm": 0.41234558121547255, + "learning_rate": 4.73168558233815e-06, + "loss": 0.5976, + "step": 3448 + }, + { + "epoch": 0.9157042347006504, + "grad_norm": 0.3828924708686552, + "learning_rate": 4.731528207763205e-06, + "loss": 0.6136, + "step": 3449 + }, + { + "epoch": 0.9159697331740343, + "grad_norm": 0.37303850616839945, + "learning_rate": 4.731370789667996e-06, + "loss": 0.6211, + "step": 3450 + }, + { + "epoch": 0.916235231647418, + "grad_norm": 0.4000994455317232, + "learning_rate": 4.731213328055594e-06, + "loss": 0.6245, + "step": 3451 + }, + { + "epoch": 0.9165007301208018, + "grad_norm": 0.4181567365359792, + "learning_rate": 4.7310558229290685e-06, + "loss": 0.6184, + "step": 3452 + }, + { + "epoch": 0.9167662285941856, + "grad_norm": 0.3854924393243661, + "learning_rate": 4.730898274291492e-06, + "loss": 0.6234, + "step": 3453 + }, + { + "epoch": 0.9170317270675694, + "grad_norm": 0.4100989351001039, + "learning_rate": 4.7307406821459364e-06, + "loss": 0.607, + "step": 3454 + }, + { + "epoch": 0.9172972255409532, + "grad_norm": 0.4339623701841104, + "learning_rate": 4.730583046495477e-06, + "loss": 0.626, + "step": 3455 + }, + { + "epoch": 0.9175627240143369, + "grad_norm": 0.3911102828515388, + "learning_rate": 4.730425367343186e-06, + "loss": 0.5989, + "step": 3456 + }, + { + "epoch": 0.9178282224877207, + "grad_norm": 0.4067230998516061, + "learning_rate": 4.730267644692139e-06, + "loss": 0.6503, + "step": 3457 + }, + { + "epoch": 0.9180937209611044, + "grad_norm": 0.40571298322688043, + "learning_rate": 4.7301098785454124e-06, + "loss": 0.617, + "step": 3458 + }, + { + "epoch": 0.9183592194344883, + "grad_norm": 0.391348128842071, + "learning_rate": 4.729952068906084e-06, + "loss": 0.6389, + "step": 3459 + }, + { + "epoch": 0.918624717907872, + "grad_norm": 0.39287424854153175, + "learning_rate": 4.729794215777229e-06, + "loss": 0.6264, + "step": 3460 + }, + { + "epoch": 0.9188902163812558, + "grad_norm": 0.3866999746446962, + "learning_rate": 4.729636319161928e-06, + "loss": 0.6513, + "step": 3461 + }, + { + "epoch": 0.9191557148546395, + "grad_norm": 0.39238238205886716, + "learning_rate": 4.72947837906326e-06, + "loss": 0.5746, + "step": 3462 + }, + { + "epoch": 0.9194212133280234, + "grad_norm": 0.390307595239034, + "learning_rate": 4.729320395484304e-06, + "loss": 0.6036, + "step": 3463 + }, + { + "epoch": 0.9196867118014072, + "grad_norm": 0.3843938231362498, + "learning_rate": 4.729162368428143e-06, + "loss": 0.5853, + "step": 3464 + }, + { + "epoch": 0.9199522102747909, + "grad_norm": 0.37906986999352327, + "learning_rate": 4.729004297897856e-06, + "loss": 0.6009, + "step": 3465 + }, + { + "epoch": 0.9202177087481747, + "grad_norm": 0.39581950280073036, + "learning_rate": 4.72884618389653e-06, + "loss": 0.6245, + "step": 3466 + }, + { + "epoch": 0.9204832072215585, + "grad_norm": 0.37726386807208134, + "learning_rate": 4.728688026427245e-06, + "loss": 0.6221, + "step": 3467 + }, + { + "epoch": 0.9207487056949423, + "grad_norm": 0.3869628549915921, + "learning_rate": 4.7285298254930865e-06, + "loss": 0.5813, + "step": 3468 + }, + { + "epoch": 0.921014204168326, + "grad_norm": 0.39145864774959555, + "learning_rate": 4.728371581097141e-06, + "loss": 0.6027, + "step": 3469 + }, + { + "epoch": 0.9212797026417098, + "grad_norm": 0.38039074340293794, + "learning_rate": 4.7282132932424926e-06, + "loss": 0.6149, + "step": 3470 + }, + { + "epoch": 0.9215452011150936, + "grad_norm": 0.3999847100913083, + "learning_rate": 4.72805496193223e-06, + "loss": 0.6304, + "step": 3471 + }, + { + "epoch": 0.9218106995884774, + "grad_norm": 0.39635395889112085, + "learning_rate": 4.72789658716944e-06, + "loss": 0.6612, + "step": 3472 + }, + { + "epoch": 0.9220761980618611, + "grad_norm": 0.3991550414705968, + "learning_rate": 4.7277381689572125e-06, + "loss": 0.6343, + "step": 3473 + }, + { + "epoch": 0.9223416965352449, + "grad_norm": 0.3798025780095887, + "learning_rate": 4.727579707298636e-06, + "loss": 0.623, + "step": 3474 + }, + { + "epoch": 0.9226071950086288, + "grad_norm": 0.37644007998739926, + "learning_rate": 4.727421202196802e-06, + "loss": 0.6183, + "step": 3475 + }, + { + "epoch": 0.9228726934820125, + "grad_norm": 0.3939535516255843, + "learning_rate": 4.727262653654799e-06, + "loss": 0.5954, + "step": 3476 + }, + { + "epoch": 0.9231381919553963, + "grad_norm": 0.38868606381712056, + "learning_rate": 4.727104061675722e-06, + "loss": 0.6121, + "step": 3477 + }, + { + "epoch": 0.92340369042878, + "grad_norm": 0.3985793206729489, + "learning_rate": 4.726945426262662e-06, + "loss": 0.6002, + "step": 3478 + }, + { + "epoch": 0.9236691889021638, + "grad_norm": 0.3871942692114719, + "learning_rate": 4.726786747418715e-06, + "loss": 0.6089, + "step": 3479 + }, + { + "epoch": 0.9239346873755476, + "grad_norm": 0.39136421575701336, + "learning_rate": 4.726628025146974e-06, + "loss": 0.6353, + "step": 3480 + }, + { + "epoch": 0.9242001858489314, + "grad_norm": 0.3840020408134472, + "learning_rate": 4.726469259450534e-06, + "loss": 0.5982, + "step": 3481 + }, + { + "epoch": 0.9244656843223151, + "grad_norm": 0.40077545721314484, + "learning_rate": 4.726310450332493e-06, + "loss": 0.6326, + "step": 3482 + }, + { + "epoch": 0.9247311827956989, + "grad_norm": 0.39902835161834627, + "learning_rate": 4.726151597795947e-06, + "loss": 0.6385, + "step": 3483 + }, + { + "epoch": 0.9249966812690827, + "grad_norm": 0.3879133606947771, + "learning_rate": 4.725992701843994e-06, + "loss": 0.606, + "step": 3484 + }, + { + "epoch": 0.9252621797424665, + "grad_norm": 0.41207641225646685, + "learning_rate": 4.725833762479733e-06, + "loss": 0.6193, + "step": 3485 + }, + { + "epoch": 0.9255276782158502, + "grad_norm": 0.3849465427336246, + "learning_rate": 4.725674779706264e-06, + "loss": 0.6276, + "step": 3486 + }, + { + "epoch": 0.925793176689234, + "grad_norm": 0.40921801976671845, + "learning_rate": 4.725515753526688e-06, + "loss": 0.5777, + "step": 3487 + }, + { + "epoch": 0.9260586751626179, + "grad_norm": 0.3824138209234103, + "learning_rate": 4.725356683944106e-06, + "loss": 0.586, + "step": 3488 + }, + { + "epoch": 0.9263241736360016, + "grad_norm": 0.3880008217806217, + "learning_rate": 4.725197570961619e-06, + "loss": 0.5813, + "step": 3489 + }, + { + "epoch": 0.9265896721093854, + "grad_norm": 0.3988359982285484, + "learning_rate": 4.725038414582331e-06, + "loss": 0.5844, + "step": 3490 + }, + { + "epoch": 0.9268551705827691, + "grad_norm": 0.41519921823890016, + "learning_rate": 4.724879214809347e-06, + "loss": 0.6233, + "step": 3491 + }, + { + "epoch": 0.927120669056153, + "grad_norm": 0.3898478397602767, + "learning_rate": 4.724719971645771e-06, + "loss": 0.6502, + "step": 3492 + }, + { + "epoch": 0.9273861675295367, + "grad_norm": 0.40898787006693804, + "learning_rate": 4.724560685094708e-06, + "loss": 0.6206, + "step": 3493 + }, + { + "epoch": 0.9276516660029205, + "grad_norm": 0.38256678885372414, + "learning_rate": 4.724401355159265e-06, + "loss": 0.6306, + "step": 3494 + }, + { + "epoch": 0.9279171644763042, + "grad_norm": 0.39631642627366565, + "learning_rate": 4.724241981842549e-06, + "loss": 0.6371, + "step": 3495 + }, + { + "epoch": 0.928182662949688, + "grad_norm": 0.3744770038402985, + "learning_rate": 4.724082565147669e-06, + "loss": 0.5989, + "step": 3496 + }, + { + "epoch": 0.9284481614230718, + "grad_norm": 0.3959072486033822, + "learning_rate": 4.723923105077733e-06, + "loss": 0.6623, + "step": 3497 + }, + { + "epoch": 0.9287136598964556, + "grad_norm": 0.4235396707397403, + "learning_rate": 4.7237636016358515e-06, + "loss": 0.6276, + "step": 3498 + }, + { + "epoch": 0.9289791583698394, + "grad_norm": 0.40392340729904874, + "learning_rate": 4.723604054825135e-06, + "loss": 0.6115, + "step": 3499 + }, + { + "epoch": 0.9292446568432231, + "grad_norm": 0.38258362505809135, + "learning_rate": 4.723444464648696e-06, + "loss": 0.6022, + "step": 3500 + }, + { + "epoch": 0.929510155316607, + "grad_norm": 0.3790529906787704, + "learning_rate": 4.723284831109645e-06, + "loss": 0.5948, + "step": 3501 + }, + { + "epoch": 0.9297756537899907, + "grad_norm": 0.3840540623424871, + "learning_rate": 4.723125154211097e-06, + "loss": 0.6179, + "step": 3502 + }, + { + "epoch": 0.9300411522633745, + "grad_norm": 0.408530282230814, + "learning_rate": 4.7229654339561645e-06, + "loss": 0.5846, + "step": 3503 + }, + { + "epoch": 0.9303066507367582, + "grad_norm": 0.3960898881412602, + "learning_rate": 4.722805670347963e-06, + "loss": 0.5946, + "step": 3504 + }, + { + "epoch": 0.9305721492101421, + "grad_norm": 0.38204270054388545, + "learning_rate": 4.72264586338961e-06, + "loss": 0.6027, + "step": 3505 + }, + { + "epoch": 0.9308376476835258, + "grad_norm": 0.38131700057876367, + "learning_rate": 4.72248601308422e-06, + "loss": 0.593, + "step": 3506 + }, + { + "epoch": 0.9311031461569096, + "grad_norm": 0.3896561283967658, + "learning_rate": 4.722326119434911e-06, + "loss": 0.6047, + "step": 3507 + }, + { + "epoch": 0.9313686446302933, + "grad_norm": 0.3915952652728005, + "learning_rate": 4.722166182444801e-06, + "loss": 0.6572, + "step": 3508 + }, + { + "epoch": 0.9316341431036772, + "grad_norm": 0.42333469278626573, + "learning_rate": 4.72200620211701e-06, + "loss": 0.6157, + "step": 3509 + }, + { + "epoch": 0.931899641577061, + "grad_norm": 0.39543587323385704, + "learning_rate": 4.721846178454658e-06, + "loss": 0.5714, + "step": 3510 + }, + { + "epoch": 0.9321651400504447, + "grad_norm": 0.3845789903687893, + "learning_rate": 4.721686111460866e-06, + "loss": 0.6068, + "step": 3511 + }, + { + "epoch": 0.9324306385238285, + "grad_norm": 0.41802367665992934, + "learning_rate": 4.721526001138754e-06, + "loss": 0.6061, + "step": 3512 + }, + { + "epoch": 0.9326961369972122, + "grad_norm": 0.40571467836760294, + "learning_rate": 4.7213658474914465e-06, + "loss": 0.6382, + "step": 3513 + }, + { + "epoch": 0.9329616354705961, + "grad_norm": 0.3928869829662483, + "learning_rate": 4.721205650522066e-06, + "loss": 0.6213, + "step": 3514 + }, + { + "epoch": 0.9332271339439798, + "grad_norm": 0.3919667381453046, + "learning_rate": 4.721045410233737e-06, + "loss": 0.6364, + "step": 3515 + }, + { + "epoch": 0.9334926324173636, + "grad_norm": 0.3972783164202163, + "learning_rate": 4.720885126629584e-06, + "loss": 0.6023, + "step": 3516 + }, + { + "epoch": 0.9337581308907473, + "grad_norm": 0.407462513465511, + "learning_rate": 4.720724799712734e-06, + "loss": 0.621, + "step": 3517 + }, + { + "epoch": 0.9340236293641312, + "grad_norm": 0.43232414172633815, + "learning_rate": 4.720564429486312e-06, + "loss": 0.5754, + "step": 3518 + }, + { + "epoch": 0.9342891278375149, + "grad_norm": 0.3905489312484764, + "learning_rate": 4.720404015953448e-06, + "loss": 0.6405, + "step": 3519 + }, + { + "epoch": 0.9345546263108987, + "grad_norm": 0.38075055483398174, + "learning_rate": 4.720243559117268e-06, + "loss": 0.6125, + "step": 3520 + }, + { + "epoch": 0.9348201247842824, + "grad_norm": 0.44666240024584875, + "learning_rate": 4.720083058980903e-06, + "loss": 0.5968, + "step": 3521 + }, + { + "epoch": 0.9350856232576663, + "grad_norm": 0.38000039164090604, + "learning_rate": 4.719922515547483e-06, + "loss": 0.6356, + "step": 3522 + }, + { + "epoch": 0.9353511217310501, + "grad_norm": 0.39400085528512446, + "learning_rate": 4.719761928820139e-06, + "loss": 0.6301, + "step": 3523 + }, + { + "epoch": 0.9356166202044338, + "grad_norm": 0.37905520336141546, + "learning_rate": 4.719601298802002e-06, + "loss": 0.5994, + "step": 3524 + }, + { + "epoch": 0.9358821186778176, + "grad_norm": 0.3853796510866678, + "learning_rate": 4.719440625496205e-06, + "loss": 0.6522, + "step": 3525 + }, + { + "epoch": 0.9361476171512014, + "grad_norm": 0.3854573658366218, + "learning_rate": 4.719279908905882e-06, + "loss": 0.6196, + "step": 3526 + }, + { + "epoch": 0.9364131156245852, + "grad_norm": 0.3783878744935739, + "learning_rate": 4.719119149034166e-06, + "loss": 0.6185, + "step": 3527 + }, + { + "epoch": 0.9366786140979689, + "grad_norm": 0.38443751951282223, + "learning_rate": 4.718958345884195e-06, + "loss": 0.6151, + "step": 3528 + }, + { + "epoch": 0.9369441125713527, + "grad_norm": 0.3899471549058231, + "learning_rate": 4.718797499459102e-06, + "loss": 0.6013, + "step": 3529 + }, + { + "epoch": 0.9372096110447365, + "grad_norm": 0.42267009026821517, + "learning_rate": 4.718636609762025e-06, + "loss": 0.646, + "step": 3530 + }, + { + "epoch": 0.9374751095181203, + "grad_norm": 0.39049097494025076, + "learning_rate": 4.718475676796103e-06, + "loss": 0.6459, + "step": 3531 + }, + { + "epoch": 0.937740607991504, + "grad_norm": 0.39948159005145745, + "learning_rate": 4.718314700564473e-06, + "loss": 0.626, + "step": 3532 + }, + { + "epoch": 0.9380061064648878, + "grad_norm": 0.38862287755708075, + "learning_rate": 4.718153681070276e-06, + "loss": 0.6236, + "step": 3533 + }, + { + "epoch": 0.9382716049382716, + "grad_norm": 0.3844487538483707, + "learning_rate": 4.717992618316649e-06, + "loss": 0.6045, + "step": 3534 + }, + { + "epoch": 0.9385371034116554, + "grad_norm": 0.3840615286530079, + "learning_rate": 4.717831512306737e-06, + "loss": 0.5714, + "step": 3535 + }, + { + "epoch": 0.9388026018850392, + "grad_norm": 0.38626787538083596, + "learning_rate": 4.71767036304368e-06, + "loss": 0.6267, + "step": 3536 + }, + { + "epoch": 0.9390681003584229, + "grad_norm": 0.3831602323571539, + "learning_rate": 4.7175091705306206e-06, + "loss": 0.6181, + "step": 3537 + }, + { + "epoch": 0.9393335988318067, + "grad_norm": 0.381564441987076, + "learning_rate": 4.7173479347707044e-06, + "loss": 0.6013, + "step": 3538 + }, + { + "epoch": 0.9395990973051905, + "grad_norm": 0.39919109369221556, + "learning_rate": 4.717186655767073e-06, + "loss": 0.6025, + "step": 3539 + }, + { + "epoch": 0.9398645957785743, + "grad_norm": 0.3831387321488225, + "learning_rate": 4.717025333522874e-06, + "loss": 0.6001, + "step": 3540 + }, + { + "epoch": 0.940130094251958, + "grad_norm": 0.3785704153450314, + "learning_rate": 4.716863968041252e-06, + "loss": 0.6078, + "step": 3541 + }, + { + "epoch": 0.9403955927253418, + "grad_norm": 0.38582635301175494, + "learning_rate": 4.716702559325356e-06, + "loss": 0.5849, + "step": 3542 + }, + { + "epoch": 0.9406610911987257, + "grad_norm": 0.39162354418307566, + "learning_rate": 4.716541107378332e-06, + "loss": 0.6242, + "step": 3543 + }, + { + "epoch": 0.9409265896721094, + "grad_norm": 0.3846172482049233, + "learning_rate": 4.716379612203329e-06, + "loss": 0.6014, + "step": 3544 + }, + { + "epoch": 0.9411920881454932, + "grad_norm": 0.38280507041972966, + "learning_rate": 4.716218073803498e-06, + "loss": 0.602, + "step": 3545 + }, + { + "epoch": 0.9414575866188769, + "grad_norm": 0.3747513761882635, + "learning_rate": 4.716056492181987e-06, + "loss": 0.6126, + "step": 3546 + }, + { + "epoch": 0.9417230850922608, + "grad_norm": 0.42223261439510645, + "learning_rate": 4.715894867341949e-06, + "loss": 0.6025, + "step": 3547 + }, + { + "epoch": 0.9419885835656445, + "grad_norm": 0.38803614841312795, + "learning_rate": 4.715733199286536e-06, + "loss": 0.6299, + "step": 3548 + }, + { + "epoch": 0.9422540820390283, + "grad_norm": 0.39536417979035315, + "learning_rate": 4.715571488018901e-06, + "loss": 0.6409, + "step": 3549 + }, + { + "epoch": 0.942519580512412, + "grad_norm": 0.38119715322976605, + "learning_rate": 4.715409733542196e-06, + "loss": 0.6297, + "step": 3550 + }, + { + "epoch": 0.9427850789857958, + "grad_norm": 0.3912170819860612, + "learning_rate": 4.7152479358595785e-06, + "loss": 0.6347, + "step": 3551 + }, + { + "epoch": 0.9430505774591796, + "grad_norm": 0.4024684886020273, + "learning_rate": 4.7150860949742e-06, + "loss": 0.6081, + "step": 3552 + }, + { + "epoch": 0.9433160759325634, + "grad_norm": 0.4092764083558984, + "learning_rate": 4.714924210889221e-06, + "loss": 0.5879, + "step": 3553 + }, + { + "epoch": 0.9435815744059471, + "grad_norm": 0.37695650207026915, + "learning_rate": 4.714762283607797e-06, + "loss": 0.6049, + "step": 3554 + }, + { + "epoch": 0.9438470728793309, + "grad_norm": 0.4243002297754266, + "learning_rate": 4.714600313133085e-06, + "loss": 0.5838, + "step": 3555 + }, + { + "epoch": 0.9441125713527148, + "grad_norm": 0.41261227594966987, + "learning_rate": 4.714438299468245e-06, + "loss": 0.6082, + "step": 3556 + }, + { + "epoch": 0.9443780698260985, + "grad_norm": 0.38174420139911713, + "learning_rate": 4.714276242616437e-06, + "loss": 0.6132, + "step": 3557 + }, + { + "epoch": 0.9446435682994823, + "grad_norm": 0.3871002909739619, + "learning_rate": 4.71411414258082e-06, + "loss": 0.6121, + "step": 3558 + }, + { + "epoch": 0.944909066772866, + "grad_norm": 0.3986292354860389, + "learning_rate": 4.7139519993645555e-06, + "loss": 0.6046, + "step": 3559 + }, + { + "epoch": 0.9451745652462499, + "grad_norm": 0.3881081172474212, + "learning_rate": 4.713789812970807e-06, + "loss": 0.6115, + "step": 3560 + }, + { + "epoch": 0.9454400637196336, + "grad_norm": 0.3868471666006034, + "learning_rate": 4.713627583402737e-06, + "loss": 0.5882, + "step": 3561 + }, + { + "epoch": 0.9457055621930174, + "grad_norm": 0.3793587646580147, + "learning_rate": 4.713465310663509e-06, + "loss": 0.6272, + "step": 3562 + }, + { + "epoch": 0.9459710606664011, + "grad_norm": 0.3721766806443814, + "learning_rate": 4.713302994756288e-06, + "loss": 0.5809, + "step": 3563 + }, + { + "epoch": 0.946236559139785, + "grad_norm": 0.37253670227246405, + "learning_rate": 4.71314063568424e-06, + "loss": 0.6213, + "step": 3564 + }, + { + "epoch": 0.9465020576131687, + "grad_norm": 0.3856079119107004, + "learning_rate": 4.712978233450531e-06, + "loss": 0.6173, + "step": 3565 + }, + { + "epoch": 0.9467675560865525, + "grad_norm": 0.37916228674395314, + "learning_rate": 4.712815788058327e-06, + "loss": 0.5932, + "step": 3566 + }, + { + "epoch": 0.9470330545599362, + "grad_norm": 0.3683105933076519, + "learning_rate": 4.712653299510798e-06, + "loss": 0.5897, + "step": 3567 + }, + { + "epoch": 0.9472985530333201, + "grad_norm": 0.3812396209762039, + "learning_rate": 4.712490767811111e-06, + "loss": 0.6413, + "step": 3568 + }, + { + "epoch": 0.9475640515067039, + "grad_norm": 0.388790949165087, + "learning_rate": 4.712328192962438e-06, + "loss": 0.6533, + "step": 3569 + }, + { + "epoch": 0.9478295499800876, + "grad_norm": 0.3969476032613877, + "learning_rate": 4.712165574967949e-06, + "loss": 0.6078, + "step": 3570 + }, + { + "epoch": 0.9480950484534714, + "grad_norm": 0.3753287846734859, + "learning_rate": 4.712002913830815e-06, + "loss": 0.5838, + "step": 3571 + }, + { + "epoch": 0.9483605469268551, + "grad_norm": 0.396556859423283, + "learning_rate": 4.711840209554207e-06, + "loss": 0.6016, + "step": 3572 + }, + { + "epoch": 0.948626045400239, + "grad_norm": 0.4065613362655618, + "learning_rate": 4.711677462141299e-06, + "loss": 0.5923, + "step": 3573 + }, + { + "epoch": 0.9488915438736227, + "grad_norm": 0.3901875041360443, + "learning_rate": 4.711514671595267e-06, + "loss": 0.647, + "step": 3574 + }, + { + "epoch": 0.9491570423470065, + "grad_norm": 0.3806874918242015, + "learning_rate": 4.711351837919283e-06, + "loss": 0.606, + "step": 3575 + }, + { + "epoch": 0.9494225408203902, + "grad_norm": 0.3737996868771382, + "learning_rate": 4.7111889611165254e-06, + "loss": 0.5689, + "step": 3576 + }, + { + "epoch": 0.9496880392937741, + "grad_norm": 0.3845205504765776, + "learning_rate": 4.711026041190168e-06, + "loss": 0.6045, + "step": 3577 + }, + { + "epoch": 0.9499535377671579, + "grad_norm": 0.3820802234953147, + "learning_rate": 4.710863078143389e-06, + "loss": 0.6104, + "step": 3578 + }, + { + "epoch": 0.9502190362405416, + "grad_norm": 0.3823997520337332, + "learning_rate": 4.710700071979367e-06, + "loss": 0.6073, + "step": 3579 + }, + { + "epoch": 0.9504845347139254, + "grad_norm": 0.39558462192664146, + "learning_rate": 4.710537022701281e-06, + "loss": 0.5872, + "step": 3580 + }, + { + "epoch": 0.9507500331873092, + "grad_norm": 0.383874294746993, + "learning_rate": 4.710373930312311e-06, + "loss": 0.6032, + "step": 3581 + }, + { + "epoch": 0.951015531660693, + "grad_norm": 0.39124421725543956, + "learning_rate": 4.710210794815637e-06, + "loss": 0.6331, + "step": 3582 + }, + { + "epoch": 0.9512810301340767, + "grad_norm": 0.39622892804406773, + "learning_rate": 4.710047616214441e-06, + "loss": 0.6118, + "step": 3583 + }, + { + "epoch": 0.9515465286074605, + "grad_norm": 0.37741913373008423, + "learning_rate": 4.709884394511906e-06, + "loss": 0.622, + "step": 3584 + }, + { + "epoch": 0.9518120270808443, + "grad_norm": 0.38102130190077316, + "learning_rate": 4.7097211297112145e-06, + "loss": 0.6144, + "step": 3585 + }, + { + "epoch": 0.9520775255542281, + "grad_norm": 0.38955337923468664, + "learning_rate": 4.70955782181555e-06, + "loss": 0.6316, + "step": 3586 + }, + { + "epoch": 0.9523430240276118, + "grad_norm": 0.396638900754623, + "learning_rate": 4.709394470828099e-06, + "loss": 0.6534, + "step": 3587 + }, + { + "epoch": 0.9526085225009956, + "grad_norm": 0.391803800569157, + "learning_rate": 4.709231076752045e-06, + "loss": 0.613, + "step": 3588 + }, + { + "epoch": 0.9528740209743793, + "grad_norm": 0.3984269122357259, + "learning_rate": 4.709067639590576e-06, + "loss": 0.6321, + "step": 3589 + }, + { + "epoch": 0.9531395194477632, + "grad_norm": 0.395700076282968, + "learning_rate": 4.708904159346881e-06, + "loss": 0.6013, + "step": 3590 + }, + { + "epoch": 0.953405017921147, + "grad_norm": 0.38068629903417356, + "learning_rate": 4.708740636024145e-06, + "loss": 0.6488, + "step": 3591 + }, + { + "epoch": 0.9536705163945307, + "grad_norm": 0.39973489616064506, + "learning_rate": 4.708577069625559e-06, + "loss": 0.5966, + "step": 3592 + }, + { + "epoch": 0.9539360148679145, + "grad_norm": 0.3909092586396629, + "learning_rate": 4.708413460154313e-06, + "loss": 0.6394, + "step": 3593 + }, + { + "epoch": 0.9542015133412983, + "grad_norm": 0.40284589959583805, + "learning_rate": 4.708249807613597e-06, + "loss": 0.6098, + "step": 3594 + }, + { + "epoch": 0.9544670118146821, + "grad_norm": 0.4022975164531979, + "learning_rate": 4.708086112006604e-06, + "loss": 0.5786, + "step": 3595 + }, + { + "epoch": 0.9547325102880658, + "grad_norm": 0.37800906327107797, + "learning_rate": 4.707922373336524e-06, + "loss": 0.6603, + "step": 3596 + }, + { + "epoch": 0.9549980087614496, + "grad_norm": 0.3904826018311421, + "learning_rate": 4.707758591606552e-06, + "loss": 0.6495, + "step": 3597 + }, + { + "epoch": 0.9552635072348334, + "grad_norm": 0.38674877034434985, + "learning_rate": 4.7075947668198825e-06, + "loss": 0.593, + "step": 3598 + }, + { + "epoch": 0.9555290057082172, + "grad_norm": 0.3914397071460796, + "learning_rate": 4.70743089897971e-06, + "loss": 0.5818, + "step": 3599 + }, + { + "epoch": 0.955794504181601, + "grad_norm": 0.3933866827836264, + "learning_rate": 4.70726698808923e-06, + "loss": 0.5861, + "step": 3600 + }, + { + "epoch": 0.9560600026549847, + "grad_norm": 0.382664339751929, + "learning_rate": 4.707103034151639e-06, + "loss": 0.5691, + "step": 3601 + }, + { + "epoch": 0.9563255011283686, + "grad_norm": 0.38044517979027814, + "learning_rate": 4.706939037170135e-06, + "loss": 0.5911, + "step": 3602 + }, + { + "epoch": 0.9565909996017523, + "grad_norm": 0.39137606815892434, + "learning_rate": 4.706774997147917e-06, + "loss": 0.6073, + "step": 3603 + }, + { + "epoch": 0.9568564980751361, + "grad_norm": 0.39381494532615724, + "learning_rate": 4.7066109140881824e-06, + "loss": 0.5978, + "step": 3604 + }, + { + "epoch": 0.9571219965485198, + "grad_norm": 0.3825296235402476, + "learning_rate": 4.706446787994134e-06, + "loss": 0.6115, + "step": 3605 + }, + { + "epoch": 0.9573874950219037, + "grad_norm": 0.38969170898807953, + "learning_rate": 4.706282618868969e-06, + "loss": 0.6323, + "step": 3606 + }, + { + "epoch": 0.9576529934952874, + "grad_norm": 0.40403364898354477, + "learning_rate": 4.706118406715892e-06, + "loss": 0.6156, + "step": 3607 + }, + { + "epoch": 0.9579184919686712, + "grad_norm": 0.38786804309152906, + "learning_rate": 4.7059541515381045e-06, + "loss": 0.6305, + "step": 3608 + }, + { + "epoch": 0.9581839904420549, + "grad_norm": 0.38961955782479457, + "learning_rate": 4.70578985333881e-06, + "loss": 0.6283, + "step": 3609 + }, + { + "epoch": 0.9584494889154387, + "grad_norm": 0.38773616697041347, + "learning_rate": 4.705625512121213e-06, + "loss": 0.6125, + "step": 3610 + }, + { + "epoch": 0.9587149873888225, + "grad_norm": 0.3865579178052979, + "learning_rate": 4.705461127888517e-06, + "loss": 0.601, + "step": 3611 + }, + { + "epoch": 0.9589804858622063, + "grad_norm": 0.39613489912379074, + "learning_rate": 4.70529670064393e-06, + "loss": 0.6227, + "step": 3612 + }, + { + "epoch": 0.95924598433559, + "grad_norm": 0.39270094445322157, + "learning_rate": 4.705132230390657e-06, + "loss": 0.607, + "step": 3613 + }, + { + "epoch": 0.9595114828089738, + "grad_norm": 0.3919473897579516, + "learning_rate": 4.704967717131907e-06, + "loss": 0.6279, + "step": 3614 + }, + { + "epoch": 0.9597769812823577, + "grad_norm": 0.3802434003911743, + "learning_rate": 4.704803160870888e-06, + "loss": 0.6512, + "step": 3615 + }, + { + "epoch": 0.9600424797557414, + "grad_norm": 0.40046601659455683, + "learning_rate": 4.704638561610809e-06, + "loss": 0.5767, + "step": 3616 + }, + { + "epoch": 0.9603079782291252, + "grad_norm": 0.3880968097212626, + "learning_rate": 4.70447391935488e-06, + "loss": 0.6055, + "step": 3617 + }, + { + "epoch": 0.9605734767025089, + "grad_norm": 0.39250409904499617, + "learning_rate": 4.704309234106312e-06, + "loss": 0.6154, + "step": 3618 + }, + { + "epoch": 0.9608389751758928, + "grad_norm": 0.38450001318202004, + "learning_rate": 4.704144505868318e-06, + "loss": 0.594, + "step": 3619 + }, + { + "epoch": 0.9611044736492765, + "grad_norm": 0.40137595561487843, + "learning_rate": 4.703979734644107e-06, + "loss": 0.5621, + "step": 3620 + }, + { + "epoch": 0.9613699721226603, + "grad_norm": 0.38427758191234046, + "learning_rate": 4.703814920436897e-06, + "loss": 0.5736, + "step": 3621 + }, + { + "epoch": 0.961635470596044, + "grad_norm": 0.38947722918312283, + "learning_rate": 4.7036500632498995e-06, + "loss": 0.6168, + "step": 3622 + }, + { + "epoch": 0.9619009690694279, + "grad_norm": 0.39007761189984497, + "learning_rate": 4.703485163086331e-06, + "loss": 0.5841, + "step": 3623 + }, + { + "epoch": 0.9621664675428117, + "grad_norm": 0.3985843518177514, + "learning_rate": 4.7033202199494055e-06, + "loss": 0.6365, + "step": 3624 + }, + { + "epoch": 0.9624319660161954, + "grad_norm": 0.3891574803335972, + "learning_rate": 4.703155233842342e-06, + "loss": 0.6074, + "step": 3625 + }, + { + "epoch": 0.9626974644895792, + "grad_norm": 0.3961272823075828, + "learning_rate": 4.7029902047683565e-06, + "loss": 0.6323, + "step": 3626 + }, + { + "epoch": 0.9629629629629629, + "grad_norm": 0.39100088914724596, + "learning_rate": 4.702825132730669e-06, + "loss": 0.5924, + "step": 3627 + }, + { + "epoch": 0.9632284614363468, + "grad_norm": 0.3890924820441184, + "learning_rate": 4.702660017732497e-06, + "loss": 0.6515, + "step": 3628 + }, + { + "epoch": 0.9634939599097305, + "grad_norm": 0.38355615738109367, + "learning_rate": 4.7024948597770625e-06, + "loss": 0.62, + "step": 3629 + }, + { + "epoch": 0.9637594583831143, + "grad_norm": 0.3954073823720711, + "learning_rate": 4.702329658867585e-06, + "loss": 0.6397, + "step": 3630 + }, + { + "epoch": 0.964024956856498, + "grad_norm": 0.37543761854241386, + "learning_rate": 4.702164415007288e-06, + "loss": 0.5817, + "step": 3631 + }, + { + "epoch": 0.9642904553298819, + "grad_norm": 0.3854728665577839, + "learning_rate": 4.701999128199392e-06, + "loss": 0.6234, + "step": 3632 + }, + { + "epoch": 0.9645559538032656, + "grad_norm": 0.40362742567571847, + "learning_rate": 4.701833798447121e-06, + "loss": 0.6091, + "step": 3633 + }, + { + "epoch": 0.9648214522766494, + "grad_norm": 0.3933445105415557, + "learning_rate": 4.701668425753701e-06, + "loss": 0.6076, + "step": 3634 + }, + { + "epoch": 0.9650869507500331, + "grad_norm": 0.380807769650326, + "learning_rate": 4.701503010122356e-06, + "loss": 0.5941, + "step": 3635 + }, + { + "epoch": 0.965352449223417, + "grad_norm": 0.39856075635397675, + "learning_rate": 4.701337551556312e-06, + "loss": 0.5767, + "step": 3636 + }, + { + "epoch": 0.9656179476968008, + "grad_norm": 0.37781771022776084, + "learning_rate": 4.701172050058795e-06, + "loss": 0.5825, + "step": 3637 + }, + { + "epoch": 0.9658834461701845, + "grad_norm": 0.3889588409640584, + "learning_rate": 4.701006505633035e-06, + "loss": 0.642, + "step": 3638 + }, + { + "epoch": 0.9661489446435683, + "grad_norm": 0.3896655514807649, + "learning_rate": 4.700840918282258e-06, + "loss": 0.5552, + "step": 3639 + }, + { + "epoch": 0.9664144431169521, + "grad_norm": 0.391716893041241, + "learning_rate": 4.700675288009695e-06, + "loss": 0.6201, + "step": 3640 + }, + { + "epoch": 0.9666799415903359, + "grad_norm": 0.3936588621740142, + "learning_rate": 4.7005096148185765e-06, + "loss": 0.6386, + "step": 3641 + }, + { + "epoch": 0.9669454400637196, + "grad_norm": 0.37889689828095957, + "learning_rate": 4.700343898712132e-06, + "loss": 0.6298, + "step": 3642 + }, + { + "epoch": 0.9672109385371034, + "grad_norm": 0.4091429640071916, + "learning_rate": 4.7001781396935945e-06, + "loss": 0.6398, + "step": 3643 + }, + { + "epoch": 0.9674764370104872, + "grad_norm": 0.3883545639185824, + "learning_rate": 4.700012337766196e-06, + "loss": 0.6239, + "step": 3644 + }, + { + "epoch": 0.967741935483871, + "grad_norm": 0.38225271118778803, + "learning_rate": 4.69984649293317e-06, + "loss": 0.5958, + "step": 3645 + }, + { + "epoch": 0.9680074339572547, + "grad_norm": 0.39137614154896344, + "learning_rate": 4.699680605197751e-06, + "loss": 0.6224, + "step": 3646 + }, + { + "epoch": 0.9682729324306385, + "grad_norm": 0.3846780556433604, + "learning_rate": 4.6995146745631746e-06, + "loss": 0.6126, + "step": 3647 + }, + { + "epoch": 0.9685384309040223, + "grad_norm": 0.3912733934965504, + "learning_rate": 4.699348701032678e-06, + "loss": 0.634, + "step": 3648 + }, + { + "epoch": 0.9688039293774061, + "grad_norm": 0.40683308534234996, + "learning_rate": 4.699182684609495e-06, + "loss": 0.6115, + "step": 3649 + }, + { + "epoch": 0.9690694278507899, + "grad_norm": 0.3889780902116469, + "learning_rate": 4.699016625296866e-06, + "loss": 0.5928, + "step": 3650 + }, + { + "epoch": 0.9693349263241736, + "grad_norm": 0.3866846816002472, + "learning_rate": 4.69885052309803e-06, + "loss": 0.6016, + "step": 3651 + }, + { + "epoch": 0.9696004247975574, + "grad_norm": 0.3870270892617259, + "learning_rate": 4.698684378016223e-06, + "loss": 0.6454, + "step": 3652 + }, + { + "epoch": 0.9698659232709412, + "grad_norm": 0.3793337172442884, + "learning_rate": 4.698518190054688e-06, + "loss": 0.5917, + "step": 3653 + }, + { + "epoch": 0.970131421744325, + "grad_norm": 0.3943488726168635, + "learning_rate": 4.698351959216666e-06, + "loss": 0.6099, + "step": 3654 + }, + { + "epoch": 0.9703969202177087, + "grad_norm": 0.3949298885078147, + "learning_rate": 4.698185685505397e-06, + "loss": 0.6284, + "step": 3655 + }, + { + "epoch": 0.9706624186910925, + "grad_norm": 0.40927389400452, + "learning_rate": 4.698019368924126e-06, + "loss": 0.634, + "step": 3656 + }, + { + "epoch": 0.9709279171644764, + "grad_norm": 0.3960272796831328, + "learning_rate": 4.697853009476096e-06, + "loss": 0.5853, + "step": 3657 + }, + { + "epoch": 0.9711934156378601, + "grad_norm": 0.3836826859623769, + "learning_rate": 4.69768660716455e-06, + "loss": 0.5875, + "step": 3658 + }, + { + "epoch": 0.9714589141112439, + "grad_norm": 0.3849274510743423, + "learning_rate": 4.697520161992735e-06, + "loss": 0.6226, + "step": 3659 + }, + { + "epoch": 0.9717244125846276, + "grad_norm": 0.39910614750810963, + "learning_rate": 4.6973536739638965e-06, + "loss": 0.6174, + "step": 3660 + }, + { + "epoch": 0.9719899110580115, + "grad_norm": 0.4185146628531675, + "learning_rate": 4.697187143081281e-06, + "loss": 0.6227, + "step": 3661 + }, + { + "epoch": 0.9722554095313952, + "grad_norm": 0.3910465796237792, + "learning_rate": 4.697020569348136e-06, + "loss": 0.6226, + "step": 3662 + }, + { + "epoch": 0.972520908004779, + "grad_norm": 0.3917665963744819, + "learning_rate": 4.696853952767711e-06, + "loss": 0.6157, + "step": 3663 + }, + { + "epoch": 0.9727864064781627, + "grad_norm": 0.3969503845979608, + "learning_rate": 4.6966872933432565e-06, + "loss": 0.6252, + "step": 3664 + }, + { + "epoch": 0.9730519049515465, + "grad_norm": 0.4105211003899395, + "learning_rate": 4.69652059107802e-06, + "loss": 0.5931, + "step": 3665 + }, + { + "epoch": 0.9733174034249303, + "grad_norm": 0.3944654723818686, + "learning_rate": 4.696353845975253e-06, + "loss": 0.6015, + "step": 3666 + }, + { + "epoch": 0.9735829018983141, + "grad_norm": 0.38634470571483776, + "learning_rate": 4.69618705803821e-06, + "loss": 0.5941, + "step": 3667 + }, + { + "epoch": 0.9738484003716978, + "grad_norm": 0.3918507831216075, + "learning_rate": 4.696020227270142e-06, + "loss": 0.6074, + "step": 3668 + }, + { + "epoch": 0.9741138988450816, + "grad_norm": 0.4524254406535516, + "learning_rate": 4.6958533536743025e-06, + "loss": 0.5836, + "step": 3669 + }, + { + "epoch": 0.9743793973184655, + "grad_norm": 0.39216272913247, + "learning_rate": 4.695686437253946e-06, + "loss": 0.5988, + "step": 3670 + }, + { + "epoch": 0.9746448957918492, + "grad_norm": 0.39774952287863624, + "learning_rate": 4.6955194780123285e-06, + "loss": 0.6198, + "step": 3671 + }, + { + "epoch": 0.974910394265233, + "grad_norm": 0.5575594701577312, + "learning_rate": 4.695352475952706e-06, + "loss": 0.5818, + "step": 3672 + }, + { + "epoch": 0.9751758927386167, + "grad_norm": 0.44469383728107376, + "learning_rate": 4.695185431078334e-06, + "loss": 0.6164, + "step": 3673 + }, + { + "epoch": 0.9754413912120006, + "grad_norm": 0.3867963703664239, + "learning_rate": 4.695018343392472e-06, + "loss": 0.5924, + "step": 3674 + }, + { + "epoch": 0.9757068896853843, + "grad_norm": 0.43739384560926065, + "learning_rate": 4.694851212898378e-06, + "loss": 0.6295, + "step": 3675 + }, + { + "epoch": 0.9759723881587681, + "grad_norm": 0.5094909347727954, + "learning_rate": 4.694684039599311e-06, + "loss": 0.6198, + "step": 3676 + }, + { + "epoch": 0.9762378866321518, + "grad_norm": 0.5472220226418029, + "learning_rate": 4.694516823498532e-06, + "loss": 0.6027, + "step": 3677 + }, + { + "epoch": 0.9765033851055357, + "grad_norm": 0.40021750006934775, + "learning_rate": 4.694349564599301e-06, + "loss": 0.6225, + "step": 3678 + }, + { + "epoch": 0.9767688835789194, + "grad_norm": 0.5479388666932355, + "learning_rate": 4.694182262904882e-06, + "loss": 0.5762, + "step": 3679 + }, + { + "epoch": 0.9770343820523032, + "grad_norm": 0.5719635976414318, + "learning_rate": 4.6940149184185355e-06, + "loss": 0.5885, + "step": 3680 + }, + { + "epoch": 0.977299880525687, + "grad_norm": 0.4073858968239127, + "learning_rate": 4.693847531143528e-06, + "loss": 0.6071, + "step": 3681 + }, + { + "epoch": 0.9775653789990707, + "grad_norm": 0.3951570872686861, + "learning_rate": 4.69368010108312e-06, + "loss": 0.6493, + "step": 3682 + }, + { + "epoch": 0.9778308774724546, + "grad_norm": 0.4835789842734578, + "learning_rate": 4.69351262824058e-06, + "loss": 0.6511, + "step": 3683 + }, + { + "epoch": 0.9780963759458383, + "grad_norm": 0.3988660593653177, + "learning_rate": 4.693345112619173e-06, + "loss": 0.6318, + "step": 3684 + }, + { + "epoch": 0.9783618744192221, + "grad_norm": 0.43644186892211223, + "learning_rate": 4.693177554222167e-06, + "loss": 0.5959, + "step": 3685 + }, + { + "epoch": 0.9786273728926058, + "grad_norm": 0.3849105238605806, + "learning_rate": 4.6930099530528275e-06, + "loss": 0.5945, + "step": 3686 + }, + { + "epoch": 0.9788928713659897, + "grad_norm": 0.39202125675770977, + "learning_rate": 4.692842309114425e-06, + "loss": 0.6155, + "step": 3687 + }, + { + "epoch": 0.9791583698393734, + "grad_norm": 0.3937344635542589, + "learning_rate": 4.692674622410229e-06, + "loss": 0.629, + "step": 3688 + }, + { + "epoch": 0.9794238683127572, + "grad_norm": 0.43389821032805437, + "learning_rate": 4.692506892943508e-06, + "loss": 0.5864, + "step": 3689 + }, + { + "epoch": 0.9796893667861409, + "grad_norm": 0.4155245302247665, + "learning_rate": 4.6923391207175356e-06, + "loss": 0.629, + "step": 3690 + }, + { + "epoch": 0.9799548652595248, + "grad_norm": 0.39114439522055067, + "learning_rate": 4.692171305735582e-06, + "loss": 0.6099, + "step": 3691 + }, + { + "epoch": 0.9802203637329086, + "grad_norm": 0.3828856583766105, + "learning_rate": 4.692003448000921e-06, + "loss": 0.5884, + "step": 3692 + }, + { + "epoch": 0.9804858622062923, + "grad_norm": 0.3993874435271273, + "learning_rate": 4.691835547516826e-06, + "loss": 0.6002, + "step": 3693 + }, + { + "epoch": 0.980751360679676, + "grad_norm": 0.43845103690590476, + "learning_rate": 4.691667604286571e-06, + "loss": 0.5995, + "step": 3694 + }, + { + "epoch": 0.9810168591530599, + "grad_norm": 0.39052978718718556, + "learning_rate": 4.691499618313431e-06, + "loss": 0.6299, + "step": 3695 + }, + { + "epoch": 0.9812823576264437, + "grad_norm": 0.37746046189224663, + "learning_rate": 4.691331589600683e-06, + "loss": 0.6394, + "step": 3696 + }, + { + "epoch": 0.9815478560998274, + "grad_norm": 0.40066545179781904, + "learning_rate": 4.691163518151604e-06, + "loss": 0.6313, + "step": 3697 + }, + { + "epoch": 0.9818133545732112, + "grad_norm": 0.3766838231710626, + "learning_rate": 4.690995403969471e-06, + "loss": 0.6082, + "step": 3698 + }, + { + "epoch": 0.982078853046595, + "grad_norm": 0.39961014282509283, + "learning_rate": 4.690827247057563e-06, + "loss": 0.5669, + "step": 3699 + }, + { + "epoch": 0.9823443515199788, + "grad_norm": 0.40501165983081905, + "learning_rate": 4.69065904741916e-06, + "loss": 0.6391, + "step": 3700 + }, + { + "epoch": 0.9826098499933625, + "grad_norm": 0.3857307020929534, + "learning_rate": 4.690490805057543e-06, + "loss": 0.6152, + "step": 3701 + }, + { + "epoch": 0.9828753484667463, + "grad_norm": 0.4099902283354911, + "learning_rate": 4.69032251997599e-06, + "loss": 0.6263, + "step": 3702 + }, + { + "epoch": 0.98314084694013, + "grad_norm": 0.3856394137656872, + "learning_rate": 4.690154192177787e-06, + "loss": 0.5846, + "step": 3703 + }, + { + "epoch": 0.9834063454135139, + "grad_norm": 0.3989168924172415, + "learning_rate": 4.6899858216662136e-06, + "loss": 0.6062, + "step": 3704 + }, + { + "epoch": 0.9836718438868977, + "grad_norm": 0.39889635541321233, + "learning_rate": 4.689817408444555e-06, + "loss": 0.6378, + "step": 3705 + }, + { + "epoch": 0.9839373423602814, + "grad_norm": 0.39406652466850894, + "learning_rate": 4.689648952516095e-06, + "loss": 0.6028, + "step": 3706 + }, + { + "epoch": 0.9842028408336652, + "grad_norm": 0.4033355488192236, + "learning_rate": 4.68948045388412e-06, + "loss": 0.6072, + "step": 3707 + }, + { + "epoch": 0.984468339307049, + "grad_norm": 0.37862305697802934, + "learning_rate": 4.689311912551915e-06, + "loss": 0.606, + "step": 3708 + }, + { + "epoch": 0.9847338377804328, + "grad_norm": 0.3878418648235052, + "learning_rate": 4.689143328522767e-06, + "loss": 0.5976, + "step": 3709 + }, + { + "epoch": 0.9849993362538165, + "grad_norm": 0.3978761070416158, + "learning_rate": 4.688974701799964e-06, + "loss": 0.5996, + "step": 3710 + }, + { + "epoch": 0.9852648347272003, + "grad_norm": 0.3820619528836613, + "learning_rate": 4.688806032386796e-06, + "loss": 0.6028, + "step": 3711 + }, + { + "epoch": 0.9855303332005841, + "grad_norm": 0.3796453250847225, + "learning_rate": 4.6886373202865506e-06, + "loss": 0.6338, + "step": 3712 + }, + { + "epoch": 0.9857958316739679, + "grad_norm": 0.3968556621124762, + "learning_rate": 4.688468565502519e-06, + "loss": 0.6341, + "step": 3713 + }, + { + "epoch": 0.9860613301473516, + "grad_norm": 0.404800194864264, + "learning_rate": 4.688299768037991e-06, + "loss": 0.6604, + "step": 3714 + }, + { + "epoch": 0.9863268286207354, + "grad_norm": 0.36930134446805696, + "learning_rate": 4.688130927896261e-06, + "loss": 0.5828, + "step": 3715 + }, + { + "epoch": 0.9865923270941193, + "grad_norm": 0.3922966539266864, + "learning_rate": 4.687962045080619e-06, + "loss": 0.5727, + "step": 3716 + }, + { + "epoch": 0.986857825567503, + "grad_norm": 0.3955857287808288, + "learning_rate": 4.687793119594361e-06, + "loss": 0.6415, + "step": 3717 + }, + { + "epoch": 0.9871233240408868, + "grad_norm": 0.3949074082518782, + "learning_rate": 4.6876241514407806e-06, + "loss": 0.6329, + "step": 3718 + }, + { + "epoch": 0.9873888225142705, + "grad_norm": 0.39585244516211565, + "learning_rate": 4.687455140623173e-06, + "loss": 0.5627, + "step": 3719 + }, + { + "epoch": 0.9876543209876543, + "grad_norm": 0.416463933185347, + "learning_rate": 4.687286087144833e-06, + "loss": 0.6316, + "step": 3720 + }, + { + "epoch": 0.9879198194610381, + "grad_norm": 0.3867486743578083, + "learning_rate": 4.687116991009061e-06, + "loss": 0.6101, + "step": 3721 + }, + { + "epoch": 0.9881853179344219, + "grad_norm": 0.4012355026380244, + "learning_rate": 4.6869478522191515e-06, + "loss": 0.6185, + "step": 3722 + }, + { + "epoch": 0.9884508164078056, + "grad_norm": 0.4085551394800617, + "learning_rate": 4.686778670778404e-06, + "loss": 0.6213, + "step": 3723 + }, + { + "epoch": 0.9887163148811894, + "grad_norm": 0.39663432013046435, + "learning_rate": 4.686609446690119e-06, + "loss": 0.6044, + "step": 3724 + }, + { + "epoch": 0.9889818133545732, + "grad_norm": 0.4018089291913098, + "learning_rate": 4.686440179957597e-06, + "loss": 0.6178, + "step": 3725 + }, + { + "epoch": 0.989247311827957, + "grad_norm": 0.39496287783320855, + "learning_rate": 4.686270870584137e-06, + "loss": 0.607, + "step": 3726 + }, + { + "epoch": 0.9895128103013408, + "grad_norm": 0.3938774541356936, + "learning_rate": 4.686101518573041e-06, + "loss": 0.5869, + "step": 3727 + }, + { + "epoch": 0.9897783087747245, + "grad_norm": 0.3918321207536686, + "learning_rate": 4.685932123927615e-06, + "loss": 0.6078, + "step": 3728 + }, + { + "epoch": 0.9900438072481084, + "grad_norm": 0.39201053561114496, + "learning_rate": 4.685762686651159e-06, + "loss": 0.6552, + "step": 3729 + }, + { + "epoch": 0.9903093057214921, + "grad_norm": 0.3766261080464049, + "learning_rate": 4.685593206746979e-06, + "loss": 0.5849, + "step": 3730 + }, + { + "epoch": 0.9905748041948759, + "grad_norm": 0.3826967061947365, + "learning_rate": 4.685423684218381e-06, + "loss": 0.5673, + "step": 3731 + }, + { + "epoch": 0.9908403026682596, + "grad_norm": 0.37498966330526406, + "learning_rate": 4.68525411906867e-06, + "loss": 0.6033, + "step": 3732 + }, + { + "epoch": 0.9911058011416435, + "grad_norm": 0.3780543886684855, + "learning_rate": 4.685084511301154e-06, + "loss": 0.6057, + "step": 3733 + }, + { + "epoch": 0.9913712996150272, + "grad_norm": 0.3892072551607135, + "learning_rate": 4.684914860919138e-06, + "loss": 0.6136, + "step": 3734 + }, + { + "epoch": 0.991636798088411, + "grad_norm": 0.37365922048188355, + "learning_rate": 4.684745167925934e-06, + "loss": 0.5994, + "step": 3735 + }, + { + "epoch": 0.9919022965617947, + "grad_norm": 0.39626044475214023, + "learning_rate": 4.6845754323248505e-06, + "loss": 0.6305, + "step": 3736 + }, + { + "epoch": 0.9921677950351786, + "grad_norm": 0.3937902793503907, + "learning_rate": 4.684405654119196e-06, + "loss": 0.6043, + "step": 3737 + }, + { + "epoch": 0.9924332935085624, + "grad_norm": 0.39432004616059513, + "learning_rate": 4.684235833312284e-06, + "loss": 0.5921, + "step": 3738 + }, + { + "epoch": 0.9926987919819461, + "grad_norm": 0.3819503364307295, + "learning_rate": 4.684065969907425e-06, + "loss": 0.6173, + "step": 3739 + }, + { + "epoch": 0.9929642904553299, + "grad_norm": 0.3754592203801274, + "learning_rate": 4.683896063907932e-06, + "loss": 0.6069, + "step": 3740 + }, + { + "epoch": 0.9932297889287136, + "grad_norm": 0.38545667597499994, + "learning_rate": 4.683726115317119e-06, + "loss": 0.613, + "step": 3741 + }, + { + "epoch": 0.9934952874020975, + "grad_norm": 0.39499805200038546, + "learning_rate": 4.6835561241382985e-06, + "loss": 0.6031, + "step": 3742 + }, + { + "epoch": 0.9937607858754812, + "grad_norm": 0.3794613043776615, + "learning_rate": 4.683386090374789e-06, + "loss": 0.6436, + "step": 3743 + }, + { + "epoch": 0.994026284348865, + "grad_norm": 0.3845896428935738, + "learning_rate": 4.683216014029905e-06, + "loss": 0.604, + "step": 3744 + }, + { + "epoch": 0.9942917828222487, + "grad_norm": 0.3886443680703139, + "learning_rate": 4.6830458951069625e-06, + "loss": 0.6094, + "step": 3745 + }, + { + "epoch": 0.9945572812956326, + "grad_norm": 0.3815840078596797, + "learning_rate": 4.68287573360928e-06, + "loss": 0.6372, + "step": 3746 + }, + { + "epoch": 0.9948227797690163, + "grad_norm": 0.3820926262645276, + "learning_rate": 4.682705529540177e-06, + "loss": 0.6035, + "step": 3747 + }, + { + "epoch": 0.9950882782424001, + "grad_norm": 0.3762459209247919, + "learning_rate": 4.6825352829029705e-06, + "loss": 0.6073, + "step": 3748 + }, + { + "epoch": 0.9953537767157838, + "grad_norm": 0.3843193911083839, + "learning_rate": 4.682364993700983e-06, + "loss": 0.5902, + "step": 3749 + }, + { + "epoch": 0.9956192751891677, + "grad_norm": 0.38812051434919137, + "learning_rate": 4.682194661937535e-06, + "loss": 0.6504, + "step": 3750 + }, + { + "epoch": 0.9958847736625515, + "grad_norm": 0.4053891827606106, + "learning_rate": 4.682024287615948e-06, + "loss": 0.6063, + "step": 3751 + }, + { + "epoch": 0.9961502721359352, + "grad_norm": 0.3757484200398291, + "learning_rate": 4.681853870739545e-06, + "loss": 0.6037, + "step": 3752 + }, + { + "epoch": 0.996415770609319, + "grad_norm": 0.3974556813432687, + "learning_rate": 4.681683411311649e-06, + "loss": 0.6236, + "step": 3753 + }, + { + "epoch": 0.9966812690827028, + "grad_norm": 0.38323425943369327, + "learning_rate": 4.681512909335585e-06, + "loss": 0.6031, + "step": 3754 + }, + { + "epoch": 0.9969467675560866, + "grad_norm": 0.37955441716052646, + "learning_rate": 4.681342364814678e-06, + "loss": 0.598, + "step": 3755 + }, + { + "epoch": 0.9972122660294703, + "grad_norm": 0.38406770095360143, + "learning_rate": 4.681171777752255e-06, + "loss": 0.5885, + "step": 3756 + }, + { + "epoch": 0.9974777645028541, + "grad_norm": 0.38121222912205954, + "learning_rate": 4.68100114815164e-06, + "loss": 0.6079, + "step": 3757 + }, + { + "epoch": 0.9977432629762378, + "grad_norm": 0.3986173792183184, + "learning_rate": 4.680830476016164e-06, + "loss": 0.6106, + "step": 3758 + }, + { + "epoch": 0.9980087614496217, + "grad_norm": 0.4013301795791731, + "learning_rate": 4.680659761349154e-06, + "loss": 0.5945, + "step": 3759 + }, + { + "epoch": 0.9982742599230054, + "grad_norm": 0.3969451492757828, + "learning_rate": 4.680489004153939e-06, + "loss": 0.6177, + "step": 3760 + }, + { + "epoch": 0.9985397583963892, + "grad_norm": 0.37550632993955513, + "learning_rate": 4.6803182044338506e-06, + "loss": 0.612, + "step": 3761 + }, + { + "epoch": 0.998805256869773, + "grad_norm": 0.3903113437702318, + "learning_rate": 4.680147362192218e-06, + "loss": 0.6479, + "step": 3762 + }, + { + "epoch": 0.9990707553431568, + "grad_norm": 0.3737328244921276, + "learning_rate": 4.679976477432373e-06, + "loss": 0.5976, + "step": 3763 + }, + { + "epoch": 0.9993362538165406, + "grad_norm": 0.3707473898268587, + "learning_rate": 4.679805550157651e-06, + "loss": 0.5925, + "step": 3764 + }, + { + "epoch": 0.9996017522899243, + "grad_norm": 0.4049322640665022, + "learning_rate": 4.679634580371383e-06, + "loss": 0.5632, + "step": 3765 + }, + { + "epoch": 0.9998672507633081, + "grad_norm": 0.3858564461362987, + "learning_rate": 4.679463568076904e-06, + "loss": 0.5944, + "step": 3766 + }, + { + "epoch": 1.0, + "grad_norm": 0.3858564461362987, + "learning_rate": 4.6792925132775495e-06, + "loss": 0.6066, + "step": 3767 + }, + { + "epoch": 1.0002654984733839, + "grad_norm": 0.6259057377833318, + "learning_rate": 4.679121415976654e-06, + "loss": 0.5663, + "step": 3768 + }, + { + "epoch": 1.0005309969467675, + "grad_norm": 0.3963461731153748, + "learning_rate": 4.678950276177556e-06, + "loss": 0.6094, + "step": 3769 + }, + { + "epoch": 1.0007964954201514, + "grad_norm": 0.41829513680315344, + "learning_rate": 4.678779093883593e-06, + "loss": 0.5822, + "step": 3770 + }, + { + "epoch": 1.001061993893535, + "grad_norm": 0.3897118635509286, + "learning_rate": 4.6786078690981026e-06, + "loss": 0.6003, + "step": 3771 + }, + { + "epoch": 1.0013274923669189, + "grad_norm": 0.40909010762029085, + "learning_rate": 4.678436601824426e-06, + "loss": 0.5824, + "step": 3772 + }, + { + "epoch": 1.0015929908403027, + "grad_norm": 0.4079357338221315, + "learning_rate": 4.678265292065901e-06, + "loss": 0.6177, + "step": 3773 + }, + { + "epoch": 1.0018584893136864, + "grad_norm": 0.3975615536380815, + "learning_rate": 4.678093939825869e-06, + "loss": 0.5731, + "step": 3774 + }, + { + "epoch": 1.0021239877870702, + "grad_norm": 0.39046478516719413, + "learning_rate": 4.677922545107673e-06, + "loss": 0.6115, + "step": 3775 + }, + { + "epoch": 1.002389486260454, + "grad_norm": 0.40044517494805526, + "learning_rate": 4.677751107914654e-06, + "loss": 0.6184, + "step": 3776 + }, + { + "epoch": 1.0026549847338377, + "grad_norm": 0.4019656685191064, + "learning_rate": 4.677579628250157e-06, + "loss": 0.6468, + "step": 3777 + }, + { + "epoch": 1.0029204832072216, + "grad_norm": 0.3869316355567388, + "learning_rate": 4.677408106117526e-06, + "loss": 0.6013, + "step": 3778 + }, + { + "epoch": 1.0031859816806052, + "grad_norm": 0.3951489761742382, + "learning_rate": 4.677236541520105e-06, + "loss": 0.6157, + "step": 3779 + }, + { + "epoch": 1.003451480153989, + "grad_norm": 0.39593968440594485, + "learning_rate": 4.677064934461241e-06, + "loss": 0.5898, + "step": 3780 + }, + { + "epoch": 1.003716978627373, + "grad_norm": 0.4085811771712458, + "learning_rate": 4.676893284944281e-06, + "loss": 0.6061, + "step": 3781 + }, + { + "epoch": 1.0039824771007566, + "grad_norm": 0.408787436234934, + "learning_rate": 4.676721592972571e-06, + "loss": 0.5558, + "step": 3782 + }, + { + "epoch": 1.0042479755741405, + "grad_norm": 0.38579083766709615, + "learning_rate": 4.67654985854946e-06, + "loss": 0.5695, + "step": 3783 + }, + { + "epoch": 1.0045134740475243, + "grad_norm": 0.3855617336310054, + "learning_rate": 4.676378081678299e-06, + "loss": 0.571, + "step": 3784 + }, + { + "epoch": 1.004778972520908, + "grad_norm": 0.37781350078508324, + "learning_rate": 4.676206262362436e-06, + "loss": 0.6223, + "step": 3785 + }, + { + "epoch": 1.0050444709942918, + "grad_norm": 0.40473186389736987, + "learning_rate": 4.676034400605223e-06, + "loss": 0.6438, + "step": 3786 + }, + { + "epoch": 1.0053099694676755, + "grad_norm": 0.3964697621523235, + "learning_rate": 4.6758624964100104e-06, + "loss": 0.5797, + "step": 3787 + }, + { + "epoch": 1.0055754679410593, + "grad_norm": 0.40271319237949815, + "learning_rate": 4.675690549780152e-06, + "loss": 0.5945, + "step": 3788 + }, + { + "epoch": 1.0058409664144432, + "grad_norm": 0.3879779861067464, + "learning_rate": 4.675518560719002e-06, + "loss": 0.604, + "step": 3789 + }, + { + "epoch": 1.0061064648878268, + "grad_norm": 0.39868356294679413, + "learning_rate": 4.675346529229912e-06, + "loss": 0.603, + "step": 3790 + }, + { + "epoch": 1.0063719633612107, + "grad_norm": 0.39674484222218653, + "learning_rate": 4.67517445531624e-06, + "loss": 0.6126, + "step": 3791 + }, + { + "epoch": 1.0066374618345943, + "grad_norm": 0.4022661282035222, + "learning_rate": 4.675002338981339e-06, + "loss": 0.5996, + "step": 3792 + }, + { + "epoch": 1.0069029603079782, + "grad_norm": 0.3975232938458579, + "learning_rate": 4.674830180228568e-06, + "loss": 0.6382, + "step": 3793 + }, + { + "epoch": 1.007168458781362, + "grad_norm": 0.3928531816249291, + "learning_rate": 4.6746579790612824e-06, + "loss": 0.6114, + "step": 3794 + }, + { + "epoch": 1.0074339572547457, + "grad_norm": 0.39539341044791426, + "learning_rate": 4.674485735482843e-06, + "loss": 0.5988, + "step": 3795 + }, + { + "epoch": 1.0076994557281296, + "grad_norm": 0.3965256695581769, + "learning_rate": 4.674313449496607e-06, + "loss": 0.6046, + "step": 3796 + }, + { + "epoch": 1.0079649542015134, + "grad_norm": 0.37929220201522545, + "learning_rate": 4.674141121105935e-06, + "loss": 0.631, + "step": 3797 + }, + { + "epoch": 1.008230452674897, + "grad_norm": 0.3862540210375224, + "learning_rate": 4.673968750314189e-06, + "loss": 0.5596, + "step": 3798 + }, + { + "epoch": 1.008495951148281, + "grad_norm": 0.3873002440554336, + "learning_rate": 4.673796337124729e-06, + "loss": 0.6062, + "step": 3799 + }, + { + "epoch": 1.0087614496216646, + "grad_norm": 0.40964134645451533, + "learning_rate": 4.673623881540917e-06, + "loss": 0.6488, + "step": 3800 + }, + { + "epoch": 1.0090269480950484, + "grad_norm": 0.4012541396609028, + "learning_rate": 4.6734513835661184e-06, + "loss": 0.6168, + "step": 3801 + }, + { + "epoch": 1.0092924465684323, + "grad_norm": 0.3885834564617068, + "learning_rate": 4.673278843203697e-06, + "loss": 0.593, + "step": 3802 + }, + { + "epoch": 1.009557945041816, + "grad_norm": 0.38139183602894294, + "learning_rate": 4.673106260457015e-06, + "loss": 0.604, + "step": 3803 + }, + { + "epoch": 1.0098234435151998, + "grad_norm": 0.39949221812252345, + "learning_rate": 4.6729336353294415e-06, + "loss": 0.5734, + "step": 3804 + }, + { + "epoch": 1.0100889419885837, + "grad_norm": 0.40253401981329706, + "learning_rate": 4.6727609678243426e-06, + "loss": 0.6145, + "step": 3805 + }, + { + "epoch": 1.0103544404619673, + "grad_norm": 0.38671036359841193, + "learning_rate": 4.672588257945085e-06, + "loss": 0.5973, + "step": 3806 + }, + { + "epoch": 1.0106199389353512, + "grad_norm": 0.3972410132459281, + "learning_rate": 4.672415505695035e-06, + "loss": 0.6014, + "step": 3807 + }, + { + "epoch": 1.0108854374087348, + "grad_norm": 0.3938474800282929, + "learning_rate": 4.672242711077565e-06, + "loss": 0.5835, + "step": 3808 + }, + { + "epoch": 1.0111509358821187, + "grad_norm": 0.4080790671272506, + "learning_rate": 4.672069874096044e-06, + "loss": 0.5673, + "step": 3809 + }, + { + "epoch": 1.0114164343555025, + "grad_norm": 0.38773639414212874, + "learning_rate": 4.671896994753841e-06, + "loss": 0.6199, + "step": 3810 + }, + { + "epoch": 1.0116819328288862, + "grad_norm": 0.3855485014693545, + "learning_rate": 4.67172407305433e-06, + "loss": 0.5991, + "step": 3811 + }, + { + "epoch": 1.01194743130227, + "grad_norm": 0.38890547216898175, + "learning_rate": 4.671551109000882e-06, + "loss": 0.6324, + "step": 3812 + }, + { + "epoch": 1.0122129297756537, + "grad_norm": 0.4100699456509582, + "learning_rate": 4.67137810259687e-06, + "loss": 0.6201, + "step": 3813 + }, + { + "epoch": 1.0124784282490376, + "grad_norm": 0.3871235114915657, + "learning_rate": 4.671205053845669e-06, + "loss": 0.6018, + "step": 3814 + }, + { + "epoch": 1.0127439267224214, + "grad_norm": 0.3848621701251693, + "learning_rate": 4.671031962750653e-06, + "loss": 0.577, + "step": 3815 + }, + { + "epoch": 1.013009425195805, + "grad_norm": 0.3979228799548935, + "learning_rate": 4.670858829315198e-06, + "loss": 0.5473, + "step": 3816 + }, + { + "epoch": 1.013274923669189, + "grad_norm": 0.38593714215860553, + "learning_rate": 4.670685653542682e-06, + "loss": 0.6116, + "step": 3817 + }, + { + "epoch": 1.0135404221425728, + "grad_norm": 0.40542261226495707, + "learning_rate": 4.670512435436479e-06, + "loss": 0.5714, + "step": 3818 + }, + { + "epoch": 1.0138059206159564, + "grad_norm": 0.4013574318406041, + "learning_rate": 4.670339174999971e-06, + "loss": 0.6037, + "step": 3819 + }, + { + "epoch": 1.0140714190893403, + "grad_norm": 0.3883892623864832, + "learning_rate": 4.670165872236534e-06, + "loss": 0.5923, + "step": 3820 + }, + { + "epoch": 1.014336917562724, + "grad_norm": 0.3886093085221166, + "learning_rate": 4.6699925271495495e-06, + "loss": 0.5849, + "step": 3821 + }, + { + "epoch": 1.0146024160361078, + "grad_norm": 0.4035935907573044, + "learning_rate": 4.669819139742398e-06, + "loss": 0.6267, + "step": 3822 + }, + { + "epoch": 1.0148679145094917, + "grad_norm": 0.3972272760666861, + "learning_rate": 4.66964571001846e-06, + "loss": 0.5686, + "step": 3823 + }, + { + "epoch": 1.0151334129828753, + "grad_norm": 0.38617956951169913, + "learning_rate": 4.669472237981118e-06, + "loss": 0.6274, + "step": 3824 + }, + { + "epoch": 1.0153989114562592, + "grad_norm": 0.3893845128129732, + "learning_rate": 4.669298723633757e-06, + "loss": 0.594, + "step": 3825 + }, + { + "epoch": 1.0156644099296428, + "grad_norm": 0.3768802731056577, + "learning_rate": 4.669125166979758e-06, + "loss": 0.5987, + "step": 3826 + }, + { + "epoch": 1.0159299084030267, + "grad_norm": 0.4173704274274717, + "learning_rate": 4.668951568022508e-06, + "loss": 0.5663, + "step": 3827 + }, + { + "epoch": 1.0161954068764105, + "grad_norm": 0.4189146773867367, + "learning_rate": 4.668777926765392e-06, + "loss": 0.6065, + "step": 3828 + }, + { + "epoch": 1.0164609053497942, + "grad_norm": 0.39468865392240277, + "learning_rate": 4.668604243211797e-06, + "loss": 0.6022, + "step": 3829 + }, + { + "epoch": 1.016726403823178, + "grad_norm": 0.4012418917127769, + "learning_rate": 4.668430517365109e-06, + "loss": 0.5912, + "step": 3830 + }, + { + "epoch": 1.016991902296562, + "grad_norm": 0.3997728918042466, + "learning_rate": 4.668256749228717e-06, + "loss": 0.6109, + "step": 3831 + }, + { + "epoch": 1.0172574007699455, + "grad_norm": 0.3857273297766302, + "learning_rate": 4.668082938806009e-06, + "loss": 0.5956, + "step": 3832 + }, + { + "epoch": 1.0175228992433294, + "grad_norm": 0.40515089971101936, + "learning_rate": 4.667909086100375e-06, + "loss": 0.6109, + "step": 3833 + }, + { + "epoch": 1.017788397716713, + "grad_norm": 0.40120298631566786, + "learning_rate": 4.667735191115207e-06, + "loss": 0.6109, + "step": 3834 + }, + { + "epoch": 1.018053896190097, + "grad_norm": 0.4098672779003959, + "learning_rate": 4.667561253853894e-06, + "loss": 0.594, + "step": 3835 + }, + { + "epoch": 1.0183193946634808, + "grad_norm": 0.39335799588059045, + "learning_rate": 4.66738727431983e-06, + "loss": 0.5745, + "step": 3836 + }, + { + "epoch": 1.0185848931368644, + "grad_norm": 0.40248586436280626, + "learning_rate": 4.667213252516408e-06, + "loss": 0.6454, + "step": 3837 + }, + { + "epoch": 1.0188503916102483, + "grad_norm": 0.4016739251164001, + "learning_rate": 4.667039188447021e-06, + "loss": 0.5908, + "step": 3838 + }, + { + "epoch": 1.0191158900836321, + "grad_norm": 0.3831738281956251, + "learning_rate": 4.666865082115064e-06, + "loss": 0.6099, + "step": 3839 + }, + { + "epoch": 1.0193813885570158, + "grad_norm": 0.40143125602335383, + "learning_rate": 4.666690933523932e-06, + "loss": 0.556, + "step": 3840 + }, + { + "epoch": 1.0196468870303996, + "grad_norm": 0.40232850105449164, + "learning_rate": 4.666516742677022e-06, + "loss": 0.6159, + "step": 3841 + }, + { + "epoch": 1.0199123855037833, + "grad_norm": 0.39724733564801323, + "learning_rate": 4.666342509577731e-06, + "loss": 0.5946, + "step": 3842 + }, + { + "epoch": 1.0201778839771671, + "grad_norm": 0.3942266242092512, + "learning_rate": 4.666168234229457e-06, + "loss": 0.6218, + "step": 3843 + }, + { + "epoch": 1.020443382450551, + "grad_norm": 0.39671314798907953, + "learning_rate": 4.665993916635599e-06, + "loss": 0.6053, + "step": 3844 + }, + { + "epoch": 1.0207088809239346, + "grad_norm": 0.394670254581841, + "learning_rate": 4.665819556799556e-06, + "loss": 0.6352, + "step": 3845 + }, + { + "epoch": 1.0209743793973185, + "grad_norm": 0.400681560274759, + "learning_rate": 4.66564515472473e-06, + "loss": 0.5699, + "step": 3846 + }, + { + "epoch": 1.0212398778707021, + "grad_norm": 0.3883498993718981, + "learning_rate": 4.66547071041452e-06, + "loss": 0.6036, + "step": 3847 + }, + { + "epoch": 1.021505376344086, + "grad_norm": 0.4089043937265339, + "learning_rate": 4.665296223872328e-06, + "loss": 0.5819, + "step": 3848 + }, + { + "epoch": 1.0217708748174699, + "grad_norm": 0.4073348056144677, + "learning_rate": 4.66512169510156e-06, + "loss": 0.5983, + "step": 3849 + }, + { + "epoch": 1.0220363732908535, + "grad_norm": 0.4043560168257078, + "learning_rate": 4.664947124105617e-06, + "loss": 0.6051, + "step": 3850 + }, + { + "epoch": 1.0223018717642374, + "grad_norm": 0.4039619958705381, + "learning_rate": 4.664772510887905e-06, + "loss": 0.6379, + "step": 3851 + }, + { + "epoch": 1.0225673702376212, + "grad_norm": 0.3965065767538858, + "learning_rate": 4.664597855451828e-06, + "loss": 0.578, + "step": 3852 + }, + { + "epoch": 1.0228328687110049, + "grad_norm": 0.38789913747237514, + "learning_rate": 4.664423157800793e-06, + "loss": 0.6055, + "step": 3853 + }, + { + "epoch": 1.0230983671843887, + "grad_norm": 0.41978173442823347, + "learning_rate": 4.6642484179382075e-06, + "loss": 0.5887, + "step": 3854 + }, + { + "epoch": 1.0233638656577724, + "grad_norm": 0.3968057102416469, + "learning_rate": 4.664073635867479e-06, + "loss": 0.6011, + "step": 3855 + }, + { + "epoch": 1.0236293641311562, + "grad_norm": 0.38760434076691996, + "learning_rate": 4.663898811592015e-06, + "loss": 0.6018, + "step": 3856 + }, + { + "epoch": 1.02389486260454, + "grad_norm": 0.40156851207975036, + "learning_rate": 4.663723945115227e-06, + "loss": 0.5885, + "step": 3857 + }, + { + "epoch": 1.0241603610779237, + "grad_norm": 0.39866949319640016, + "learning_rate": 4.663549036440523e-06, + "loss": 0.6113, + "step": 3858 + }, + { + "epoch": 1.0244258595513076, + "grad_norm": 0.38432397411316843, + "learning_rate": 4.663374085571317e-06, + "loss": 0.5956, + "step": 3859 + }, + { + "epoch": 1.0246913580246915, + "grad_norm": 0.39993289466861925, + "learning_rate": 4.663199092511019e-06, + "loss": 0.5908, + "step": 3860 + }, + { + "epoch": 1.024956856498075, + "grad_norm": 0.40924468509791817, + "learning_rate": 4.663024057263042e-06, + "loss": 0.6431, + "step": 3861 + }, + { + "epoch": 1.025222354971459, + "grad_norm": 0.40891860498148186, + "learning_rate": 4.662848979830801e-06, + "loss": 0.612, + "step": 3862 + }, + { + "epoch": 1.0254878534448426, + "grad_norm": 0.3970686004743134, + "learning_rate": 4.662673860217708e-06, + "loss": 0.5941, + "step": 3863 + }, + { + "epoch": 1.0257533519182265, + "grad_norm": 0.4283242202155573, + "learning_rate": 4.66249869842718e-06, + "loss": 0.6059, + "step": 3864 + }, + { + "epoch": 1.0260188503916103, + "grad_norm": 0.39952309425662036, + "learning_rate": 4.662323494462633e-06, + "loss": 0.5891, + "step": 3865 + }, + { + "epoch": 1.026284348864994, + "grad_norm": 0.39890012480983744, + "learning_rate": 4.662148248327484e-06, + "loss": 0.5883, + "step": 3866 + }, + { + "epoch": 1.0265498473383778, + "grad_norm": 0.39388272422894277, + "learning_rate": 4.661972960025149e-06, + "loss": 0.6069, + "step": 3867 + }, + { + "epoch": 1.0268153458117615, + "grad_norm": 0.4108952603539573, + "learning_rate": 4.661797629559048e-06, + "loss": 0.6498, + "step": 3868 + }, + { + "epoch": 1.0270808442851453, + "grad_norm": 0.4183665697277097, + "learning_rate": 4.661622256932601e-06, + "loss": 0.5946, + "step": 3869 + }, + { + "epoch": 1.0273463427585292, + "grad_norm": 0.40502224138892995, + "learning_rate": 4.661446842149228e-06, + "loss": 0.575, + "step": 3870 + }, + { + "epoch": 1.0276118412319128, + "grad_norm": 0.44647338276446036, + "learning_rate": 4.661271385212348e-06, + "loss": 0.6272, + "step": 3871 + }, + { + "epoch": 1.0278773397052967, + "grad_norm": 0.3888651813594647, + "learning_rate": 4.6610958861253844e-06, + "loss": 0.5596, + "step": 3872 + }, + { + "epoch": 1.0281428381786806, + "grad_norm": 0.383801902184117, + "learning_rate": 4.66092034489176e-06, + "loss": 0.5966, + "step": 3873 + }, + { + "epoch": 1.0284083366520642, + "grad_norm": 0.3906959120853525, + "learning_rate": 4.660744761514899e-06, + "loss": 0.5992, + "step": 3874 + }, + { + "epoch": 1.028673835125448, + "grad_norm": 0.40496452264374355, + "learning_rate": 4.660569135998224e-06, + "loss": 0.6177, + "step": 3875 + }, + { + "epoch": 1.0289393335988317, + "grad_norm": 0.3984213404972599, + "learning_rate": 4.66039346834516e-06, + "loss": 0.59, + "step": 3876 + }, + { + "epoch": 1.0292048320722156, + "grad_norm": 0.39626090226395955, + "learning_rate": 4.660217758559135e-06, + "loss": 0.5982, + "step": 3877 + }, + { + "epoch": 1.0294703305455994, + "grad_norm": 0.39995172426394393, + "learning_rate": 4.660042006643574e-06, + "loss": 0.6101, + "step": 3878 + }, + { + "epoch": 1.029735829018983, + "grad_norm": 0.3985513612786605, + "learning_rate": 4.659866212601906e-06, + "loss": 0.5726, + "step": 3879 + }, + { + "epoch": 1.030001327492367, + "grad_norm": 0.40356926884575833, + "learning_rate": 4.659690376437558e-06, + "loss": 0.5837, + "step": 3880 + }, + { + "epoch": 1.0302668259657506, + "grad_norm": 0.40091577644762116, + "learning_rate": 4.65951449815396e-06, + "loss": 0.6156, + "step": 3881 + }, + { + "epoch": 1.0305323244391345, + "grad_norm": 0.4139339054772377, + "learning_rate": 4.659338577754541e-06, + "loss": 0.587, + "step": 3882 + }, + { + "epoch": 1.0307978229125183, + "grad_norm": 0.4034515787037412, + "learning_rate": 4.659162615242734e-06, + "loss": 0.6472, + "step": 3883 + }, + { + "epoch": 1.031063321385902, + "grad_norm": 0.4065605403973272, + "learning_rate": 4.658986610621969e-06, + "loss": 0.5891, + "step": 3884 + }, + { + "epoch": 1.0313288198592858, + "grad_norm": 0.4000505817178667, + "learning_rate": 4.6588105638956785e-06, + "loss": 0.5814, + "step": 3885 + }, + { + "epoch": 1.0315943183326697, + "grad_norm": 0.3951311139365383, + "learning_rate": 4.658634475067297e-06, + "loss": 0.5969, + "step": 3886 + }, + { + "epoch": 1.0318598168060533, + "grad_norm": 0.3804157236603994, + "learning_rate": 4.658458344140258e-06, + "loss": 0.5734, + "step": 3887 + }, + { + "epoch": 1.0321253152794372, + "grad_norm": 0.39420004854683033, + "learning_rate": 4.658282171117996e-06, + "loss": 0.6018, + "step": 3888 + }, + { + "epoch": 1.0323908137528208, + "grad_norm": 0.4162983486621894, + "learning_rate": 4.6581059560039475e-06, + "loss": 0.5929, + "step": 3889 + }, + { + "epoch": 1.0326563122262047, + "grad_norm": 0.3959197888088827, + "learning_rate": 4.657929698801549e-06, + "loss": 0.6028, + "step": 3890 + }, + { + "epoch": 1.0329218106995885, + "grad_norm": 0.39309517282903017, + "learning_rate": 4.657753399514238e-06, + "loss": 0.6028, + "step": 3891 + }, + { + "epoch": 1.0331873091729722, + "grad_norm": 0.3990977660770755, + "learning_rate": 4.657577058145451e-06, + "loss": 0.6233, + "step": 3892 + }, + { + "epoch": 1.033452807646356, + "grad_norm": 0.4091445503169717, + "learning_rate": 4.657400674698631e-06, + "loss": 0.5942, + "step": 3893 + }, + { + "epoch": 1.03371830611974, + "grad_norm": 0.4000936986328691, + "learning_rate": 4.657224249177215e-06, + "loss": 0.5487, + "step": 3894 + }, + { + "epoch": 1.0339838045931236, + "grad_norm": 0.3829671456833252, + "learning_rate": 4.657047781584643e-06, + "loss": 0.5863, + "step": 3895 + }, + { + "epoch": 1.0342493030665074, + "grad_norm": 0.387430340212757, + "learning_rate": 4.656871271924359e-06, + "loss": 0.5785, + "step": 3896 + }, + { + "epoch": 1.034514801539891, + "grad_norm": 0.4017994748236961, + "learning_rate": 4.656694720199805e-06, + "loss": 0.6127, + "step": 3897 + }, + { + "epoch": 1.034780300013275, + "grad_norm": 0.4037994730988607, + "learning_rate": 4.656518126414422e-06, + "loss": 0.5753, + "step": 3898 + }, + { + "epoch": 1.0350457984866588, + "grad_norm": 0.39784196668284155, + "learning_rate": 4.656341490571657e-06, + "loss": 0.5983, + "step": 3899 + }, + { + "epoch": 1.0353112969600424, + "grad_norm": 0.4018079959932046, + "learning_rate": 4.656164812674952e-06, + "loss": 0.566, + "step": 3900 + }, + { + "epoch": 1.0355767954334263, + "grad_norm": 0.40493489957738565, + "learning_rate": 4.655988092727754e-06, + "loss": 0.594, + "step": 3901 + }, + { + "epoch": 1.03584229390681, + "grad_norm": 0.40057723838207704, + "learning_rate": 4.65581133073351e-06, + "loss": 0.5898, + "step": 3902 + }, + { + "epoch": 1.0361077923801938, + "grad_norm": 0.39390566921995784, + "learning_rate": 4.655634526695666e-06, + "loss": 0.6105, + "step": 3903 + }, + { + "epoch": 1.0363732908535777, + "grad_norm": 0.39611816245475645, + "learning_rate": 4.655457680617672e-06, + "loss": 0.6102, + "step": 3904 + }, + { + "epoch": 1.0366387893269613, + "grad_norm": 0.4199728709384419, + "learning_rate": 4.655280792502975e-06, + "loss": 0.6051, + "step": 3905 + }, + { + "epoch": 1.0369042878003452, + "grad_norm": 0.3889982763926899, + "learning_rate": 4.655103862355026e-06, + "loss": 0.5806, + "step": 3906 + }, + { + "epoch": 1.037169786273729, + "grad_norm": 0.373123833365217, + "learning_rate": 4.654926890177274e-06, + "loss": 0.615, + "step": 3907 + }, + { + "epoch": 1.0374352847471127, + "grad_norm": 0.39676172673818993, + "learning_rate": 4.6547498759731725e-06, + "loss": 0.6125, + "step": 3908 + }, + { + "epoch": 1.0377007832204965, + "grad_norm": 0.39299389999840884, + "learning_rate": 4.654572819746172e-06, + "loss": 0.6378, + "step": 3909 + }, + { + "epoch": 1.0379662816938802, + "grad_norm": 0.3967926998517405, + "learning_rate": 4.6543957214997265e-06, + "loss": 0.578, + "step": 3910 + }, + { + "epoch": 1.038231780167264, + "grad_norm": 0.3940682206990731, + "learning_rate": 4.65421858123729e-06, + "loss": 0.6314, + "step": 3911 + }, + { + "epoch": 1.038497278640648, + "grad_norm": 0.3929337954727868, + "learning_rate": 4.654041398962315e-06, + "loss": 0.5954, + "step": 3912 + }, + { + "epoch": 1.0387627771140315, + "grad_norm": 0.3914247113949542, + "learning_rate": 4.653864174678261e-06, + "loss": 0.6066, + "step": 3913 + }, + { + "epoch": 1.0390282755874154, + "grad_norm": 0.39633798836215656, + "learning_rate": 4.653686908388581e-06, + "loss": 0.5987, + "step": 3914 + }, + { + "epoch": 1.0392937740607993, + "grad_norm": 0.3945795448179149, + "learning_rate": 4.6535096000967325e-06, + "loss": 0.5705, + "step": 3915 + }, + { + "epoch": 1.039559272534183, + "grad_norm": 0.39286220128118704, + "learning_rate": 4.653332249806175e-06, + "loss": 0.6431, + "step": 3916 + }, + { + "epoch": 1.0398247710075668, + "grad_norm": 0.3855911053260191, + "learning_rate": 4.653154857520367e-06, + "loss": 0.5996, + "step": 3917 + }, + { + "epoch": 1.0400902694809504, + "grad_norm": 0.40293359210332946, + "learning_rate": 4.652977423242766e-06, + "loss": 0.6033, + "step": 3918 + }, + { + "epoch": 1.0403557679543343, + "grad_norm": 0.39345500365371766, + "learning_rate": 4.652799946976835e-06, + "loss": 0.5939, + "step": 3919 + }, + { + "epoch": 1.0406212664277181, + "grad_norm": 0.40907690282966475, + "learning_rate": 4.652622428726033e-06, + "loss": 0.6065, + "step": 3920 + }, + { + "epoch": 1.0408867649011018, + "grad_norm": 0.39850429172018254, + "learning_rate": 4.652444868493824e-06, + "loss": 0.6126, + "step": 3921 + }, + { + "epoch": 1.0411522633744856, + "grad_norm": 0.39506095836641636, + "learning_rate": 4.65226726628367e-06, + "loss": 0.5997, + "step": 3922 + }, + { + "epoch": 1.0414177618478693, + "grad_norm": 0.3868743181514466, + "learning_rate": 4.652089622099034e-06, + "loss": 0.5669, + "step": 3923 + }, + { + "epoch": 1.0416832603212531, + "grad_norm": 0.3988298060522133, + "learning_rate": 4.651911935943381e-06, + "loss": 0.6048, + "step": 3924 + }, + { + "epoch": 1.041948758794637, + "grad_norm": 0.39294241088626475, + "learning_rate": 4.651734207820177e-06, + "loss": 0.5982, + "step": 3925 + }, + { + "epoch": 1.0422142572680206, + "grad_norm": 0.39674976460430234, + "learning_rate": 4.6515564377328875e-06, + "loss": 0.5689, + "step": 3926 + }, + { + "epoch": 1.0424797557414045, + "grad_norm": 0.39706026828222624, + "learning_rate": 4.651378625684979e-06, + "loss": 0.6184, + "step": 3927 + }, + { + "epoch": 1.0427452542147884, + "grad_norm": 0.3975827023725938, + "learning_rate": 4.651200771679921e-06, + "loss": 0.5954, + "step": 3928 + }, + { + "epoch": 1.043010752688172, + "grad_norm": 0.39035044752191134, + "learning_rate": 4.65102287572118e-06, + "loss": 0.6116, + "step": 3929 + }, + { + "epoch": 1.0432762511615559, + "grad_norm": 0.4154047059386226, + "learning_rate": 4.650844937812227e-06, + "loss": 0.5963, + "step": 3930 + }, + { + "epoch": 1.0435417496349395, + "grad_norm": 0.4149390956710434, + "learning_rate": 4.65066695795653e-06, + "loss": 0.5663, + "step": 3931 + }, + { + "epoch": 1.0438072481083234, + "grad_norm": 0.3862636405477702, + "learning_rate": 4.650488936157564e-06, + "loss": 0.6071, + "step": 3932 + }, + { + "epoch": 1.0440727465817072, + "grad_norm": 0.42258029916521017, + "learning_rate": 4.650310872418797e-06, + "loss": 0.6158, + "step": 3933 + }, + { + "epoch": 1.0443382450550909, + "grad_norm": 0.39155900609428707, + "learning_rate": 4.650132766743703e-06, + "loss": 0.6031, + "step": 3934 + }, + { + "epoch": 1.0446037435284747, + "grad_norm": 0.4000197891652038, + "learning_rate": 4.649954619135755e-06, + "loss": 0.6021, + "step": 3935 + }, + { + "epoch": 1.0448692420018584, + "grad_norm": 0.39435809474804107, + "learning_rate": 4.6497764295984295e-06, + "loss": 0.5934, + "step": 3936 + }, + { + "epoch": 1.0451347404752422, + "grad_norm": 0.3932886718245807, + "learning_rate": 4.649598198135199e-06, + "loss": 0.5785, + "step": 3937 + }, + { + "epoch": 1.045400238948626, + "grad_norm": 0.3856928734277222, + "learning_rate": 4.649419924749541e-06, + "loss": 0.585, + "step": 3938 + }, + { + "epoch": 1.0456657374220097, + "grad_norm": 0.397321231385105, + "learning_rate": 4.649241609444931e-06, + "loss": 0.5848, + "step": 3939 + }, + { + "epoch": 1.0459312358953936, + "grad_norm": 0.3959339300378574, + "learning_rate": 4.6490632522248485e-06, + "loss": 0.6104, + "step": 3940 + }, + { + "epoch": 1.0461967343687775, + "grad_norm": 0.4174649763601148, + "learning_rate": 4.64888485309277e-06, + "loss": 0.6006, + "step": 3941 + }, + { + "epoch": 1.046462232842161, + "grad_norm": 0.39572924729913433, + "learning_rate": 4.648706412052176e-06, + "loss": 0.6005, + "step": 3942 + }, + { + "epoch": 1.046727731315545, + "grad_norm": 0.4050920069964945, + "learning_rate": 4.648527929106545e-06, + "loss": 0.5916, + "step": 3943 + }, + { + "epoch": 1.0469932297889286, + "grad_norm": 0.40300721216178076, + "learning_rate": 4.64834940425936e-06, + "loss": 0.6268, + "step": 3944 + }, + { + "epoch": 1.0472587282623125, + "grad_norm": 0.41351572312069973, + "learning_rate": 4.648170837514101e-06, + "loss": 0.6354, + "step": 3945 + }, + { + "epoch": 1.0475242267356963, + "grad_norm": 0.41676418583413, + "learning_rate": 4.647992228874252e-06, + "loss": 0.6013, + "step": 3946 + }, + { + "epoch": 1.04778972520908, + "grad_norm": 0.42266597449064286, + "learning_rate": 4.6478135783432945e-06, + "loss": 0.6302, + "step": 3947 + }, + { + "epoch": 1.0480552236824638, + "grad_norm": 0.4046531734540872, + "learning_rate": 4.647634885924713e-06, + "loss": 0.613, + "step": 3948 + }, + { + "epoch": 1.0483207221558477, + "grad_norm": 0.40537812088325564, + "learning_rate": 4.647456151621994e-06, + "loss": 0.6054, + "step": 3949 + }, + { + "epoch": 1.0485862206292313, + "grad_norm": 0.41246438042069344, + "learning_rate": 4.647277375438621e-06, + "loss": 0.6388, + "step": 3950 + }, + { + "epoch": 1.0488517191026152, + "grad_norm": 0.395286229298196, + "learning_rate": 4.647098557378082e-06, + "loss": 0.5996, + "step": 3951 + }, + { + "epoch": 1.0491172175759989, + "grad_norm": 0.4242085118227341, + "learning_rate": 4.646919697443865e-06, + "loss": 0.5566, + "step": 3952 + }, + { + "epoch": 1.0493827160493827, + "grad_norm": 0.3948842848693423, + "learning_rate": 4.646740795639457e-06, + "loss": 0.6018, + "step": 3953 + }, + { + "epoch": 1.0496482145227666, + "grad_norm": 0.3971262371796073, + "learning_rate": 4.646561851968348e-06, + "loss": 0.623, + "step": 3954 + }, + { + "epoch": 1.0499137129961502, + "grad_norm": 0.404608872432813, + "learning_rate": 4.646382866434025e-06, + "loss": 0.6155, + "step": 3955 + }, + { + "epoch": 1.050179211469534, + "grad_norm": 0.4391896833401388, + "learning_rate": 4.646203839039983e-06, + "loss": 0.5745, + "step": 3956 + }, + { + "epoch": 1.050444709942918, + "grad_norm": 0.4086218137284621, + "learning_rate": 4.6460247697897104e-06, + "loss": 0.5631, + "step": 3957 + }, + { + "epoch": 1.0507102084163016, + "grad_norm": 0.38294863021613434, + "learning_rate": 4.6458456586867005e-06, + "loss": 0.5427, + "step": 3958 + }, + { + "epoch": 1.0509757068896854, + "grad_norm": 0.3965531203963847, + "learning_rate": 4.645666505734446e-06, + "loss": 0.6063, + "step": 3959 + }, + { + "epoch": 1.051241205363069, + "grad_norm": 0.45196652342293364, + "learning_rate": 4.645487310936442e-06, + "loss": 0.5812, + "step": 3960 + }, + { + "epoch": 1.051506703836453, + "grad_norm": 0.4229436816368648, + "learning_rate": 4.645308074296182e-06, + "loss": 0.5793, + "step": 3961 + }, + { + "epoch": 1.0517722023098368, + "grad_norm": 0.4039056274450764, + "learning_rate": 4.645128795817162e-06, + "loss": 0.6321, + "step": 3962 + }, + { + "epoch": 1.0520377007832205, + "grad_norm": 0.4078630946423279, + "learning_rate": 4.644949475502878e-06, + "loss": 0.5885, + "step": 3963 + }, + { + "epoch": 1.0523031992566043, + "grad_norm": 0.4281255109122617, + "learning_rate": 4.644770113356827e-06, + "loss": 0.6252, + "step": 3964 + }, + { + "epoch": 1.052568697729988, + "grad_norm": 0.42492414306687437, + "learning_rate": 4.644590709382508e-06, + "loss": 0.6275, + "step": 3965 + }, + { + "epoch": 1.0528341962033718, + "grad_norm": 0.3994959922056805, + "learning_rate": 4.64441126358342e-06, + "loss": 0.6098, + "step": 3966 + }, + { + "epoch": 1.0530996946767557, + "grad_norm": 0.430295370667777, + "learning_rate": 4.644231775963061e-06, + "loss": 0.6003, + "step": 3967 + }, + { + "epoch": 1.0533651931501393, + "grad_norm": 0.4025262604522123, + "learning_rate": 4.6440522465249325e-06, + "loss": 0.5705, + "step": 3968 + }, + { + "epoch": 1.0536306916235232, + "grad_norm": 0.40095141611684554, + "learning_rate": 4.643872675272536e-06, + "loss": 0.5755, + "step": 3969 + }, + { + "epoch": 1.053896190096907, + "grad_norm": 0.3855975982673741, + "learning_rate": 4.6436930622093736e-06, + "loss": 0.5841, + "step": 3970 + }, + { + "epoch": 1.0541616885702907, + "grad_norm": 0.3763769382386772, + "learning_rate": 4.6435134073389476e-06, + "loss": 0.5582, + "step": 3971 + }, + { + "epoch": 1.0544271870436746, + "grad_norm": 0.3898341243635561, + "learning_rate": 4.643333710664761e-06, + "loss": 0.6118, + "step": 3972 + }, + { + "epoch": 1.0546926855170582, + "grad_norm": 0.4072152099711562, + "learning_rate": 4.64315397219032e-06, + "loss": 0.5526, + "step": 3973 + }, + { + "epoch": 1.054958183990442, + "grad_norm": 0.39817168003443787, + "learning_rate": 4.642974191919129e-06, + "loss": 0.6171, + "step": 3974 + }, + { + "epoch": 1.055223682463826, + "grad_norm": 0.39582321590602365, + "learning_rate": 4.642794369854695e-06, + "loss": 0.5945, + "step": 3975 + }, + { + "epoch": 1.0554891809372096, + "grad_norm": 0.39713867375513295, + "learning_rate": 4.642614506000524e-06, + "loss": 0.591, + "step": 3976 + }, + { + "epoch": 1.0557546794105934, + "grad_norm": 0.39985740999871866, + "learning_rate": 4.642434600360124e-06, + "loss": 0.6041, + "step": 3977 + }, + { + "epoch": 1.056020177883977, + "grad_norm": 0.40421810047672957, + "learning_rate": 4.642254652937003e-06, + "loss": 0.5846, + "step": 3978 + }, + { + "epoch": 1.056285676357361, + "grad_norm": 0.4135223492425173, + "learning_rate": 4.6420746637346716e-06, + "loss": 0.5858, + "step": 3979 + }, + { + "epoch": 1.0565511748307448, + "grad_norm": 0.39687827660462077, + "learning_rate": 4.64189463275664e-06, + "loss": 0.569, + "step": 3980 + }, + { + "epoch": 1.0568166733041284, + "grad_norm": 0.3977025868764313, + "learning_rate": 4.641714560006418e-06, + "loss": 0.5852, + "step": 3981 + }, + { + "epoch": 1.0570821717775123, + "grad_norm": 0.40460146590018226, + "learning_rate": 4.641534445487519e-06, + "loss": 0.6124, + "step": 3982 + }, + { + "epoch": 1.0573476702508962, + "grad_norm": 0.3801516478573456, + "learning_rate": 4.641354289203455e-06, + "loss": 0.6004, + "step": 3983 + }, + { + "epoch": 1.0576131687242798, + "grad_norm": 0.4046332239665869, + "learning_rate": 4.6411740911577386e-06, + "loss": 0.6091, + "step": 3984 + }, + { + "epoch": 1.0578786671976637, + "grad_norm": 0.4187030458623271, + "learning_rate": 4.640993851353885e-06, + "loss": 0.5953, + "step": 3985 + }, + { + "epoch": 1.0581441656710473, + "grad_norm": 0.38875551995967106, + "learning_rate": 4.640813569795411e-06, + "loss": 0.5498, + "step": 3986 + }, + { + "epoch": 1.0584096641444312, + "grad_norm": 0.3942155624605592, + "learning_rate": 4.640633246485828e-06, + "loss": 0.5853, + "step": 3987 + }, + { + "epoch": 1.058675162617815, + "grad_norm": 0.4051403453068996, + "learning_rate": 4.6404528814286575e-06, + "loss": 0.6415, + "step": 3988 + }, + { + "epoch": 1.0589406610911987, + "grad_norm": 0.4110702761747505, + "learning_rate": 4.640272474627414e-06, + "loss": 0.5783, + "step": 3989 + }, + { + "epoch": 1.0592061595645825, + "grad_norm": 0.38866081164380356, + "learning_rate": 4.6400920260856184e-06, + "loss": 0.6032, + "step": 3990 + }, + { + "epoch": 1.0594716580379662, + "grad_norm": 0.4053554002886409, + "learning_rate": 4.639911535806787e-06, + "loss": 0.5987, + "step": 3991 + }, + { + "epoch": 1.05973715651135, + "grad_norm": 0.39492411529486454, + "learning_rate": 4.639731003794442e-06, + "loss": 0.5614, + "step": 3992 + }, + { + "epoch": 1.060002654984734, + "grad_norm": 0.4114656610011355, + "learning_rate": 4.639550430052103e-06, + "loss": 0.6325, + "step": 3993 + }, + { + "epoch": 1.0602681534581175, + "grad_norm": 0.4020381155232606, + "learning_rate": 4.639369814583292e-06, + "loss": 0.6457, + "step": 3994 + }, + { + "epoch": 1.0605336519315014, + "grad_norm": 0.39583993671126844, + "learning_rate": 4.6391891573915324e-06, + "loss": 0.6063, + "step": 3995 + }, + { + "epoch": 1.0607991504048853, + "grad_norm": 0.39995343768694624, + "learning_rate": 4.639008458480346e-06, + "loss": 0.5923, + "step": 3996 + }, + { + "epoch": 1.061064648878269, + "grad_norm": 0.3962011462952487, + "learning_rate": 4.638827717853258e-06, + "loss": 0.5937, + "step": 3997 + }, + { + "epoch": 1.0613301473516528, + "grad_norm": 0.3997531766832948, + "learning_rate": 4.638646935513792e-06, + "loss": 0.5774, + "step": 3998 + }, + { + "epoch": 1.0615956458250364, + "grad_norm": 0.38683666221284196, + "learning_rate": 4.638466111465474e-06, + "loss": 0.5843, + "step": 3999 + }, + { + "epoch": 1.0618611442984203, + "grad_norm": 0.4108375489718136, + "learning_rate": 4.638285245711832e-06, + "loss": 0.5789, + "step": 4000 + }, + { + "epoch": 1.0621266427718041, + "grad_norm": 0.3936205346293354, + "learning_rate": 4.638104338256392e-06, + "loss": 0.5664, + "step": 4001 + }, + { + "epoch": 1.0623921412451878, + "grad_norm": 0.39522341551196094, + "learning_rate": 4.6379233891026825e-06, + "loss": 0.6, + "step": 4002 + }, + { + "epoch": 1.0626576397185716, + "grad_norm": 0.3915513163077133, + "learning_rate": 4.637742398254232e-06, + "loss": 0.598, + "step": 4003 + }, + { + "epoch": 1.0629231381919555, + "grad_norm": 0.3895599518886543, + "learning_rate": 4.637561365714571e-06, + "loss": 0.5873, + "step": 4004 + }, + { + "epoch": 1.0631886366653391, + "grad_norm": 0.41083908834248734, + "learning_rate": 4.63738029148723e-06, + "loss": 0.5793, + "step": 4005 + }, + { + "epoch": 1.063454135138723, + "grad_norm": 0.3924829090295871, + "learning_rate": 4.63719917557574e-06, + "loss": 0.6103, + "step": 4006 + }, + { + "epoch": 1.0637196336121066, + "grad_norm": 0.4037497914256407, + "learning_rate": 4.6370180179836325e-06, + "loss": 0.5627, + "step": 4007 + }, + { + "epoch": 1.0639851320854905, + "grad_norm": 0.40973722859646755, + "learning_rate": 4.6368368187144415e-06, + "loss": 0.599, + "step": 4008 + }, + { + "epoch": 1.0642506305588744, + "grad_norm": 0.4088499627365484, + "learning_rate": 4.636655577771701e-06, + "loss": 0.5812, + "step": 4009 + }, + { + "epoch": 1.064516129032258, + "grad_norm": 0.3923980546324671, + "learning_rate": 4.636474295158946e-06, + "loss": 0.5899, + "step": 4010 + }, + { + "epoch": 1.0647816275056419, + "grad_norm": 0.40326377613545966, + "learning_rate": 4.63629297087971e-06, + "loss": 0.5754, + "step": 4011 + }, + { + "epoch": 1.0650471259790257, + "grad_norm": 0.40855081579138747, + "learning_rate": 4.636111604937531e-06, + "loss": 0.623, + "step": 4012 + }, + { + "epoch": 1.0653126244524094, + "grad_norm": 0.40059093995638795, + "learning_rate": 4.635930197335946e-06, + "loss": 0.5479, + "step": 4013 + }, + { + "epoch": 1.0655781229257932, + "grad_norm": 0.3994927713733089, + "learning_rate": 4.635748748078493e-06, + "loss": 0.6155, + "step": 4014 + }, + { + "epoch": 1.0658436213991769, + "grad_norm": 0.4137964448651529, + "learning_rate": 4.635567257168708e-06, + "loss": 0.5958, + "step": 4015 + }, + { + "epoch": 1.0661091198725607, + "grad_norm": 0.41072798377557773, + "learning_rate": 4.635385724610134e-06, + "loss": 0.6297, + "step": 4016 + }, + { + "epoch": 1.0663746183459446, + "grad_norm": 0.395282013123413, + "learning_rate": 4.635204150406311e-06, + "loss": 0.592, + "step": 4017 + }, + { + "epoch": 1.0666401168193282, + "grad_norm": 0.4005879453539046, + "learning_rate": 4.635022534560778e-06, + "loss": 0.6176, + "step": 4018 + }, + { + "epoch": 1.066905615292712, + "grad_norm": 0.3991568310917474, + "learning_rate": 4.634840877077078e-06, + "loss": 0.5751, + "step": 4019 + }, + { + "epoch": 1.0671711137660957, + "grad_norm": 0.3945368982901071, + "learning_rate": 4.634659177958754e-06, + "loss": 0.5831, + "step": 4020 + }, + { + "epoch": 1.0674366122394796, + "grad_norm": 0.38894346471271196, + "learning_rate": 4.634477437209351e-06, + "loss": 0.606, + "step": 4021 + }, + { + "epoch": 1.0677021107128635, + "grad_norm": 0.40818729118424146, + "learning_rate": 4.63429565483241e-06, + "loss": 0.5838, + "step": 4022 + }, + { + "epoch": 1.0679676091862471, + "grad_norm": 0.4020142420116359, + "learning_rate": 4.6341138308314795e-06, + "loss": 0.63, + "step": 4023 + }, + { + "epoch": 1.068233107659631, + "grad_norm": 0.40365132656974057, + "learning_rate": 4.633931965210103e-06, + "loss": 0.5859, + "step": 4024 + }, + { + "epoch": 1.0684986061330148, + "grad_norm": 0.4131720649015307, + "learning_rate": 4.633750057971829e-06, + "loss": 0.5625, + "step": 4025 + }, + { + "epoch": 1.0687641046063985, + "grad_norm": 0.41242328153385804, + "learning_rate": 4.6335681091202035e-06, + "loss": 0.6144, + "step": 4026 + }, + { + "epoch": 1.0690296030797823, + "grad_norm": 0.39882614654718024, + "learning_rate": 4.633386118658777e-06, + "loss": 0.5778, + "step": 4027 + }, + { + "epoch": 1.069295101553166, + "grad_norm": 0.40762356706951475, + "learning_rate": 4.6332040865910975e-06, + "loss": 0.6065, + "step": 4028 + }, + { + "epoch": 1.0695606000265498, + "grad_norm": 0.4094125740556995, + "learning_rate": 4.633022012920716e-06, + "loss": 0.6325, + "step": 4029 + }, + { + "epoch": 1.0698260984999337, + "grad_norm": 0.4146056476165413, + "learning_rate": 4.632839897651182e-06, + "loss": 0.5754, + "step": 4030 + }, + { + "epoch": 1.0700915969733173, + "grad_norm": 0.40852967614332164, + "learning_rate": 4.6326577407860476e-06, + "loss": 0.565, + "step": 4031 + }, + { + "epoch": 1.0703570954467012, + "grad_norm": 0.3899578116984801, + "learning_rate": 4.632475542328866e-06, + "loss": 0.6179, + "step": 4032 + }, + { + "epoch": 1.0706225939200849, + "grad_norm": 0.397426110489581, + "learning_rate": 4.63229330228319e-06, + "loss": 0.5967, + "step": 4033 + }, + { + "epoch": 1.0708880923934687, + "grad_norm": 0.42837850860275356, + "learning_rate": 4.632111020652574e-06, + "loss": 0.5747, + "step": 4034 + }, + { + "epoch": 1.0711535908668526, + "grad_norm": 0.41237492707396645, + "learning_rate": 4.631928697440573e-06, + "loss": 0.6062, + "step": 4035 + }, + { + "epoch": 1.0714190893402362, + "grad_norm": 0.3887398665098403, + "learning_rate": 4.631746332650743e-06, + "loss": 0.5838, + "step": 4036 + }, + { + "epoch": 1.07168458781362, + "grad_norm": 0.4034091547286304, + "learning_rate": 4.631563926286639e-06, + "loss": 0.6039, + "step": 4037 + }, + { + "epoch": 1.071950086287004, + "grad_norm": 0.4087668165035704, + "learning_rate": 4.63138147835182e-06, + "loss": 0.6066, + "step": 4038 + }, + { + "epoch": 1.0722155847603876, + "grad_norm": 0.4166615103916408, + "learning_rate": 4.631198988849843e-06, + "loss": 0.6068, + "step": 4039 + }, + { + "epoch": 1.0724810832337714, + "grad_norm": 0.43581208237862434, + "learning_rate": 4.631016457784269e-06, + "loss": 0.6402, + "step": 4040 + }, + { + "epoch": 1.072746581707155, + "grad_norm": 0.4001290711025062, + "learning_rate": 4.630833885158656e-06, + "loss": 0.6372, + "step": 4041 + }, + { + "epoch": 1.073012080180539, + "grad_norm": 0.40624572138555, + "learning_rate": 4.630651270976564e-06, + "loss": 0.6018, + "step": 4042 + }, + { + "epoch": 1.0732775786539228, + "grad_norm": 0.391118087302682, + "learning_rate": 4.630468615241556e-06, + "loss": 0.5695, + "step": 4043 + }, + { + "epoch": 1.0735430771273065, + "grad_norm": 0.4027048041466175, + "learning_rate": 4.630285917957195e-06, + "loss": 0.5849, + "step": 4044 + }, + { + "epoch": 1.0738085756006903, + "grad_norm": 0.4530159977697565, + "learning_rate": 4.630103179127041e-06, + "loss": 0.5549, + "step": 4045 + }, + { + "epoch": 1.074074074074074, + "grad_norm": 0.40028759639303, + "learning_rate": 4.629920398754661e-06, + "loss": 0.5684, + "step": 4046 + }, + { + "epoch": 1.0743395725474578, + "grad_norm": 0.4210963743094575, + "learning_rate": 4.629737576843617e-06, + "loss": 0.5887, + "step": 4047 + }, + { + "epoch": 1.0746050710208417, + "grad_norm": 0.4128290360089716, + "learning_rate": 4.629554713397476e-06, + "loss": 0.5944, + "step": 4048 + }, + { + "epoch": 1.0748705694942253, + "grad_norm": 0.40629500661141016, + "learning_rate": 4.629371808419805e-06, + "loss": 0.6134, + "step": 4049 + }, + { + "epoch": 1.0751360679676092, + "grad_norm": 0.41294341940243207, + "learning_rate": 4.629188861914169e-06, + "loss": 0.619, + "step": 4050 + }, + { + "epoch": 1.075401566440993, + "grad_norm": 0.39827831627336674, + "learning_rate": 4.629005873884137e-06, + "loss": 0.6145, + "step": 4051 + }, + { + "epoch": 1.0756670649143767, + "grad_norm": 0.38773555444590707, + "learning_rate": 4.6288228443332786e-06, + "loss": 0.5963, + "step": 4052 + }, + { + "epoch": 1.0759325633877606, + "grad_norm": 0.40527077192532657, + "learning_rate": 4.628639773265162e-06, + "loss": 0.6337, + "step": 4053 + }, + { + "epoch": 1.0761980618611442, + "grad_norm": 0.3985440810812894, + "learning_rate": 4.628456660683358e-06, + "loss": 0.5574, + "step": 4054 + }, + { + "epoch": 1.076463560334528, + "grad_norm": 0.39393920952517114, + "learning_rate": 4.628273506591439e-06, + "loss": 0.5839, + "step": 4055 + }, + { + "epoch": 1.076729058807912, + "grad_norm": 0.3954995575609508, + "learning_rate": 4.628090310992975e-06, + "loss": 0.5815, + "step": 4056 + }, + { + "epoch": 1.0769945572812956, + "grad_norm": 0.4053163964827301, + "learning_rate": 4.627907073891538e-06, + "loss": 0.5851, + "step": 4057 + }, + { + "epoch": 1.0772600557546794, + "grad_norm": 0.4087608732005017, + "learning_rate": 4.627723795290704e-06, + "loss": 0.594, + "step": 4058 + }, + { + "epoch": 1.0775255542280633, + "grad_norm": 0.41284048213269314, + "learning_rate": 4.6275404751940465e-06, + "loss": 0.6287, + "step": 4059 + }, + { + "epoch": 1.077791052701447, + "grad_norm": 0.4075695186606942, + "learning_rate": 4.627357113605141e-06, + "loss": 0.5922, + "step": 4060 + }, + { + "epoch": 1.0780565511748308, + "grad_norm": 0.4081051314218715, + "learning_rate": 4.627173710527563e-06, + "loss": 0.5752, + "step": 4061 + }, + { + "epoch": 1.0783220496482144, + "grad_norm": 0.4187394365033306, + "learning_rate": 4.626990265964889e-06, + "loss": 0.5719, + "step": 4062 + }, + { + "epoch": 1.0785875481215983, + "grad_norm": 0.4112295225806692, + "learning_rate": 4.626806779920697e-06, + "loss": 0.6388, + "step": 4063 + }, + { + "epoch": 1.0788530465949822, + "grad_norm": 0.4100271543627126, + "learning_rate": 4.626623252398566e-06, + "loss": 0.6075, + "step": 4064 + }, + { + "epoch": 1.0791185450683658, + "grad_norm": 0.41631544639993207, + "learning_rate": 4.626439683402074e-06, + "loss": 0.5845, + "step": 4065 + }, + { + "epoch": 1.0793840435417497, + "grad_norm": 0.40470529300569175, + "learning_rate": 4.6262560729348025e-06, + "loss": 0.5815, + "step": 4066 + }, + { + "epoch": 1.0796495420151335, + "grad_norm": 0.3973788458301625, + "learning_rate": 4.6260724210003315e-06, + "loss": 0.6086, + "step": 4067 + }, + { + "epoch": 1.0799150404885172, + "grad_norm": 0.39142176023047, + "learning_rate": 4.6258887276022425e-06, + "loss": 0.5778, + "step": 4068 + }, + { + "epoch": 1.080180538961901, + "grad_norm": 0.40524163985892986, + "learning_rate": 4.625704992744118e-06, + "loss": 0.6152, + "step": 4069 + }, + { + "epoch": 1.0804460374352847, + "grad_norm": 0.3999464567193446, + "learning_rate": 4.625521216429543e-06, + "loss": 0.5923, + "step": 4070 + }, + { + "epoch": 1.0807115359086685, + "grad_norm": 0.40619490805430397, + "learning_rate": 4.6253373986620985e-06, + "loss": 0.5934, + "step": 4071 + }, + { + "epoch": 1.0809770343820524, + "grad_norm": 0.3931726727012223, + "learning_rate": 4.625153539445371e-06, + "loss": 0.5917, + "step": 4072 + }, + { + "epoch": 1.081242532855436, + "grad_norm": 0.3857731390731141, + "learning_rate": 4.624969638782946e-06, + "loss": 0.5842, + "step": 4073 + }, + { + "epoch": 1.08150803132882, + "grad_norm": 0.39874269463638917, + "learning_rate": 4.6247856966784115e-06, + "loss": 0.5879, + "step": 4074 + }, + { + "epoch": 1.0817735298022035, + "grad_norm": 0.40993896377612815, + "learning_rate": 4.624601713135353e-06, + "loss": 0.5853, + "step": 4075 + }, + { + "epoch": 1.0820390282755874, + "grad_norm": 0.4205507913908924, + "learning_rate": 4.624417688157359e-06, + "loss": 0.6193, + "step": 4076 + }, + { + "epoch": 1.0823045267489713, + "grad_norm": 0.39417773110000665, + "learning_rate": 4.624233621748019e-06, + "loss": 0.5884, + "step": 4077 + }, + { + "epoch": 1.082570025222355, + "grad_norm": 0.39894065437122356, + "learning_rate": 4.624049513910922e-06, + "loss": 0.6197, + "step": 4078 + }, + { + "epoch": 1.0828355236957388, + "grad_norm": 0.4062838970200478, + "learning_rate": 4.623865364649659e-06, + "loss": 0.6042, + "step": 4079 + }, + { + "epoch": 1.0831010221691226, + "grad_norm": 0.40102762554442134, + "learning_rate": 4.623681173967821e-06, + "loss": 0.5829, + "step": 4080 + }, + { + "epoch": 1.0833665206425063, + "grad_norm": 0.38647607303578524, + "learning_rate": 4.623496941869e-06, + "loss": 0.5842, + "step": 4081 + }, + { + "epoch": 1.0836320191158901, + "grad_norm": 0.4074918791365656, + "learning_rate": 4.623312668356791e-06, + "loss": 0.5584, + "step": 4082 + }, + { + "epoch": 1.0838975175892738, + "grad_norm": 0.4098707063384309, + "learning_rate": 4.623128353434785e-06, + "loss": 0.6217, + "step": 4083 + }, + { + "epoch": 1.0841630160626576, + "grad_norm": 0.3963502183367459, + "learning_rate": 4.622943997106578e-06, + "loss": 0.6107, + "step": 4084 + }, + { + "epoch": 1.0844285145360415, + "grad_norm": 0.39437021559905555, + "learning_rate": 4.6227595993757655e-06, + "loss": 0.5812, + "step": 4085 + }, + { + "epoch": 1.0846940130094251, + "grad_norm": 0.3948263729511105, + "learning_rate": 4.622575160245944e-06, + "loss": 0.5957, + "step": 4086 + }, + { + "epoch": 1.084959511482809, + "grad_norm": 0.39799456259942495, + "learning_rate": 4.622390679720709e-06, + "loss": 0.6196, + "step": 4087 + }, + { + "epoch": 1.0852250099561926, + "grad_norm": 0.4061189324122895, + "learning_rate": 4.622206157803659e-06, + "loss": 0.5821, + "step": 4088 + }, + { + "epoch": 1.0854905084295765, + "grad_norm": 0.4047046102093522, + "learning_rate": 4.622021594498393e-06, + "loss": 0.5982, + "step": 4089 + }, + { + "epoch": 1.0857560069029604, + "grad_norm": 0.3858058221957461, + "learning_rate": 4.6218369898085105e-06, + "loss": 0.6049, + "step": 4090 + }, + { + "epoch": 1.086021505376344, + "grad_norm": 0.3951353034048394, + "learning_rate": 4.621652343737611e-06, + "loss": 0.6026, + "step": 4091 + }, + { + "epoch": 1.0862870038497279, + "grad_norm": 0.39666346551425696, + "learning_rate": 4.621467656289297e-06, + "loss": 0.6027, + "step": 4092 + }, + { + "epoch": 1.0865525023231117, + "grad_norm": 0.40686510723042235, + "learning_rate": 4.621282927467169e-06, + "loss": 0.6458, + "step": 4093 + }, + { + "epoch": 1.0868180007964954, + "grad_norm": 0.4139525417350658, + "learning_rate": 4.6210981572748305e-06, + "loss": 0.6135, + "step": 4094 + }, + { + "epoch": 1.0870834992698792, + "grad_norm": 0.39562366154066525, + "learning_rate": 4.620913345715884e-06, + "loss": 0.5636, + "step": 4095 + }, + { + "epoch": 1.0873489977432629, + "grad_norm": 0.39662945248108944, + "learning_rate": 4.620728492793934e-06, + "loss": 0.5956, + "step": 4096 + }, + { + "epoch": 1.0876144962166467, + "grad_norm": 0.40064245378846836, + "learning_rate": 4.620543598512587e-06, + "loss": 0.5605, + "step": 4097 + }, + { + "epoch": 1.0878799946900306, + "grad_norm": 0.40143468481824596, + "learning_rate": 4.620358662875448e-06, + "loss": 0.6084, + "step": 4098 + }, + { + "epoch": 1.0881454931634142, + "grad_norm": 0.40287424263354066, + "learning_rate": 4.620173685886123e-06, + "loss": 0.6268, + "step": 4099 + }, + { + "epoch": 1.088410991636798, + "grad_norm": 0.4098648566606787, + "learning_rate": 4.6199886675482195e-06, + "loss": 0.5562, + "step": 4100 + }, + { + "epoch": 1.0886764901101817, + "grad_norm": 0.3864751162833137, + "learning_rate": 4.619803607865348e-06, + "loss": 0.5767, + "step": 4101 + }, + { + "epoch": 1.0889419885835656, + "grad_norm": 0.39170264086987516, + "learning_rate": 4.6196185068411145e-06, + "loss": 0.6308, + "step": 4102 + }, + { + "epoch": 1.0892074870569495, + "grad_norm": 0.3981892055421346, + "learning_rate": 4.619433364479131e-06, + "loss": 0.6012, + "step": 4103 + }, + { + "epoch": 1.0894729855303331, + "grad_norm": 0.4061562311151115, + "learning_rate": 4.619248180783007e-06, + "loss": 0.6128, + "step": 4104 + }, + { + "epoch": 1.089738484003717, + "grad_norm": 0.4023449299639812, + "learning_rate": 4.619062955756355e-06, + "loss": 0.5969, + "step": 4105 + }, + { + "epoch": 1.0900039824771008, + "grad_norm": 0.3977184108415491, + "learning_rate": 4.618877689402787e-06, + "loss": 0.5763, + "step": 4106 + }, + { + "epoch": 1.0902694809504845, + "grad_norm": 0.3961411915816232, + "learning_rate": 4.618692381725916e-06, + "loss": 0.6095, + "step": 4107 + }, + { + "epoch": 1.0905349794238683, + "grad_norm": 0.3917624753300087, + "learning_rate": 4.618507032729357e-06, + "loss": 0.6034, + "step": 4108 + }, + { + "epoch": 1.0908004778972522, + "grad_norm": 0.39639060661463604, + "learning_rate": 4.618321642416722e-06, + "loss": 0.5721, + "step": 4109 + }, + { + "epoch": 1.0910659763706358, + "grad_norm": 0.40190680241737353, + "learning_rate": 4.618136210791631e-06, + "loss": 0.5951, + "step": 4110 + }, + { + "epoch": 1.0913314748440197, + "grad_norm": 0.39548479513380613, + "learning_rate": 4.617950737857696e-06, + "loss": 0.6094, + "step": 4111 + }, + { + "epoch": 1.0915969733174034, + "grad_norm": 0.401331861874179, + "learning_rate": 4.617765223618536e-06, + "loss": 0.5679, + "step": 4112 + }, + { + "epoch": 1.0918624717907872, + "grad_norm": 0.3916152837209789, + "learning_rate": 4.6175796680777695e-06, + "loss": 0.5906, + "step": 4113 + }, + { + "epoch": 1.092127970264171, + "grad_norm": 0.4029644015801374, + "learning_rate": 4.617394071239014e-06, + "loss": 0.6126, + "step": 4114 + }, + { + "epoch": 1.0923934687375547, + "grad_norm": 0.40721124317873747, + "learning_rate": 4.617208433105891e-06, + "loss": 0.5757, + "step": 4115 + }, + { + "epoch": 1.0926589672109386, + "grad_norm": 0.3918991687289311, + "learning_rate": 4.617022753682019e-06, + "loss": 0.6086, + "step": 4116 + }, + { + "epoch": 1.0929244656843222, + "grad_norm": 0.3966487156633864, + "learning_rate": 4.61683703297102e-06, + "loss": 0.594, + "step": 4117 + }, + { + "epoch": 1.093189964157706, + "grad_norm": 0.4056797273915033, + "learning_rate": 4.616651270976516e-06, + "loss": 0.5956, + "step": 4118 + }, + { + "epoch": 1.09345546263109, + "grad_norm": 0.3897052293265004, + "learning_rate": 4.61646546770213e-06, + "loss": 0.5925, + "step": 4119 + }, + { + "epoch": 1.0937209611044736, + "grad_norm": 0.38918292541649774, + "learning_rate": 4.616279623151485e-06, + "loss": 0.5926, + "step": 4120 + }, + { + "epoch": 1.0939864595778575, + "grad_norm": 0.3915180438517737, + "learning_rate": 4.616093737328206e-06, + "loss": 0.6066, + "step": 4121 + }, + { + "epoch": 1.0942519580512413, + "grad_norm": 0.4032178103254784, + "learning_rate": 4.615907810235918e-06, + "loss": 0.5918, + "step": 4122 + }, + { + "epoch": 1.094517456524625, + "grad_norm": 0.40296944404598123, + "learning_rate": 4.615721841878247e-06, + "loss": 0.5664, + "step": 4123 + }, + { + "epoch": 1.0947829549980088, + "grad_norm": 0.4109243861097142, + "learning_rate": 4.61553583225882e-06, + "loss": 0.6136, + "step": 4124 + }, + { + "epoch": 1.0950484534713925, + "grad_norm": 0.4008234294995445, + "learning_rate": 4.6153497813812645e-06, + "loss": 0.6106, + "step": 4125 + }, + { + "epoch": 1.0953139519447763, + "grad_norm": 0.4075644704495962, + "learning_rate": 4.615163689249209e-06, + "loss": 0.5858, + "step": 4126 + }, + { + "epoch": 1.0955794504181602, + "grad_norm": 0.47906943921243966, + "learning_rate": 4.614977555866283e-06, + "loss": 0.5796, + "step": 4127 + }, + { + "epoch": 1.0958449488915438, + "grad_norm": 0.401882044679437, + "learning_rate": 4.614791381236116e-06, + "loss": 0.6101, + "step": 4128 + }, + { + "epoch": 1.0961104473649277, + "grad_norm": 0.4225134550070061, + "learning_rate": 4.614605165362339e-06, + "loss": 0.5785, + "step": 4129 + }, + { + "epoch": 1.0963759458383113, + "grad_norm": 0.41874741040074764, + "learning_rate": 4.6144189082485845e-06, + "loss": 0.6114, + "step": 4130 + }, + { + "epoch": 1.0966414443116952, + "grad_norm": 0.41127132375103914, + "learning_rate": 4.614232609898483e-06, + "loss": 0.6138, + "step": 4131 + }, + { + "epoch": 1.096906942785079, + "grad_norm": 0.40405057586061777, + "learning_rate": 4.614046270315671e-06, + "loss": 0.607, + "step": 4132 + }, + { + "epoch": 1.0971724412584627, + "grad_norm": 0.3952100579595354, + "learning_rate": 4.613859889503779e-06, + "loss": 0.5574, + "step": 4133 + }, + { + "epoch": 1.0974379397318466, + "grad_norm": 0.4123951325547869, + "learning_rate": 4.613673467466444e-06, + "loss": 0.6101, + "step": 4134 + }, + { + "epoch": 1.0977034382052304, + "grad_norm": 0.42640950665596883, + "learning_rate": 4.613487004207302e-06, + "loss": 0.6331, + "step": 4135 + }, + { + "epoch": 1.097968936678614, + "grad_norm": 0.42120442898790655, + "learning_rate": 4.613300499729988e-06, + "loss": 0.598, + "step": 4136 + }, + { + "epoch": 1.098234435151998, + "grad_norm": 0.3941650373034087, + "learning_rate": 4.61311395403814e-06, + "loss": 0.5913, + "step": 4137 + }, + { + "epoch": 1.0984999336253816, + "grad_norm": 0.4280069565805221, + "learning_rate": 4.612927367135396e-06, + "loss": 0.6019, + "step": 4138 + }, + { + "epoch": 1.0987654320987654, + "grad_norm": 0.3945544039784049, + "learning_rate": 4.6127407390253944e-06, + "loss": 0.5817, + "step": 4139 + }, + { + "epoch": 1.0990309305721493, + "grad_norm": 0.3899870461460316, + "learning_rate": 4.612554069711777e-06, + "loss": 0.5725, + "step": 4140 + }, + { + "epoch": 1.099296429045533, + "grad_norm": 0.40030549711494107, + "learning_rate": 4.612367359198182e-06, + "loss": 0.6193, + "step": 4141 + }, + { + "epoch": 1.0995619275189168, + "grad_norm": 0.4184377233830072, + "learning_rate": 4.612180607488251e-06, + "loss": 0.6379, + "step": 4142 + }, + { + "epoch": 1.0998274259923004, + "grad_norm": 0.4247987828366459, + "learning_rate": 4.611993814585628e-06, + "loss": 0.5804, + "step": 4143 + }, + { + "epoch": 1.1000929244656843, + "grad_norm": 0.3950831529379711, + "learning_rate": 4.611806980493954e-06, + "loss": 0.6401, + "step": 4144 + }, + { + "epoch": 1.1003584229390682, + "grad_norm": 0.4384869221097367, + "learning_rate": 4.611620105216872e-06, + "loss": 0.5582, + "step": 4145 + }, + { + "epoch": 1.1006239214124518, + "grad_norm": 0.40799503340071896, + "learning_rate": 4.611433188758029e-06, + "loss": 0.579, + "step": 4146 + }, + { + "epoch": 1.1008894198858357, + "grad_norm": 0.4006083766317225, + "learning_rate": 4.6112462311210695e-06, + "loss": 0.6246, + "step": 4147 + }, + { + "epoch": 1.1011549183592195, + "grad_norm": 0.40156418204366257, + "learning_rate": 4.611059232309639e-06, + "loss": 0.6143, + "step": 4148 + }, + { + "epoch": 1.1014204168326032, + "grad_norm": 0.39776408041954836, + "learning_rate": 4.610872192327385e-06, + "loss": 0.5993, + "step": 4149 + }, + { + "epoch": 1.101685915305987, + "grad_norm": 0.41781668304688074, + "learning_rate": 4.610685111177954e-06, + "loss": 0.566, + "step": 4150 + }, + { + "epoch": 1.1019514137793707, + "grad_norm": 0.40190497953337084, + "learning_rate": 4.610497988864996e-06, + "loss": 0.6125, + "step": 4151 + }, + { + "epoch": 1.1022169122527545, + "grad_norm": 0.393687751210559, + "learning_rate": 4.610310825392159e-06, + "loss": 0.5782, + "step": 4152 + }, + { + "epoch": 1.1024824107261384, + "grad_norm": 0.39370126201878136, + "learning_rate": 4.610123620763095e-06, + "loss": 0.5889, + "step": 4153 + }, + { + "epoch": 1.102747909199522, + "grad_norm": 0.4067620143883082, + "learning_rate": 4.609936374981454e-06, + "loss": 0.6114, + "step": 4154 + }, + { + "epoch": 1.103013407672906, + "grad_norm": 0.40541603036720636, + "learning_rate": 4.6097490880508875e-06, + "loss": 0.5635, + "step": 4155 + }, + { + "epoch": 1.1032789061462898, + "grad_norm": 0.40218555063238504, + "learning_rate": 4.609561759975047e-06, + "loss": 0.6061, + "step": 4156 + }, + { + "epoch": 1.1035444046196734, + "grad_norm": 0.43697665314011647, + "learning_rate": 4.609374390757589e-06, + "loss": 0.6246, + "step": 4157 + }, + { + "epoch": 1.1038099030930573, + "grad_norm": 0.4184297102929878, + "learning_rate": 4.609186980402165e-06, + "loss": 0.5659, + "step": 4158 + }, + { + "epoch": 1.104075401566441, + "grad_norm": 0.4053245845440966, + "learning_rate": 4.60899952891243e-06, + "loss": 0.6146, + "step": 4159 + }, + { + "epoch": 1.1043409000398248, + "grad_norm": 0.38890252961138627, + "learning_rate": 4.6088120362920416e-06, + "loss": 0.5911, + "step": 4160 + }, + { + "epoch": 1.1046063985132086, + "grad_norm": 0.40299281327320086, + "learning_rate": 4.6086245025446544e-06, + "loss": 0.5902, + "step": 4161 + }, + { + "epoch": 1.1048718969865923, + "grad_norm": 0.38227546403827056, + "learning_rate": 4.608436927673927e-06, + "loss": 0.6026, + "step": 4162 + }, + { + "epoch": 1.1051373954599761, + "grad_norm": 0.40271843035801363, + "learning_rate": 4.608249311683517e-06, + "loss": 0.58, + "step": 4163 + }, + { + "epoch": 1.10540289393336, + "grad_norm": 0.39399921190873083, + "learning_rate": 4.608061654577084e-06, + "loss": 0.6008, + "step": 4164 + }, + { + "epoch": 1.1056683924067436, + "grad_norm": 0.3894925190695123, + "learning_rate": 4.607873956358286e-06, + "loss": 0.6032, + "step": 4165 + }, + { + "epoch": 1.1059338908801275, + "grad_norm": 0.4099472601363349, + "learning_rate": 4.6076862170307865e-06, + "loss": 0.6161, + "step": 4166 + }, + { + "epoch": 1.1061993893535111, + "grad_norm": 0.4024169834283145, + "learning_rate": 4.607498436598244e-06, + "loss": 0.5597, + "step": 4167 + }, + { + "epoch": 1.106464887826895, + "grad_norm": 0.38204717576175373, + "learning_rate": 4.607310615064323e-06, + "loss": 0.5775, + "step": 4168 + }, + { + "epoch": 1.1067303863002789, + "grad_norm": 0.3989533946769941, + "learning_rate": 4.607122752432685e-06, + "loss": 0.6101, + "step": 4169 + }, + { + "epoch": 1.1069958847736625, + "grad_norm": 0.3876225944596314, + "learning_rate": 4.606934848706994e-06, + "loss": 0.564, + "step": 4170 + }, + { + "epoch": 1.1072613832470464, + "grad_norm": 0.4050181428459777, + "learning_rate": 4.606746903890914e-06, + "loss": 0.5484, + "step": 4171 + }, + { + "epoch": 1.10752688172043, + "grad_norm": 0.39445860470503064, + "learning_rate": 4.606558917988113e-06, + "loss": 0.5662, + "step": 4172 + }, + { + "epoch": 1.1077923801938139, + "grad_norm": 0.4192376845576687, + "learning_rate": 4.606370891002254e-06, + "loss": 0.6145, + "step": 4173 + }, + { + "epoch": 1.1080578786671977, + "grad_norm": 0.40777181982949945, + "learning_rate": 4.606182822937006e-06, + "loss": 0.5729, + "step": 4174 + }, + { + "epoch": 1.1083233771405814, + "grad_norm": 0.40447915141031576, + "learning_rate": 4.605994713796036e-06, + "loss": 0.6128, + "step": 4175 + }, + { + "epoch": 1.1085888756139652, + "grad_norm": 0.40851286920143803, + "learning_rate": 4.605806563583012e-06, + "loss": 0.6629, + "step": 4176 + }, + { + "epoch": 1.108854374087349, + "grad_norm": 0.43285146242118144, + "learning_rate": 4.605618372301605e-06, + "loss": 0.5715, + "step": 4177 + }, + { + "epoch": 1.1091198725607327, + "grad_norm": 0.42470963243260174, + "learning_rate": 4.605430139955484e-06, + "loss": 0.6126, + "step": 4178 + }, + { + "epoch": 1.1093853710341166, + "grad_norm": 0.411138195282551, + "learning_rate": 4.6052418665483204e-06, + "loss": 0.5907, + "step": 4179 + }, + { + "epoch": 1.1096508695075002, + "grad_norm": 0.40846999919174365, + "learning_rate": 4.605053552083786e-06, + "loss": 0.6031, + "step": 4180 + }, + { + "epoch": 1.109916367980884, + "grad_norm": 0.4038190181208699, + "learning_rate": 4.604865196565553e-06, + "loss": 0.5914, + "step": 4181 + }, + { + "epoch": 1.110181866454268, + "grad_norm": 0.41855550063869873, + "learning_rate": 4.604676799997295e-06, + "loss": 0.609, + "step": 4182 + }, + { + "epoch": 1.1104473649276516, + "grad_norm": 0.39641413680209786, + "learning_rate": 4.6044883623826866e-06, + "loss": 0.5857, + "step": 4183 + }, + { + "epoch": 1.1107128634010355, + "grad_norm": 0.3887209537307622, + "learning_rate": 4.604299883725401e-06, + "loss": 0.6112, + "step": 4184 + }, + { + "epoch": 1.1109783618744191, + "grad_norm": 0.3997013232689751, + "learning_rate": 4.604111364029118e-06, + "loss": 0.5903, + "step": 4185 + }, + { + "epoch": 1.111243860347803, + "grad_norm": 0.4085127662316873, + "learning_rate": 4.60392280329751e-06, + "loss": 0.6012, + "step": 4186 + }, + { + "epoch": 1.1115093588211868, + "grad_norm": 0.41870204612786327, + "learning_rate": 4.603734201534257e-06, + "loss": 0.6106, + "step": 4187 + }, + { + "epoch": 1.1117748572945705, + "grad_norm": 0.4047874010785604, + "learning_rate": 4.603545558743035e-06, + "loss": 0.611, + "step": 4188 + }, + { + "epoch": 1.1120403557679543, + "grad_norm": 0.4086861990194707, + "learning_rate": 4.603356874927525e-06, + "loss": 0.578, + "step": 4189 + }, + { + "epoch": 1.1123058542413382, + "grad_norm": 0.4185807284404185, + "learning_rate": 4.603168150091406e-06, + "loss": 0.634, + "step": 4190 + }, + { + "epoch": 1.1125713527147219, + "grad_norm": 0.4091818793904963, + "learning_rate": 4.602979384238359e-06, + "loss": 0.5847, + "step": 4191 + }, + { + "epoch": 1.1128368511881057, + "grad_norm": 0.4168123995392763, + "learning_rate": 4.602790577372064e-06, + "loss": 0.5905, + "step": 4192 + }, + { + "epoch": 1.1131023496614894, + "grad_norm": 0.4003934481060536, + "learning_rate": 4.602601729496204e-06, + "loss": 0.6077, + "step": 4193 + }, + { + "epoch": 1.1133678481348732, + "grad_norm": 0.4031037033735679, + "learning_rate": 4.602412840614463e-06, + "loss": 0.5864, + "step": 4194 + }, + { + "epoch": 1.113633346608257, + "grad_norm": 0.40682334686164157, + "learning_rate": 4.602223910730525e-06, + "loss": 0.5964, + "step": 4195 + }, + { + "epoch": 1.1138988450816407, + "grad_norm": 0.4145387287475848, + "learning_rate": 4.602034939848072e-06, + "loss": 0.6053, + "step": 4196 + }, + { + "epoch": 1.1141643435550246, + "grad_norm": 0.40240899343747966, + "learning_rate": 4.601845927970792e-06, + "loss": 0.587, + "step": 4197 + }, + { + "epoch": 1.1144298420284082, + "grad_norm": 0.40101775629577413, + "learning_rate": 4.60165687510237e-06, + "loss": 0.5692, + "step": 4198 + }, + { + "epoch": 1.114695340501792, + "grad_norm": 0.3935512473913428, + "learning_rate": 4.601467781246492e-06, + "loss": 0.5788, + "step": 4199 + }, + { + "epoch": 1.114960838975176, + "grad_norm": 0.4044175962249614, + "learning_rate": 4.601278646406849e-06, + "loss": 0.5709, + "step": 4200 + }, + { + "epoch": 1.1152263374485596, + "grad_norm": 0.42054058538944394, + "learning_rate": 4.601089470587126e-06, + "loss": 0.5743, + "step": 4201 + }, + { + "epoch": 1.1154918359219435, + "grad_norm": 0.41230296755811063, + "learning_rate": 4.600900253791015e-06, + "loss": 0.631, + "step": 4202 + }, + { + "epoch": 1.1157573343953273, + "grad_norm": 0.40862673492388485, + "learning_rate": 4.6007109960222045e-06, + "loss": 0.5977, + "step": 4203 + }, + { + "epoch": 1.116022832868711, + "grad_norm": 0.3962651528454553, + "learning_rate": 4.600521697284387e-06, + "loss": 0.5681, + "step": 4204 + }, + { + "epoch": 1.1162883313420948, + "grad_norm": 0.41969620413148845, + "learning_rate": 4.600332357581252e-06, + "loss": 0.6299, + "step": 4205 + }, + { + "epoch": 1.1165538298154785, + "grad_norm": 0.41104348682358244, + "learning_rate": 4.600142976916494e-06, + "loss": 0.6239, + "step": 4206 + }, + { + "epoch": 1.1168193282888623, + "grad_norm": 0.39792093294818626, + "learning_rate": 4.599953555293807e-06, + "loss": 0.614, + "step": 4207 + }, + { + "epoch": 1.1170848267622462, + "grad_norm": 0.41806139008693155, + "learning_rate": 4.599764092716883e-06, + "loss": 0.588, + "step": 4208 + }, + { + "epoch": 1.1173503252356298, + "grad_norm": 0.4290109737688029, + "learning_rate": 4.599574589189419e-06, + "loss": 0.5726, + "step": 4209 + }, + { + "epoch": 1.1176158237090137, + "grad_norm": 0.40931412337661655, + "learning_rate": 4.599385044715109e-06, + "loss": 0.5904, + "step": 4210 + }, + { + "epoch": 1.1178813221823976, + "grad_norm": 0.41267087407676684, + "learning_rate": 4.5991954592976504e-06, + "loss": 0.5555, + "step": 4211 + }, + { + "epoch": 1.1181468206557812, + "grad_norm": 0.395245460021575, + "learning_rate": 4.5990058329407405e-06, + "loss": 0.6214, + "step": 4212 + }, + { + "epoch": 1.118412319129165, + "grad_norm": 0.40110320470953037, + "learning_rate": 4.5988161656480785e-06, + "loss": 0.6143, + "step": 4213 + }, + { + "epoch": 1.1186778176025487, + "grad_norm": 0.3890728206523804, + "learning_rate": 4.5986264574233615e-06, + "loss": 0.6062, + "step": 4214 + }, + { + "epoch": 1.1189433160759326, + "grad_norm": 0.3981589273212458, + "learning_rate": 4.598436708270291e-06, + "loss": 0.6165, + "step": 4215 + }, + { + "epoch": 1.1192088145493164, + "grad_norm": 0.40279097620937515, + "learning_rate": 4.5982469181925675e-06, + "loss": 0.6039, + "step": 4216 + }, + { + "epoch": 1.1194743130227, + "grad_norm": 0.402564655796646, + "learning_rate": 4.59805708719389e-06, + "loss": 0.6026, + "step": 4217 + }, + { + "epoch": 1.119739811496084, + "grad_norm": 0.3979570934000749, + "learning_rate": 4.597867215277964e-06, + "loss": 0.6059, + "step": 4218 + }, + { + "epoch": 1.1200053099694678, + "grad_norm": 0.4145037547485939, + "learning_rate": 4.59767730244849e-06, + "loss": 0.614, + "step": 4219 + }, + { + "epoch": 1.1202708084428514, + "grad_norm": 0.4095470127521354, + "learning_rate": 4.597487348709172e-06, + "loss": 0.556, + "step": 4220 + }, + { + "epoch": 1.1205363069162353, + "grad_norm": 0.398477200715919, + "learning_rate": 4.597297354063716e-06, + "loss": 0.5985, + "step": 4221 + }, + { + "epoch": 1.120801805389619, + "grad_norm": 0.4088072729297395, + "learning_rate": 4.597107318515826e-06, + "loss": 0.6221, + "step": 4222 + }, + { + "epoch": 1.1210673038630028, + "grad_norm": 0.3893122622904414, + "learning_rate": 4.596917242069209e-06, + "loss": 0.5848, + "step": 4223 + }, + { + "epoch": 1.1213328023363867, + "grad_norm": 0.4255312349984939, + "learning_rate": 4.596727124727572e-06, + "loss": 0.5795, + "step": 4224 + }, + { + "epoch": 1.1215983008097703, + "grad_norm": 0.40142865884755996, + "learning_rate": 4.596536966494622e-06, + "loss": 0.5786, + "step": 4225 + }, + { + "epoch": 1.1218637992831542, + "grad_norm": 0.40047260435012183, + "learning_rate": 4.596346767374068e-06, + "loss": 0.5806, + "step": 4226 + }, + { + "epoch": 1.1221292977565378, + "grad_norm": 0.42233976959533925, + "learning_rate": 4.5961565273696194e-06, + "loss": 0.6002, + "step": 4227 + }, + { + "epoch": 1.1223947962299217, + "grad_norm": 0.40616003543509005, + "learning_rate": 4.595966246484986e-06, + "loss": 0.5399, + "step": 4228 + }, + { + "epoch": 1.1226602947033055, + "grad_norm": 0.4098600200834036, + "learning_rate": 4.595775924723879e-06, + "loss": 0.6076, + "step": 4229 + }, + { + "epoch": 1.1229257931766892, + "grad_norm": 0.4134511063073729, + "learning_rate": 4.59558556209001e-06, + "loss": 0.5387, + "step": 4230 + }, + { + "epoch": 1.123191291650073, + "grad_norm": 0.38366695581028937, + "learning_rate": 4.595395158587093e-06, + "loss": 0.6061, + "step": 4231 + }, + { + "epoch": 1.123456790123457, + "grad_norm": 0.4034998142744694, + "learning_rate": 4.595204714218838e-06, + "loss": 0.6211, + "step": 4232 + }, + { + "epoch": 1.1237222885968405, + "grad_norm": 0.3951363817294016, + "learning_rate": 4.595014228988962e-06, + "loss": 0.5799, + "step": 4233 + }, + { + "epoch": 1.1239877870702244, + "grad_norm": 0.3979057271878824, + "learning_rate": 4.594823702901179e-06, + "loss": 0.5857, + "step": 4234 + }, + { + "epoch": 1.124253285543608, + "grad_norm": 0.38888340991930065, + "learning_rate": 4.594633135959205e-06, + "loss": 0.6034, + "step": 4235 + }, + { + "epoch": 1.124518784016992, + "grad_norm": 0.3955025703873869, + "learning_rate": 4.594442528166756e-06, + "loss": 0.5922, + "step": 4236 + }, + { + "epoch": 1.1247842824903758, + "grad_norm": 0.39095324444771157, + "learning_rate": 4.594251879527551e-06, + "loss": 0.5933, + "step": 4237 + }, + { + "epoch": 1.1250497809637594, + "grad_norm": 0.41100452779958446, + "learning_rate": 4.594061190045305e-06, + "loss": 0.6418, + "step": 4238 + }, + { + "epoch": 1.1253152794371433, + "grad_norm": 0.4002378528278809, + "learning_rate": 4.593870459723738e-06, + "loss": 0.5866, + "step": 4239 + }, + { + "epoch": 1.125580777910527, + "grad_norm": 0.41591673394122, + "learning_rate": 4.5936796885665725e-06, + "loss": 0.5953, + "step": 4240 + }, + { + "epoch": 1.1258462763839108, + "grad_norm": 0.4015448820218692, + "learning_rate": 4.593488876577526e-06, + "loss": 0.6108, + "step": 4241 + }, + { + "epoch": 1.1261117748572946, + "grad_norm": 0.3976611257233658, + "learning_rate": 4.5932980237603195e-06, + "loss": 0.586, + "step": 4242 + }, + { + "epoch": 1.1263772733306783, + "grad_norm": 0.3977989655418618, + "learning_rate": 4.593107130118678e-06, + "loss": 0.5393, + "step": 4243 + }, + { + "epoch": 1.1266427718040621, + "grad_norm": 0.4022883236779997, + "learning_rate": 4.592916195656322e-06, + "loss": 0.6202, + "step": 4244 + }, + { + "epoch": 1.126908270277446, + "grad_norm": 0.3966396273278027, + "learning_rate": 4.592725220376976e-06, + "loss": 0.614, + "step": 4245 + }, + { + "epoch": 1.1271737687508296, + "grad_norm": 0.39230237441538535, + "learning_rate": 4.592534204284364e-06, + "loss": 0.5658, + "step": 4246 + }, + { + "epoch": 1.1274392672242135, + "grad_norm": 0.40309995569915774, + "learning_rate": 4.592343147382211e-06, + "loss": 0.6273, + "step": 4247 + }, + { + "epoch": 1.1277047656975971, + "grad_norm": 0.39282030743083224, + "learning_rate": 4.592152049674245e-06, + "loss": 0.6398, + "step": 4248 + }, + { + "epoch": 1.127970264170981, + "grad_norm": 0.39215854767023556, + "learning_rate": 4.59196091116419e-06, + "loss": 0.5592, + "step": 4249 + }, + { + "epoch": 1.1282357626443649, + "grad_norm": 0.4024622746384169, + "learning_rate": 4.591769731855777e-06, + "loss": 0.6334, + "step": 4250 + }, + { + "epoch": 1.1285012611177485, + "grad_norm": 0.39404603695945745, + "learning_rate": 4.591578511752733e-06, + "loss": 0.5959, + "step": 4251 + }, + { + "epoch": 1.1287667595911324, + "grad_norm": 0.40931903864851577, + "learning_rate": 4.591387250858785e-06, + "loss": 0.6112, + "step": 4252 + }, + { + "epoch": 1.129032258064516, + "grad_norm": 0.4150392582605503, + "learning_rate": 4.591195949177666e-06, + "loss": 0.6164, + "step": 4253 + }, + { + "epoch": 1.1292977565378999, + "grad_norm": 0.4067786036855695, + "learning_rate": 4.591004606713105e-06, + "loss": 0.6024, + "step": 4254 + }, + { + "epoch": 1.1295632550112837, + "grad_norm": 0.40194099726084326, + "learning_rate": 4.590813223468836e-06, + "loss": 0.6233, + "step": 4255 + }, + { + "epoch": 1.1298287534846674, + "grad_norm": 0.3902873137041122, + "learning_rate": 4.590621799448589e-06, + "loss": 0.5662, + "step": 4256 + }, + { + "epoch": 1.1300942519580512, + "grad_norm": 0.429037891563439, + "learning_rate": 4.5904303346560984e-06, + "loss": 0.6001, + "step": 4257 + }, + { + "epoch": 1.130359750431435, + "grad_norm": 0.40232873097465577, + "learning_rate": 4.590238829095098e-06, + "loss": 0.5579, + "step": 4258 + }, + { + "epoch": 1.1306252489048187, + "grad_norm": 0.3838087291615794, + "learning_rate": 4.590047282769322e-06, + "loss": 0.5901, + "step": 4259 + }, + { + "epoch": 1.1308907473782026, + "grad_norm": 0.4049598737557053, + "learning_rate": 4.589855695682508e-06, + "loss": 0.602, + "step": 4260 + }, + { + "epoch": 1.1311562458515865, + "grad_norm": 0.40249650479098137, + "learning_rate": 4.5896640678383895e-06, + "loss": 0.5591, + "step": 4261 + }, + { + "epoch": 1.1314217443249701, + "grad_norm": 0.3848978703388012, + "learning_rate": 4.589472399240707e-06, + "loss": 0.5842, + "step": 4262 + }, + { + "epoch": 1.131687242798354, + "grad_norm": 0.4147613415923532, + "learning_rate": 4.589280689893195e-06, + "loss": 0.5776, + "step": 4263 + }, + { + "epoch": 1.1319527412717376, + "grad_norm": 0.4241405181007222, + "learning_rate": 4.589088939799596e-06, + "loss": 0.5243, + "step": 4264 + }, + { + "epoch": 1.1322182397451215, + "grad_norm": 0.4002949365799512, + "learning_rate": 4.588897148963647e-06, + "loss": 0.5627, + "step": 4265 + }, + { + "epoch": 1.1324837382185051, + "grad_norm": 0.4269931837453073, + "learning_rate": 4.588705317389088e-06, + "loss": 0.6095, + "step": 4266 + }, + { + "epoch": 1.132749236691889, + "grad_norm": 0.41039316719331886, + "learning_rate": 4.588513445079663e-06, + "loss": 0.6113, + "step": 4267 + }, + { + "epoch": 1.1330147351652728, + "grad_norm": 0.403715450210771, + "learning_rate": 4.588321532039111e-06, + "loss": 0.6006, + "step": 4268 + }, + { + "epoch": 1.1332802336386565, + "grad_norm": 0.40248862211140435, + "learning_rate": 4.5881295782711764e-06, + "loss": 0.593, + "step": 4269 + }, + { + "epoch": 1.1335457321120403, + "grad_norm": 0.4179157629800408, + "learning_rate": 4.5879375837796026e-06, + "loss": 0.6275, + "step": 4270 + }, + { + "epoch": 1.1338112305854242, + "grad_norm": 0.3936150303902074, + "learning_rate": 4.5877455485681336e-06, + "loss": 0.6174, + "step": 4271 + }, + { + "epoch": 1.1340767290588079, + "grad_norm": 0.4191352127077562, + "learning_rate": 4.587553472640515e-06, + "loss": 0.6029, + "step": 4272 + }, + { + "epoch": 1.1343422275321917, + "grad_norm": 0.39581654999444527, + "learning_rate": 4.5873613560004925e-06, + "loss": 0.6259, + "step": 4273 + }, + { + "epoch": 1.1346077260055756, + "grad_norm": 0.40456689687387054, + "learning_rate": 4.5871691986518115e-06, + "loss": 0.5986, + "step": 4274 + }, + { + "epoch": 1.1348732244789592, + "grad_norm": 0.395604897033609, + "learning_rate": 4.586977000598223e-06, + "loss": 0.5928, + "step": 4275 + }, + { + "epoch": 1.135138722952343, + "grad_norm": 0.40476824186348964, + "learning_rate": 4.586784761843471e-06, + "loss": 0.5922, + "step": 4276 + }, + { + "epoch": 1.1354042214257267, + "grad_norm": 0.3995343438607479, + "learning_rate": 4.586592482391309e-06, + "loss": 0.5745, + "step": 4277 + }, + { + "epoch": 1.1356697198991106, + "grad_norm": 0.41523553225996046, + "learning_rate": 4.586400162245483e-06, + "loss": 0.6398, + "step": 4278 + }, + { + "epoch": 1.1359352183724944, + "grad_norm": 0.39780621096452673, + "learning_rate": 4.586207801409746e-06, + "loss": 0.5837, + "step": 4279 + }, + { + "epoch": 1.136200716845878, + "grad_norm": 0.40245607014516194, + "learning_rate": 4.586015399887849e-06, + "loss": 0.5653, + "step": 4280 + }, + { + "epoch": 1.136466215319262, + "grad_norm": 0.4074364049507209, + "learning_rate": 4.585822957683544e-06, + "loss": 0.5964, + "step": 4281 + }, + { + "epoch": 1.1367317137926456, + "grad_norm": 0.4288557939725421, + "learning_rate": 4.585630474800585e-06, + "loss": 0.6124, + "step": 4282 + }, + { + "epoch": 1.1369972122660295, + "grad_norm": 0.41661816147268377, + "learning_rate": 4.585437951242725e-06, + "loss": 0.5748, + "step": 4283 + }, + { + "epoch": 1.1372627107394133, + "grad_norm": 0.44452198381336505, + "learning_rate": 4.585245387013719e-06, + "loss": 0.6009, + "step": 4284 + }, + { + "epoch": 1.137528209212797, + "grad_norm": 0.4003785475367137, + "learning_rate": 4.585052782117322e-06, + "loss": 0.5803, + "step": 4285 + }, + { + "epoch": 1.1377937076861808, + "grad_norm": 0.40938385164890345, + "learning_rate": 4.584860136557291e-06, + "loss": 0.5789, + "step": 4286 + }, + { + "epoch": 1.1380592061595647, + "grad_norm": 0.39615381997230703, + "learning_rate": 4.584667450337382e-06, + "loss": 0.5914, + "step": 4287 + }, + { + "epoch": 1.1383247046329483, + "grad_norm": 0.40140944385360927, + "learning_rate": 4.584474723461355e-06, + "loss": 0.5996, + "step": 4288 + }, + { + "epoch": 1.1385902031063322, + "grad_norm": 0.3964996407844935, + "learning_rate": 4.5842819559329655e-06, + "loss": 0.5643, + "step": 4289 + }, + { + "epoch": 1.1388557015797158, + "grad_norm": 0.39738695020501213, + "learning_rate": 4.584089147755976e-06, + "loss": 0.5917, + "step": 4290 + }, + { + "epoch": 1.1391212000530997, + "grad_norm": 0.3954365491929587, + "learning_rate": 4.5838962989341445e-06, + "loss": 0.5658, + "step": 4291 + }, + { + "epoch": 1.1393866985264836, + "grad_norm": 0.3827228472520975, + "learning_rate": 4.583703409471234e-06, + "loss": 0.5762, + "step": 4292 + }, + { + "epoch": 1.1396521969998672, + "grad_norm": 0.3934912099010245, + "learning_rate": 4.583510479371004e-06, + "loss": 0.5857, + "step": 4293 + }, + { + "epoch": 1.139917695473251, + "grad_norm": 0.3983600647238615, + "learning_rate": 4.583317508637219e-06, + "loss": 0.6211, + "step": 4294 + }, + { + "epoch": 1.1401831939466347, + "grad_norm": 0.40078534572319097, + "learning_rate": 4.583124497273641e-06, + "loss": 0.6402, + "step": 4295 + }, + { + "epoch": 1.1404486924200186, + "grad_norm": 0.41170336961598936, + "learning_rate": 4.582931445284037e-06, + "loss": 0.6171, + "step": 4296 + }, + { + "epoch": 1.1407141908934024, + "grad_norm": 0.3994540264969489, + "learning_rate": 4.582738352672168e-06, + "loss": 0.5929, + "step": 4297 + }, + { + "epoch": 1.140979689366786, + "grad_norm": 0.40304779540207886, + "learning_rate": 4.582545219441802e-06, + "loss": 0.5906, + "step": 4298 + }, + { + "epoch": 1.14124518784017, + "grad_norm": 0.4118781115665021, + "learning_rate": 4.582352045596705e-06, + "loss": 0.5615, + "step": 4299 + }, + { + "epoch": 1.1415106863135538, + "grad_norm": 0.4073933136467304, + "learning_rate": 4.582158831140645e-06, + "loss": 0.5571, + "step": 4300 + }, + { + "epoch": 1.1417761847869374, + "grad_norm": 0.4191341167443241, + "learning_rate": 4.58196557607739e-06, + "loss": 0.5872, + "step": 4301 + }, + { + "epoch": 1.1420416832603213, + "grad_norm": 0.40363087802261094, + "learning_rate": 4.581772280410709e-06, + "loss": 0.5876, + "step": 4302 + }, + { + "epoch": 1.142307181733705, + "grad_norm": 0.40088202686285845, + "learning_rate": 4.581578944144371e-06, + "loss": 0.5992, + "step": 4303 + }, + { + "epoch": 1.1425726802070888, + "grad_norm": 0.39802560390802516, + "learning_rate": 4.581385567282147e-06, + "loss": 0.5569, + "step": 4304 + }, + { + "epoch": 1.1428381786804727, + "grad_norm": 0.4385536958866902, + "learning_rate": 4.581192149827808e-06, + "loss": 0.6067, + "step": 4305 + }, + { + "epoch": 1.1431036771538563, + "grad_norm": 0.39831625863198367, + "learning_rate": 4.580998691785127e-06, + "loss": 0.5754, + "step": 4306 + }, + { + "epoch": 1.1433691756272402, + "grad_norm": 0.40644696416310405, + "learning_rate": 4.580805193157876e-06, + "loss": 0.6019, + "step": 4307 + }, + { + "epoch": 1.1436346741006238, + "grad_norm": 0.4108739499394583, + "learning_rate": 4.580611653949829e-06, + "loss": 0.5787, + "step": 4308 + }, + { + "epoch": 1.1439001725740077, + "grad_norm": 0.41938891674170853, + "learning_rate": 4.580418074164761e-06, + "loss": 0.5959, + "step": 4309 + }, + { + "epoch": 1.1441656710473915, + "grad_norm": 0.4124085100535883, + "learning_rate": 4.580224453806447e-06, + "loss": 0.578, + "step": 4310 + }, + { + "epoch": 1.1444311695207752, + "grad_norm": 0.4181290215766447, + "learning_rate": 4.580030792878662e-06, + "loss": 0.6051, + "step": 4311 + }, + { + "epoch": 1.144696667994159, + "grad_norm": 0.39095242675070696, + "learning_rate": 4.5798370913851846e-06, + "loss": 0.5811, + "step": 4312 + }, + { + "epoch": 1.144962166467543, + "grad_norm": 0.4033025462178715, + "learning_rate": 4.579643349329791e-06, + "loss": 0.5924, + "step": 4313 + }, + { + "epoch": 1.1452276649409265, + "grad_norm": 0.39758849391784024, + "learning_rate": 4.5794495667162605e-06, + "loss": 0.617, + "step": 4314 + }, + { + "epoch": 1.1454931634143104, + "grad_norm": 0.40855575100791214, + "learning_rate": 4.579255743548372e-06, + "loss": 0.5881, + "step": 4315 + }, + { + "epoch": 1.1457586618876943, + "grad_norm": 0.4060475760244103, + "learning_rate": 4.579061879829906e-06, + "loss": 0.5855, + "step": 4316 + }, + { + "epoch": 1.146024160361078, + "grad_norm": 0.39626878897437523, + "learning_rate": 4.578867975564642e-06, + "loss": 0.6316, + "step": 4317 + }, + { + "epoch": 1.1462896588344618, + "grad_norm": 0.4009063840990518, + "learning_rate": 4.578674030756364e-06, + "loss": 0.5865, + "step": 4318 + }, + { + "epoch": 1.1465551573078454, + "grad_norm": 0.4184239460125252, + "learning_rate": 4.578480045408852e-06, + "loss": 0.5692, + "step": 4319 + }, + { + "epoch": 1.1468206557812293, + "grad_norm": 0.394082915045355, + "learning_rate": 4.578286019525889e-06, + "loss": 0.6147, + "step": 4320 + }, + { + "epoch": 1.1470861542546131, + "grad_norm": 0.4063715073134393, + "learning_rate": 4.578091953111262e-06, + "loss": 0.588, + "step": 4321 + }, + { + "epoch": 1.1473516527279968, + "grad_norm": 0.4088097849868308, + "learning_rate": 4.5778978461687525e-06, + "loss": 0.5907, + "step": 4322 + }, + { + "epoch": 1.1476171512013806, + "grad_norm": 0.39819479412364894, + "learning_rate": 4.577703698702149e-06, + "loss": 0.557, + "step": 4323 + }, + { + "epoch": 1.1478826496747643, + "grad_norm": 0.4131593160183521, + "learning_rate": 4.577509510715234e-06, + "loss": 0.6154, + "step": 4324 + }, + { + "epoch": 1.1481481481481481, + "grad_norm": 0.4197682165234953, + "learning_rate": 4.577315282211799e-06, + "loss": 0.577, + "step": 4325 + }, + { + "epoch": 1.148413646621532, + "grad_norm": 0.4071254271727446, + "learning_rate": 4.577121013195629e-06, + "loss": 0.6078, + "step": 4326 + }, + { + "epoch": 1.1486791450949156, + "grad_norm": 0.4383429135156656, + "learning_rate": 4.576926703670513e-06, + "loss": 0.596, + "step": 4327 + }, + { + "epoch": 1.1489446435682995, + "grad_norm": 0.394587245595195, + "learning_rate": 4.576732353640242e-06, + "loss": 0.6, + "step": 4328 + }, + { + "epoch": 1.1492101420416834, + "grad_norm": 0.4137329699443566, + "learning_rate": 4.576537963108606e-06, + "loss": 0.6188, + "step": 4329 + }, + { + "epoch": 1.149475640515067, + "grad_norm": 0.397942619021064, + "learning_rate": 4.576343532079395e-06, + "loss": 0.6067, + "step": 4330 + }, + { + "epoch": 1.1497411389884509, + "grad_norm": 0.3919396977565792, + "learning_rate": 4.5761490605564005e-06, + "loss": 0.6117, + "step": 4331 + }, + { + "epoch": 1.1500066374618345, + "grad_norm": 0.40139265544110847, + "learning_rate": 4.575954548543416e-06, + "loss": 0.5983, + "step": 4332 + }, + { + "epoch": 1.1502721359352184, + "grad_norm": 0.39496110889879826, + "learning_rate": 4.575759996044236e-06, + "loss": 0.6232, + "step": 4333 + }, + { + "epoch": 1.1505376344086022, + "grad_norm": 0.4107838187004183, + "learning_rate": 4.575565403062653e-06, + "loss": 0.6256, + "step": 4334 + }, + { + "epoch": 1.1508031328819859, + "grad_norm": 0.4092664147597602, + "learning_rate": 4.575370769602463e-06, + "loss": 0.5984, + "step": 4335 + }, + { + "epoch": 1.1510686313553697, + "grad_norm": 0.4101930266650931, + "learning_rate": 4.575176095667462e-06, + "loss": 0.5701, + "step": 4336 + }, + { + "epoch": 1.1513341298287534, + "grad_norm": 0.38430619673382416, + "learning_rate": 4.5749813812614455e-06, + "loss": 0.6357, + "step": 4337 + }, + { + "epoch": 1.1515996283021372, + "grad_norm": 0.40583016322142557, + "learning_rate": 4.574786626388212e-06, + "loss": 0.5767, + "step": 4338 + }, + { + "epoch": 1.151865126775521, + "grad_norm": 0.4114533009190886, + "learning_rate": 4.574591831051559e-06, + "loss": 0.5689, + "step": 4339 + }, + { + "epoch": 1.1521306252489047, + "grad_norm": 0.39756160439719124, + "learning_rate": 4.574396995255286e-06, + "loss": 0.609, + "step": 4340 + }, + { + "epoch": 1.1523961237222886, + "grad_norm": 0.4018294389530276, + "learning_rate": 4.574202119003192e-06, + "loss": 0.628, + "step": 4341 + }, + { + "epoch": 1.1526616221956725, + "grad_norm": 0.4148272578167303, + "learning_rate": 4.57400720229908e-06, + "loss": 0.6139, + "step": 4342 + }, + { + "epoch": 1.1529271206690561, + "grad_norm": 0.3966406167337366, + "learning_rate": 4.573812245146748e-06, + "loss": 0.582, + "step": 4343 + }, + { + "epoch": 1.15319261914244, + "grad_norm": 0.40722377067530713, + "learning_rate": 4.573617247550001e-06, + "loss": 0.6002, + "step": 4344 + }, + { + "epoch": 1.1534581176158236, + "grad_norm": 0.4016143861509407, + "learning_rate": 4.573422209512639e-06, + "loss": 0.627, + "step": 4345 + }, + { + "epoch": 1.1537236160892075, + "grad_norm": 0.3936139569586051, + "learning_rate": 4.573227131038468e-06, + "loss": 0.5365, + "step": 4346 + }, + { + "epoch": 1.1539891145625913, + "grad_norm": 0.38811120907461716, + "learning_rate": 4.5730320121312915e-06, + "loss": 0.6201, + "step": 4347 + }, + { + "epoch": 1.154254613035975, + "grad_norm": 0.42063094122751454, + "learning_rate": 4.572836852794915e-06, + "loss": 0.5914, + "step": 4348 + }, + { + "epoch": 1.1545201115093588, + "grad_norm": 0.41055565884154815, + "learning_rate": 4.5726416530331455e-06, + "loss": 0.6083, + "step": 4349 + }, + { + "epoch": 1.1547856099827425, + "grad_norm": 0.40469347811889916, + "learning_rate": 4.572446412849789e-06, + "loss": 0.5849, + "step": 4350 + }, + { + "epoch": 1.1550511084561264, + "grad_norm": 0.3961204287237384, + "learning_rate": 4.572251132248652e-06, + "loss": 0.5864, + "step": 4351 + }, + { + "epoch": 1.1553166069295102, + "grad_norm": 0.4056444506190797, + "learning_rate": 4.572055811233545e-06, + "loss": 0.587, + "step": 4352 + }, + { + "epoch": 1.1555821054028939, + "grad_norm": 0.40742181248702347, + "learning_rate": 4.571860449808276e-06, + "loss": 0.5758, + "step": 4353 + }, + { + "epoch": 1.1558476038762777, + "grad_norm": 0.4082464463596411, + "learning_rate": 4.571665047976657e-06, + "loss": 0.6179, + "step": 4354 + }, + { + "epoch": 1.1561131023496616, + "grad_norm": 0.42270839953522177, + "learning_rate": 4.5714696057424955e-06, + "loss": 0.6128, + "step": 4355 + }, + { + "epoch": 1.1563786008230452, + "grad_norm": 0.40302720730968244, + "learning_rate": 4.571274123109606e-06, + "loss": 0.5868, + "step": 4356 + }, + { + "epoch": 1.156644099296429, + "grad_norm": 0.39506001108151667, + "learning_rate": 4.5710786000817984e-06, + "loss": 0.5628, + "step": 4357 + }, + { + "epoch": 1.156909597769813, + "grad_norm": 0.41900861998223315, + "learning_rate": 4.5708830366628884e-06, + "loss": 0.6159, + "step": 4358 + }, + { + "epoch": 1.1571750962431966, + "grad_norm": 0.4082948086040363, + "learning_rate": 4.5706874328566886e-06, + "loss": 0.6009, + "step": 4359 + }, + { + "epoch": 1.1574405947165805, + "grad_norm": 0.4256093228088509, + "learning_rate": 4.570491788667014e-06, + "loss": 0.5685, + "step": 4360 + }, + { + "epoch": 1.157706093189964, + "grad_norm": 0.3947565997542352, + "learning_rate": 4.570296104097679e-06, + "loss": 0.5794, + "step": 4361 + }, + { + "epoch": 1.157971591663348, + "grad_norm": 0.4220567093035895, + "learning_rate": 4.570100379152503e-06, + "loss": 0.5844, + "step": 4362 + }, + { + "epoch": 1.1582370901367316, + "grad_norm": 0.3946991063016911, + "learning_rate": 4.5699046138353e-06, + "loss": 0.581, + "step": 4363 + }, + { + "epoch": 1.1585025886101155, + "grad_norm": 0.4013250793148427, + "learning_rate": 4.569708808149889e-06, + "loss": 0.6133, + "step": 4364 + }, + { + "epoch": 1.1587680870834993, + "grad_norm": 0.4111893879446417, + "learning_rate": 4.569512962100088e-06, + "loss": 0.6288, + "step": 4365 + }, + { + "epoch": 1.159033585556883, + "grad_norm": 0.3897881244853749, + "learning_rate": 4.569317075689718e-06, + "loss": 0.5526, + "step": 4366 + }, + { + "epoch": 1.1592990840302668, + "grad_norm": 0.39158440073797823, + "learning_rate": 4.5691211489225985e-06, + "loss": 0.5549, + "step": 4367 + }, + { + "epoch": 1.1595645825036507, + "grad_norm": 0.3878985636000898, + "learning_rate": 4.56892518180255e-06, + "loss": 0.6051, + "step": 4368 + }, + { + "epoch": 1.1598300809770343, + "grad_norm": 0.40838214922032773, + "learning_rate": 4.568729174333396e-06, + "loss": 0.5849, + "step": 4369 + }, + { + "epoch": 1.1600955794504182, + "grad_norm": 0.3877566228637658, + "learning_rate": 4.568533126518957e-06, + "loss": 0.6097, + "step": 4370 + }, + { + "epoch": 1.160361077923802, + "grad_norm": 0.42041684240148997, + "learning_rate": 4.568337038363058e-06, + "loss": 0.5981, + "step": 4371 + }, + { + "epoch": 1.1606265763971857, + "grad_norm": 0.40960859424338986, + "learning_rate": 4.568140909869522e-06, + "loss": 0.6155, + "step": 4372 + }, + { + "epoch": 1.1608920748705696, + "grad_norm": 0.3981388219702541, + "learning_rate": 4.567944741042175e-06, + "loss": 0.613, + "step": 4373 + }, + { + "epoch": 1.1611575733439532, + "grad_norm": 0.4081867519957751, + "learning_rate": 4.567748531884841e-06, + "loss": 0.5763, + "step": 4374 + }, + { + "epoch": 1.161423071817337, + "grad_norm": 0.41679817998867136, + "learning_rate": 4.567552282401349e-06, + "loss": 0.6126, + "step": 4375 + }, + { + "epoch": 1.161688570290721, + "grad_norm": 0.39021181631666174, + "learning_rate": 4.567355992595526e-06, + "loss": 0.572, + "step": 4376 + }, + { + "epoch": 1.1619540687641046, + "grad_norm": 0.4053955432990037, + "learning_rate": 4.567159662471199e-06, + "loss": 0.5794, + "step": 4377 + }, + { + "epoch": 1.1622195672374884, + "grad_norm": 0.4158627036409467, + "learning_rate": 4.5669632920321965e-06, + "loss": 0.5517, + "step": 4378 + }, + { + "epoch": 1.162485065710872, + "grad_norm": 0.4154756497949518, + "learning_rate": 4.56676688128235e-06, + "loss": 0.5759, + "step": 4379 + }, + { + "epoch": 1.162750564184256, + "grad_norm": 0.4153698012457725, + "learning_rate": 4.566570430225488e-06, + "loss": 0.6073, + "step": 4380 + }, + { + "epoch": 1.1630160626576398, + "grad_norm": 0.4118697437215938, + "learning_rate": 4.566373938865443e-06, + "loss": 0.5752, + "step": 4381 + }, + { + "epoch": 1.1632815611310234, + "grad_norm": 0.4039111215126782, + "learning_rate": 4.566177407206047e-06, + "loss": 0.6119, + "step": 4382 + }, + { + "epoch": 1.1635470596044073, + "grad_norm": 0.4248429675101417, + "learning_rate": 4.5659808352511334e-06, + "loss": 0.5743, + "step": 4383 + }, + { + "epoch": 1.1638125580777912, + "grad_norm": 0.39609242733633576, + "learning_rate": 4.565784223004535e-06, + "loss": 0.612, + "step": 4384 + }, + { + "epoch": 1.1640780565511748, + "grad_norm": 0.4099478144199655, + "learning_rate": 4.565587570470085e-06, + "loss": 0.6174, + "step": 4385 + }, + { + "epoch": 1.1643435550245587, + "grad_norm": 0.38693110294079974, + "learning_rate": 4.565390877651621e-06, + "loss": 0.5715, + "step": 4386 + }, + { + "epoch": 1.1646090534979423, + "grad_norm": 0.40842807643642776, + "learning_rate": 4.565194144552978e-06, + "loss": 0.5989, + "step": 4387 + }, + { + "epoch": 1.1648745519713262, + "grad_norm": 0.4048322965176205, + "learning_rate": 4.564997371177992e-06, + "loss": 0.5884, + "step": 4388 + }, + { + "epoch": 1.16514005044471, + "grad_norm": 0.4107886105100827, + "learning_rate": 4.564800557530502e-06, + "loss": 0.6111, + "step": 4389 + }, + { + "epoch": 1.1654055489180937, + "grad_norm": 0.41103771323376853, + "learning_rate": 4.564603703614345e-06, + "loss": 0.6045, + "step": 4390 + }, + { + "epoch": 1.1656710473914775, + "grad_norm": 0.3987996301871185, + "learning_rate": 4.564406809433362e-06, + "loss": 0.5892, + "step": 4391 + }, + { + "epoch": 1.1659365458648612, + "grad_norm": 0.41081796026287204, + "learning_rate": 4.56420987499139e-06, + "loss": 0.6071, + "step": 4392 + }, + { + "epoch": 1.166202044338245, + "grad_norm": 0.3985794969791896, + "learning_rate": 4.564012900292273e-06, + "loss": 0.5723, + "step": 4393 + }, + { + "epoch": 1.166467542811629, + "grad_norm": 0.4030640916780077, + "learning_rate": 4.563815885339849e-06, + "loss": 0.5664, + "step": 4394 + }, + { + "epoch": 1.1667330412850125, + "grad_norm": 0.43623304154587245, + "learning_rate": 4.563618830137964e-06, + "loss": 0.5597, + "step": 4395 + }, + { + "epoch": 1.1669985397583964, + "grad_norm": 0.39797416940200103, + "learning_rate": 4.563421734690458e-06, + "loss": 0.5577, + "step": 4396 + }, + { + "epoch": 1.1672640382317803, + "grad_norm": 0.4093438242962241, + "learning_rate": 4.563224599001176e-06, + "loss": 0.6026, + "step": 4397 + }, + { + "epoch": 1.167529536705164, + "grad_norm": 0.4280372928856646, + "learning_rate": 4.563027423073963e-06, + "loss": 0.6009, + "step": 4398 + }, + { + "epoch": 1.1677950351785478, + "grad_norm": 0.4262371076640645, + "learning_rate": 4.562830206912664e-06, + "loss": 0.572, + "step": 4399 + }, + { + "epoch": 1.1680605336519314, + "grad_norm": 0.41452503322674533, + "learning_rate": 4.562632950521126e-06, + "loss": 0.5813, + "step": 4400 + }, + { + "epoch": 1.1683260321253153, + "grad_norm": 0.40887694032556754, + "learning_rate": 4.562435653903195e-06, + "loss": 0.5658, + "step": 4401 + }, + { + "epoch": 1.1685915305986991, + "grad_norm": 0.43168322887375643, + "learning_rate": 4.562238317062719e-06, + "loss": 0.5888, + "step": 4402 + }, + { + "epoch": 1.1688570290720828, + "grad_norm": 0.41546602270343747, + "learning_rate": 4.562040940003546e-06, + "loss": 0.5852, + "step": 4403 + }, + { + "epoch": 1.1691225275454666, + "grad_norm": 0.4033632677397616, + "learning_rate": 4.561843522729526e-06, + "loss": 0.5897, + "step": 4404 + }, + { + "epoch": 1.1693880260188503, + "grad_norm": 0.42295209094946473, + "learning_rate": 4.561646065244509e-06, + "loss": 0.6243, + "step": 4405 + }, + { + "epoch": 1.1696535244922341, + "grad_norm": 0.3915228058390058, + "learning_rate": 4.5614485675523465e-06, + "loss": 0.5652, + "step": 4406 + }, + { + "epoch": 1.169919022965618, + "grad_norm": 0.39424830298244506, + "learning_rate": 4.561251029656889e-06, + "loss": 0.6068, + "step": 4407 + }, + { + "epoch": 1.1701845214390016, + "grad_norm": 0.4023095949127277, + "learning_rate": 4.561053451561991e-06, + "loss": 0.6163, + "step": 4408 + }, + { + "epoch": 1.1704500199123855, + "grad_norm": 0.4074628522585257, + "learning_rate": 4.560855833271503e-06, + "loss": 0.551, + "step": 4409 + }, + { + "epoch": 1.1707155183857694, + "grad_norm": 0.40475474746346707, + "learning_rate": 4.560658174789281e-06, + "loss": 0.598, + "step": 4410 + }, + { + "epoch": 1.170981016859153, + "grad_norm": 0.4039940095318477, + "learning_rate": 4.560460476119178e-06, + "loss": 0.5967, + "step": 4411 + }, + { + "epoch": 1.1712465153325369, + "grad_norm": 0.432137768569034, + "learning_rate": 4.560262737265053e-06, + "loss": 0.5884, + "step": 4412 + }, + { + "epoch": 1.1715120138059207, + "grad_norm": 0.4048461335656455, + "learning_rate": 4.560064958230759e-06, + "loss": 0.6042, + "step": 4413 + }, + { + "epoch": 1.1717775122793044, + "grad_norm": 0.40193364323770364, + "learning_rate": 4.559867139020154e-06, + "loss": 0.6044, + "step": 4414 + }, + { + "epoch": 1.1720430107526882, + "grad_norm": 0.4177572101740509, + "learning_rate": 4.5596692796370965e-06, + "loss": 0.5479, + "step": 4415 + }, + { + "epoch": 1.1723085092260719, + "grad_norm": 0.4053928878582181, + "learning_rate": 4.559471380085446e-06, + "loss": 0.5931, + "step": 4416 + }, + { + "epoch": 1.1725740076994557, + "grad_norm": 0.39152537400138876, + "learning_rate": 4.559273440369061e-06, + "loss": 0.57, + "step": 4417 + }, + { + "epoch": 1.1728395061728394, + "grad_norm": 0.4060633572060415, + "learning_rate": 4.5590754604918005e-06, + "loss": 0.6121, + "step": 4418 + }, + { + "epoch": 1.1731050046462232, + "grad_norm": 0.40490156316901305, + "learning_rate": 4.558877440457529e-06, + "loss": 0.63, + "step": 4419 + }, + { + "epoch": 1.1733705031196071, + "grad_norm": 0.41274850894603254, + "learning_rate": 4.5586793802701055e-06, + "loss": 0.5992, + "step": 4420 + }, + { + "epoch": 1.1736360015929908, + "grad_norm": 0.3977290201611633, + "learning_rate": 4.558481279933393e-06, + "loss": 0.6018, + "step": 4421 + }, + { + "epoch": 1.1739015000663746, + "grad_norm": 0.40782755391328357, + "learning_rate": 4.558283139451256e-06, + "loss": 0.6141, + "step": 4422 + }, + { + "epoch": 1.1741669985397585, + "grad_norm": 0.4093455930594219, + "learning_rate": 4.558084958827559e-06, + "loss": 0.6222, + "step": 4423 + }, + { + "epoch": 1.1744324970131421, + "grad_norm": 0.4167049326855498, + "learning_rate": 4.557886738066165e-06, + "loss": 0.5963, + "step": 4424 + }, + { + "epoch": 1.174697995486526, + "grad_norm": 0.41377064322024276, + "learning_rate": 4.5576884771709414e-06, + "loss": 0.5889, + "step": 4425 + }, + { + "epoch": 1.1749634939599098, + "grad_norm": 0.40552841499279046, + "learning_rate": 4.557490176145755e-06, + "loss": 0.6068, + "step": 4426 + }, + { + "epoch": 1.1752289924332935, + "grad_norm": 0.405054856497394, + "learning_rate": 4.557291834994472e-06, + "loss": 0.6395, + "step": 4427 + }, + { + "epoch": 1.1754944909066773, + "grad_norm": 0.4122727360553481, + "learning_rate": 4.557093453720961e-06, + "loss": 0.6244, + "step": 4428 + }, + { + "epoch": 1.175759989380061, + "grad_norm": 0.39312919854500455, + "learning_rate": 4.556895032329092e-06, + "loss": 0.5762, + "step": 4429 + }, + { + "epoch": 1.1760254878534449, + "grad_norm": 0.4261987839984862, + "learning_rate": 4.556696570822733e-06, + "loss": 0.6217, + "step": 4430 + }, + { + "epoch": 1.1762909863268287, + "grad_norm": 0.3961696856133243, + "learning_rate": 4.5564980692057545e-06, + "loss": 0.5837, + "step": 4431 + }, + { + "epoch": 1.1765564848002124, + "grad_norm": 0.39374528085339133, + "learning_rate": 4.556299527482029e-06, + "loss": 0.6204, + "step": 4432 + }, + { + "epoch": 1.1768219832735962, + "grad_norm": 0.3954590479198233, + "learning_rate": 4.556100945655428e-06, + "loss": 0.6104, + "step": 4433 + }, + { + "epoch": 1.1770874817469799, + "grad_norm": 0.3735118609040518, + "learning_rate": 4.555902323729825e-06, + "loss": 0.5994, + "step": 4434 + }, + { + "epoch": 1.1773529802203637, + "grad_norm": 0.4164002595732277, + "learning_rate": 4.555703661709092e-06, + "loss": 0.5678, + "step": 4435 + }, + { + "epoch": 1.1776184786937476, + "grad_norm": 0.4114315005319558, + "learning_rate": 4.5555049595971054e-06, + "loss": 0.5768, + "step": 4436 + }, + { + "epoch": 1.1778839771671312, + "grad_norm": 0.4010844841730783, + "learning_rate": 4.555306217397739e-06, + "loss": 0.6069, + "step": 4437 + }, + { + "epoch": 1.178149475640515, + "grad_norm": 0.4179267793227562, + "learning_rate": 4.5551074351148685e-06, + "loss": 0.6171, + "step": 4438 + }, + { + "epoch": 1.178414974113899, + "grad_norm": 0.4053431723984752, + "learning_rate": 4.5549086127523725e-06, + "loss": 0.6048, + "step": 4439 + }, + { + "epoch": 1.1786804725872826, + "grad_norm": 0.41097058314698387, + "learning_rate": 4.554709750314126e-06, + "loss": 0.5922, + "step": 4440 + }, + { + "epoch": 1.1789459710606665, + "grad_norm": 0.4134809534029421, + "learning_rate": 4.554510847804009e-06, + "loss": 0.5874, + "step": 4441 + }, + { + "epoch": 1.17921146953405, + "grad_norm": 0.4474319315573684, + "learning_rate": 4.5543119052259e-06, + "loss": 0.6015, + "step": 4442 + }, + { + "epoch": 1.179476968007434, + "grad_norm": 0.3989674882312488, + "learning_rate": 4.554112922583679e-06, + "loss": 0.5816, + "step": 4443 + }, + { + "epoch": 1.1797424664808178, + "grad_norm": 0.4284139969314759, + "learning_rate": 4.553913899881227e-06, + "loss": 0.5982, + "step": 4444 + }, + { + "epoch": 1.1800079649542015, + "grad_norm": 0.410263729605568, + "learning_rate": 4.553714837122425e-06, + "loss": 0.6215, + "step": 4445 + }, + { + "epoch": 1.1802734634275853, + "grad_norm": 0.43039607111057654, + "learning_rate": 4.553515734311155e-06, + "loss": 0.5971, + "step": 4446 + }, + { + "epoch": 1.180538961900969, + "grad_norm": 0.3976128116246067, + "learning_rate": 4.5533165914513015e-06, + "loss": 0.6068, + "step": 4447 + }, + { + "epoch": 1.1808044603743528, + "grad_norm": 0.42687000644298945, + "learning_rate": 4.5531174085467455e-06, + "loss": 0.5925, + "step": 4448 + }, + { + "epoch": 1.1810699588477367, + "grad_norm": 0.43996845001643287, + "learning_rate": 4.552918185601374e-06, + "loss": 0.6569, + "step": 4449 + }, + { + "epoch": 1.1813354573211203, + "grad_norm": 0.4012139379490353, + "learning_rate": 4.552718922619071e-06, + "loss": 0.6206, + "step": 4450 + }, + { + "epoch": 1.1816009557945042, + "grad_norm": 0.40865989646963835, + "learning_rate": 4.552519619603724e-06, + "loss": 0.6334, + "step": 4451 + }, + { + "epoch": 1.181866454267888, + "grad_norm": 0.3915229794382577, + "learning_rate": 4.552320276559218e-06, + "loss": 0.583, + "step": 4452 + }, + { + "epoch": 1.1821319527412717, + "grad_norm": 0.408867311475484, + "learning_rate": 4.552120893489441e-06, + "loss": 0.5875, + "step": 4453 + }, + { + "epoch": 1.1823974512146556, + "grad_norm": 0.4041688088258742, + "learning_rate": 4.551921470398284e-06, + "loss": 0.6328, + "step": 4454 + }, + { + "epoch": 1.1826629496880392, + "grad_norm": 0.3907784636482234, + "learning_rate": 4.5517220072896335e-06, + "loss": 0.5628, + "step": 4455 + }, + { + "epoch": 1.182928448161423, + "grad_norm": 0.4215109317205618, + "learning_rate": 4.55152250416738e-06, + "loss": 0.5901, + "step": 4456 + }, + { + "epoch": 1.183193946634807, + "grad_norm": 0.44120316888269473, + "learning_rate": 4.551322961035415e-06, + "loss": 0.5816, + "step": 4457 + }, + { + "epoch": 1.1834594451081906, + "grad_norm": 0.40244274378616046, + "learning_rate": 4.55112337789763e-06, + "loss": 0.5775, + "step": 4458 + }, + { + "epoch": 1.1837249435815744, + "grad_norm": 0.41561805639551425, + "learning_rate": 4.550923754757917e-06, + "loss": 0.6064, + "step": 4459 + }, + { + "epoch": 1.183990442054958, + "grad_norm": 0.44607153269247085, + "learning_rate": 4.550724091620169e-06, + "loss": 0.5926, + "step": 4460 + }, + { + "epoch": 1.184255940528342, + "grad_norm": 0.4001639537003755, + "learning_rate": 4.5505243884882804e-06, + "loss": 0.6111, + "step": 4461 + }, + { + "epoch": 1.1845214390017258, + "grad_norm": 0.41067182389608814, + "learning_rate": 4.550324645366145e-06, + "loss": 0.5833, + "step": 4462 + }, + { + "epoch": 1.1847869374751094, + "grad_norm": 0.4054665697570779, + "learning_rate": 4.55012486225766e-06, + "loss": 0.6399, + "step": 4463 + }, + { + "epoch": 1.1850524359484933, + "grad_norm": 0.41083439509682923, + "learning_rate": 4.549925039166721e-06, + "loss": 0.57, + "step": 4464 + }, + { + "epoch": 1.1853179344218772, + "grad_norm": 0.43795726955714775, + "learning_rate": 4.549725176097223e-06, + "loss": 0.5816, + "step": 4465 + }, + { + "epoch": 1.1855834328952608, + "grad_norm": 0.3786445238361183, + "learning_rate": 4.549525273053067e-06, + "loss": 0.5946, + "step": 4466 + }, + { + "epoch": 1.1858489313686447, + "grad_norm": 0.4202558449348351, + "learning_rate": 4.549325330038149e-06, + "loss": 0.6061, + "step": 4467 + }, + { + "epoch": 1.1861144298420285, + "grad_norm": 0.39885877883740245, + "learning_rate": 4.54912534705637e-06, + "loss": 0.6382, + "step": 4468 + }, + { + "epoch": 1.1863799283154122, + "grad_norm": 0.41303146492407405, + "learning_rate": 4.548925324111629e-06, + "loss": 0.6269, + "step": 4469 + }, + { + "epoch": 1.186645426788796, + "grad_norm": 0.4076530912489785, + "learning_rate": 4.548725261207828e-06, + "loss": 0.6292, + "step": 4470 + }, + { + "epoch": 1.1869109252621797, + "grad_norm": 0.4171350147421362, + "learning_rate": 4.548525158348869e-06, + "loss": 0.6066, + "step": 4471 + }, + { + "epoch": 1.1871764237355635, + "grad_norm": 0.40412164642943715, + "learning_rate": 4.548325015538653e-06, + "loss": 0.6467, + "step": 4472 + }, + { + "epoch": 1.1874419222089472, + "grad_norm": 0.3919507147783421, + "learning_rate": 4.548124832781084e-06, + "loss": 0.6009, + "step": 4473 + }, + { + "epoch": 1.187707420682331, + "grad_norm": 0.3997728557657679, + "learning_rate": 4.547924610080065e-06, + "loss": 0.5912, + "step": 4474 + }, + { + "epoch": 1.187972919155715, + "grad_norm": 0.40546725997533944, + "learning_rate": 4.547724347439504e-06, + "loss": 0.5858, + "step": 4475 + }, + { + "epoch": 1.1882384176290985, + "grad_norm": 0.37537981933255915, + "learning_rate": 4.547524044863303e-06, + "loss": 0.5615, + "step": 4476 + }, + { + "epoch": 1.1885039161024824, + "grad_norm": 0.3949913308540101, + "learning_rate": 4.54732370235537e-06, + "loss": 0.5611, + "step": 4477 + }, + { + "epoch": 1.1887694145758663, + "grad_norm": 0.42383223108378926, + "learning_rate": 4.547123319919613e-06, + "loss": 0.5784, + "step": 4478 + }, + { + "epoch": 1.18903491304925, + "grad_norm": 0.40104666067330436, + "learning_rate": 4.546922897559939e-06, + "loss": 0.6221, + "step": 4479 + }, + { + "epoch": 1.1893004115226338, + "grad_norm": 0.41261503353019885, + "learning_rate": 4.546722435280256e-06, + "loss": 0.604, + "step": 4480 + }, + { + "epoch": 1.1895659099960176, + "grad_norm": 0.4081502130888106, + "learning_rate": 4.546521933084475e-06, + "loss": 0.5861, + "step": 4481 + }, + { + "epoch": 1.1898314084694013, + "grad_norm": 0.400822806896701, + "learning_rate": 4.546321390976505e-06, + "loss": 0.5936, + "step": 4482 + }, + { + "epoch": 1.1900969069427851, + "grad_norm": 0.42768810762957216, + "learning_rate": 4.546120808960258e-06, + "loss": 0.5998, + "step": 4483 + }, + { + "epoch": 1.1903624054161688, + "grad_norm": 0.4011064979656986, + "learning_rate": 4.545920187039645e-06, + "loss": 0.6071, + "step": 4484 + }, + { + "epoch": 1.1906279038895526, + "grad_norm": 0.40875027623805554, + "learning_rate": 4.54571952521858e-06, + "loss": 0.6205, + "step": 4485 + }, + { + "epoch": 1.1908934023629365, + "grad_norm": 0.41132805619660584, + "learning_rate": 4.545518823500974e-06, + "loss": 0.5892, + "step": 4486 + }, + { + "epoch": 1.1911589008363201, + "grad_norm": 0.40585142412641545, + "learning_rate": 4.545318081890744e-06, + "loss": 0.5939, + "step": 4487 + }, + { + "epoch": 1.191424399309704, + "grad_norm": 0.407943189320469, + "learning_rate": 4.545117300391803e-06, + "loss": 0.6116, + "step": 4488 + }, + { + "epoch": 1.1916898977830876, + "grad_norm": 0.4318494921232628, + "learning_rate": 4.544916479008068e-06, + "loss": 0.6027, + "step": 4489 + }, + { + "epoch": 1.1919553962564715, + "grad_norm": 0.3828940604674715, + "learning_rate": 4.544715617743454e-06, + "loss": 0.6242, + "step": 4490 + }, + { + "epoch": 1.1922208947298554, + "grad_norm": 0.41038753962187513, + "learning_rate": 4.544514716601879e-06, + "loss": 0.5905, + "step": 4491 + }, + { + "epoch": 1.192486393203239, + "grad_norm": 0.41139164763241187, + "learning_rate": 4.544313775587262e-06, + "loss": 0.6135, + "step": 4492 + }, + { + "epoch": 1.1927518916766229, + "grad_norm": 0.4235348602624653, + "learning_rate": 4.54411279470352e-06, + "loss": 0.6007, + "step": 4493 + }, + { + "epoch": 1.1930173901500067, + "grad_norm": 0.40420641580806965, + "learning_rate": 4.543911773954575e-06, + "loss": 0.5693, + "step": 4494 + }, + { + "epoch": 1.1932828886233904, + "grad_norm": 0.3939052860726097, + "learning_rate": 4.543710713344345e-06, + "loss": 0.5322, + "step": 4495 + }, + { + "epoch": 1.1935483870967742, + "grad_norm": 0.39059419670150747, + "learning_rate": 4.543509612876752e-06, + "loss": 0.5753, + "step": 4496 + }, + { + "epoch": 1.1938138855701579, + "grad_norm": 0.41546161555275146, + "learning_rate": 4.543308472555719e-06, + "loss": 0.5806, + "step": 4497 + }, + { + "epoch": 1.1940793840435417, + "grad_norm": 0.4334850931043781, + "learning_rate": 4.543107292385168e-06, + "loss": 0.5885, + "step": 4498 + }, + { + "epoch": 1.1943448825169256, + "grad_norm": 0.4089692095857566, + "learning_rate": 4.542906072369022e-06, + "loss": 0.568, + "step": 4499 + }, + { + "epoch": 1.1946103809903093, + "grad_norm": 0.4063070216243931, + "learning_rate": 4.542704812511205e-06, + "loss": 0.6113, + "step": 4500 + }, + { + "epoch": 1.1948758794636931, + "grad_norm": 0.4221650231976977, + "learning_rate": 4.542503512815644e-06, + "loss": 0.6173, + "step": 4501 + }, + { + "epoch": 1.1951413779370768, + "grad_norm": 0.40758090839154787, + "learning_rate": 4.542302173286263e-06, + "loss": 0.576, + "step": 4502 + }, + { + "epoch": 1.1954068764104606, + "grad_norm": 0.41290803497968254, + "learning_rate": 4.542100793926989e-06, + "loss": 0.5814, + "step": 4503 + }, + { + "epoch": 1.1956723748838445, + "grad_norm": 0.4208714973677631, + "learning_rate": 4.54189937474175e-06, + "loss": 0.557, + "step": 4504 + }, + { + "epoch": 1.1959378733572281, + "grad_norm": 0.5011782559685238, + "learning_rate": 4.541697915734474e-06, + "loss": 0.56, + "step": 4505 + }, + { + "epoch": 1.196203371830612, + "grad_norm": 0.4085847322666289, + "learning_rate": 4.541496416909089e-06, + "loss": 0.5869, + "step": 4506 + }, + { + "epoch": 1.1964688703039958, + "grad_norm": 0.3958313087239738, + "learning_rate": 4.541294878269526e-06, + "loss": 0.5786, + "step": 4507 + }, + { + "epoch": 1.1967343687773795, + "grad_norm": 0.39454111088478866, + "learning_rate": 4.541093299819714e-06, + "loss": 0.6109, + "step": 4508 + }, + { + "epoch": 1.1969998672507633, + "grad_norm": 0.405234064488423, + "learning_rate": 4.540891681563586e-06, + "loss": 0.6118, + "step": 4509 + }, + { + "epoch": 1.197265365724147, + "grad_norm": 0.41491462236196663, + "learning_rate": 4.540690023505073e-06, + "loss": 0.5486, + "step": 4510 + }, + { + "epoch": 1.1975308641975309, + "grad_norm": 0.3855837997128597, + "learning_rate": 4.540488325648109e-06, + "loss": 0.5963, + "step": 4511 + }, + { + "epoch": 1.1977963626709147, + "grad_norm": 0.41037098836540603, + "learning_rate": 4.540286587996626e-06, + "loss": 0.5522, + "step": 4512 + }, + { + "epoch": 1.1980618611442984, + "grad_norm": 0.42924493232429467, + "learning_rate": 4.540084810554559e-06, + "loss": 0.6041, + "step": 4513 + }, + { + "epoch": 1.1983273596176822, + "grad_norm": 0.40355437810198835, + "learning_rate": 4.5398829933258425e-06, + "loss": 0.5728, + "step": 4514 + }, + { + "epoch": 1.1985928580910659, + "grad_norm": 0.3928276784461158, + "learning_rate": 4.539681136314414e-06, + "loss": 0.6263, + "step": 4515 + }, + { + "epoch": 1.1988583565644497, + "grad_norm": 0.4164995084427747, + "learning_rate": 4.539479239524209e-06, + "loss": 0.565, + "step": 4516 + }, + { + "epoch": 1.1991238550378336, + "grad_norm": 0.4282678988989548, + "learning_rate": 4.539277302959166e-06, + "loss": 0.5586, + "step": 4517 + }, + { + "epoch": 1.1993893535112172, + "grad_norm": 0.4091829418395631, + "learning_rate": 4.539075326623221e-06, + "loss": 0.5984, + "step": 4518 + }, + { + "epoch": 1.199654851984601, + "grad_norm": 0.44798887697912254, + "learning_rate": 4.538873310520316e-06, + "loss": 0.5976, + "step": 4519 + }, + { + "epoch": 1.199920350457985, + "grad_norm": 0.4235865839724445, + "learning_rate": 4.538671254654388e-06, + "loss": 0.5843, + "step": 4520 + }, + { + "epoch": 1.2001858489313686, + "grad_norm": 0.3960958371071703, + "learning_rate": 4.538469159029381e-06, + "loss": 0.5857, + "step": 4521 + }, + { + "epoch": 1.2004513474047525, + "grad_norm": 0.4145149146278742, + "learning_rate": 4.5382670236492335e-06, + "loss": 0.6093, + "step": 4522 + }, + { + "epoch": 1.2007168458781363, + "grad_norm": 0.40765250268742015, + "learning_rate": 4.538064848517888e-06, + "loss": 0.63, + "step": 4523 + }, + { + "epoch": 1.20098234435152, + "grad_norm": 0.4143048055970985, + "learning_rate": 4.5378626336392885e-06, + "loss": 0.5871, + "step": 4524 + }, + { + "epoch": 1.2012478428249038, + "grad_norm": 0.4454744517716582, + "learning_rate": 4.537660379017378e-06, + "loss": 0.5848, + "step": 4525 + }, + { + "epoch": 1.2015133412982875, + "grad_norm": 0.4074511184369045, + "learning_rate": 4.537458084656101e-06, + "loss": 0.5807, + "step": 4526 + }, + { + "epoch": 1.2017788397716713, + "grad_norm": 0.414583462594911, + "learning_rate": 4.537255750559403e-06, + "loss": 0.6341, + "step": 4527 + }, + { + "epoch": 1.202044338245055, + "grad_norm": 0.42959940060825563, + "learning_rate": 4.5370533767312296e-06, + "loss": 0.5525, + "step": 4528 + }, + { + "epoch": 1.2023098367184388, + "grad_norm": 0.45575786058567525, + "learning_rate": 4.536850963175528e-06, + "loss": 0.6377, + "step": 4529 + }, + { + "epoch": 1.2025753351918227, + "grad_norm": 0.4402112900444621, + "learning_rate": 4.536648509896245e-06, + "loss": 0.5758, + "step": 4530 + }, + { + "epoch": 1.2028408336652063, + "grad_norm": 0.38074566253122516, + "learning_rate": 4.536446016897329e-06, + "loss": 0.5889, + "step": 4531 + }, + { + "epoch": 1.2031063321385902, + "grad_norm": 0.4401624468798666, + "learning_rate": 4.536243484182731e-06, + "loss": 0.6074, + "step": 4532 + }, + { + "epoch": 1.203371830611974, + "grad_norm": 0.43027291202695106, + "learning_rate": 4.536040911756399e-06, + "loss": 0.6053, + "step": 4533 + }, + { + "epoch": 1.2036373290853577, + "grad_norm": 0.42253011725451534, + "learning_rate": 4.535838299622284e-06, + "loss": 0.5569, + "step": 4534 + }, + { + "epoch": 1.2039028275587416, + "grad_norm": 0.3907656598436608, + "learning_rate": 4.535635647784338e-06, + "loss": 0.5928, + "step": 4535 + }, + { + "epoch": 1.2041683260321254, + "grad_norm": 0.4112153085011214, + "learning_rate": 4.5354329562465115e-06, + "loss": 0.5911, + "step": 4536 + }, + { + "epoch": 1.204433824505509, + "grad_norm": 0.4040585164261546, + "learning_rate": 4.53523022501276e-06, + "loss": 0.5748, + "step": 4537 + }, + { + "epoch": 1.204699322978893, + "grad_norm": 0.40644159671718444, + "learning_rate": 4.535027454087036e-06, + "loss": 0.6354, + "step": 4538 + }, + { + "epoch": 1.2049648214522766, + "grad_norm": 0.4110424534487759, + "learning_rate": 4.534824643473293e-06, + "loss": 0.5583, + "step": 4539 + }, + { + "epoch": 1.2052303199256604, + "grad_norm": 0.40955423635184196, + "learning_rate": 4.534621793175488e-06, + "loss": 0.5952, + "step": 4540 + }, + { + "epoch": 1.2054958183990443, + "grad_norm": 0.4289755889051613, + "learning_rate": 4.534418903197576e-06, + "loss": 0.5712, + "step": 4541 + }, + { + "epoch": 1.205761316872428, + "grad_norm": 0.4120154199246293, + "learning_rate": 4.534215973543515e-06, + "loss": 0.6286, + "step": 4542 + }, + { + "epoch": 1.2060268153458118, + "grad_norm": 0.41588980666684294, + "learning_rate": 4.534013004217262e-06, + "loss": 0.6193, + "step": 4543 + }, + { + "epoch": 1.2062923138191954, + "grad_norm": 0.39906101138682426, + "learning_rate": 4.533809995222774e-06, + "loss": 0.6124, + "step": 4544 + }, + { + "epoch": 1.2065578122925793, + "grad_norm": 0.4145166725200401, + "learning_rate": 4.533606946564013e-06, + "loss": 0.5961, + "step": 4545 + }, + { + "epoch": 1.2068233107659632, + "grad_norm": 0.42495458727778007, + "learning_rate": 4.5334038582449356e-06, + "loss": 0.6129, + "step": 4546 + }, + { + "epoch": 1.2070888092393468, + "grad_norm": 0.41361553015640895, + "learning_rate": 4.5332007302695055e-06, + "loss": 0.5928, + "step": 4547 + }, + { + "epoch": 1.2073543077127307, + "grad_norm": 0.4029178366872803, + "learning_rate": 4.532997562641683e-06, + "loss": 0.5361, + "step": 4548 + }, + { + "epoch": 1.2076198061861145, + "grad_norm": 0.3996784170604655, + "learning_rate": 4.532794355365431e-06, + "loss": 0.5916, + "step": 4549 + }, + { + "epoch": 1.2078853046594982, + "grad_norm": 0.4011109933262168, + "learning_rate": 4.532591108444712e-06, + "loss": 0.621, + "step": 4550 + }, + { + "epoch": 1.208150803132882, + "grad_norm": 0.3945229566377231, + "learning_rate": 4.532387821883488e-06, + "loss": 0.5801, + "step": 4551 + }, + { + "epoch": 1.2084163016062657, + "grad_norm": 0.398654024028116, + "learning_rate": 4.532184495685726e-06, + "loss": 0.5832, + "step": 4552 + }, + { + "epoch": 1.2086818000796495, + "grad_norm": 0.4157788965184638, + "learning_rate": 4.531981129855392e-06, + "loss": 0.6173, + "step": 4553 + }, + { + "epoch": 1.2089472985530334, + "grad_norm": 0.4051148222248255, + "learning_rate": 4.53177772439645e-06, + "loss": 0.6115, + "step": 4554 + }, + { + "epoch": 1.209212797026417, + "grad_norm": 0.41810120374116055, + "learning_rate": 4.531574279312867e-06, + "loss": 0.6308, + "step": 4555 + }, + { + "epoch": 1.209478295499801, + "grad_norm": 0.40198744311657575, + "learning_rate": 4.531370794608612e-06, + "loss": 0.586, + "step": 4556 + }, + { + "epoch": 1.2097437939731845, + "grad_norm": 0.39062583238353016, + "learning_rate": 4.531167270287653e-06, + "loss": 0.5855, + "step": 4557 + }, + { + "epoch": 1.2100092924465684, + "grad_norm": 0.40915740354957764, + "learning_rate": 4.530963706353959e-06, + "loss": 0.5712, + "step": 4558 + }, + { + "epoch": 1.2102747909199523, + "grad_norm": 0.40037791420706736, + "learning_rate": 4.5307601028115e-06, + "loss": 0.6025, + "step": 4559 + }, + { + "epoch": 1.210540289393336, + "grad_norm": 0.39933653746219744, + "learning_rate": 4.5305564596642475e-06, + "loss": 0.5988, + "step": 4560 + }, + { + "epoch": 1.2108057878667198, + "grad_norm": 0.400864297190555, + "learning_rate": 4.530352776916172e-06, + "loss": 0.586, + "step": 4561 + }, + { + "epoch": 1.2110712863401036, + "grad_norm": 0.41490692534316537, + "learning_rate": 4.530149054571245e-06, + "loss": 0.6253, + "step": 4562 + }, + { + "epoch": 1.2113367848134873, + "grad_norm": 0.40539192908506905, + "learning_rate": 4.529945292633442e-06, + "loss": 0.5514, + "step": 4563 + }, + { + "epoch": 1.2116022832868711, + "grad_norm": 0.389057409179555, + "learning_rate": 4.529741491106735e-06, + "loss": 0.5641, + "step": 4564 + }, + { + "epoch": 1.211867781760255, + "grad_norm": 0.39969140548337406, + "learning_rate": 4.5295376499950995e-06, + "loss": 0.58, + "step": 4565 + }, + { + "epoch": 1.2121332802336386, + "grad_norm": 0.41620549492660186, + "learning_rate": 4.529333769302511e-06, + "loss": 0.5511, + "step": 4566 + }, + { + "epoch": 1.2123987787070225, + "grad_norm": 0.40172141795373945, + "learning_rate": 4.529129849032945e-06, + "loss": 0.6071, + "step": 4567 + }, + { + "epoch": 1.2126642771804061, + "grad_norm": 0.41048727528800466, + "learning_rate": 4.5289258891903784e-06, + "loss": 0.5761, + "step": 4568 + }, + { + "epoch": 1.21292977565379, + "grad_norm": 0.39947229916316845, + "learning_rate": 4.52872188977879e-06, + "loss": 0.6092, + "step": 4569 + }, + { + "epoch": 1.2131952741271737, + "grad_norm": 0.3992775681904612, + "learning_rate": 4.528517850802158e-06, + "loss": 0.5893, + "step": 4570 + }, + { + "epoch": 1.2134607726005575, + "grad_norm": 0.4248472191629426, + "learning_rate": 4.52831377226446e-06, + "loss": 0.5768, + "step": 4571 + }, + { + "epoch": 1.2137262710739414, + "grad_norm": 0.41341739925748106, + "learning_rate": 4.528109654169678e-06, + "loss": 0.5866, + "step": 4572 + }, + { + "epoch": 1.213991769547325, + "grad_norm": 0.40606116812831267, + "learning_rate": 4.527905496521792e-06, + "loss": 0.5787, + "step": 4573 + }, + { + "epoch": 1.2142572680207089, + "grad_norm": 0.39154015159768674, + "learning_rate": 4.527701299324783e-06, + "loss": 0.6088, + "step": 4574 + }, + { + "epoch": 1.2145227664940927, + "grad_norm": 0.3914008201477086, + "learning_rate": 4.527497062582635e-06, + "loss": 0.5969, + "step": 4575 + }, + { + "epoch": 1.2147882649674764, + "grad_norm": 0.40571045446581433, + "learning_rate": 4.52729278629933e-06, + "loss": 0.5737, + "step": 4576 + }, + { + "epoch": 1.2150537634408602, + "grad_norm": 0.4111999284430291, + "learning_rate": 4.527088470478851e-06, + "loss": 0.6127, + "step": 4577 + }, + { + "epoch": 1.215319261914244, + "grad_norm": 0.40429646665507607, + "learning_rate": 4.526884115125184e-06, + "loss": 0.6201, + "step": 4578 + }, + { + "epoch": 1.2155847603876277, + "grad_norm": 0.4006407886039044, + "learning_rate": 4.526679720242315e-06, + "loss": 0.607, + "step": 4579 + }, + { + "epoch": 1.2158502588610116, + "grad_norm": 0.39365647895764416, + "learning_rate": 4.526475285834229e-06, + "loss": 0.5369, + "step": 4580 + }, + { + "epoch": 1.2161157573343953, + "grad_norm": 0.4047812942037202, + "learning_rate": 4.526270811904913e-06, + "loss": 0.5636, + "step": 4581 + }, + { + "epoch": 1.2163812558077791, + "grad_norm": 0.4108136995666403, + "learning_rate": 4.526066298458355e-06, + "loss": 0.5828, + "step": 4582 + }, + { + "epoch": 1.216646754281163, + "grad_norm": 0.4128774609606118, + "learning_rate": 4.525861745498543e-06, + "loss": 0.5875, + "step": 4583 + }, + { + "epoch": 1.2169122527545466, + "grad_norm": 0.4066142453827169, + "learning_rate": 4.525657153029467e-06, + "loss": 0.608, + "step": 4584 + }, + { + "epoch": 1.2171777512279305, + "grad_norm": 0.4198649312306408, + "learning_rate": 4.525452521055117e-06, + "loss": 0.5892, + "step": 4585 + }, + { + "epoch": 1.2174432497013141, + "grad_norm": 0.41533679139931573, + "learning_rate": 4.525247849579484e-06, + "loss": 0.6364, + "step": 4586 + }, + { + "epoch": 1.217708748174698, + "grad_norm": 0.43139295959642077, + "learning_rate": 4.525043138606559e-06, + "loss": 0.5895, + "step": 4587 + }, + { + "epoch": 1.2179742466480818, + "grad_norm": 0.4056367209519541, + "learning_rate": 4.524838388140335e-06, + "loss": 0.6038, + "step": 4588 + }, + { + "epoch": 1.2182397451214655, + "grad_norm": 0.415044173530926, + "learning_rate": 4.524633598184805e-06, + "loss": 0.5698, + "step": 4589 + }, + { + "epoch": 1.2185052435948494, + "grad_norm": 0.41697069069839054, + "learning_rate": 4.524428768743961e-06, + "loss": 0.614, + "step": 4590 + }, + { + "epoch": 1.2187707420682332, + "grad_norm": 0.4115570086337441, + "learning_rate": 4.524223899821802e-06, + "loss": 0.6154, + "step": 4591 + }, + { + "epoch": 1.2190362405416169, + "grad_norm": 0.4166193044486813, + "learning_rate": 4.524018991422318e-06, + "loss": 0.5928, + "step": 4592 + }, + { + "epoch": 1.2193017390150007, + "grad_norm": 0.410511604327231, + "learning_rate": 4.523814043549509e-06, + "loss": 0.6006, + "step": 4593 + }, + { + "epoch": 1.2195672374883844, + "grad_norm": 0.4039501558846791, + "learning_rate": 4.523609056207372e-06, + "loss": 0.5889, + "step": 4594 + }, + { + "epoch": 1.2198327359617682, + "grad_norm": 0.41508578310649125, + "learning_rate": 4.523404029399903e-06, + "loss": 0.5829, + "step": 4595 + }, + { + "epoch": 1.220098234435152, + "grad_norm": 0.3970011813756224, + "learning_rate": 4.523198963131101e-06, + "loss": 0.6332, + "step": 4596 + }, + { + "epoch": 1.2203637329085357, + "grad_norm": 0.41313031555925156, + "learning_rate": 4.522993857404966e-06, + "loss": 0.6023, + "step": 4597 + }, + { + "epoch": 1.2206292313819196, + "grad_norm": 0.39739995783304405, + "learning_rate": 4.522788712225498e-06, + "loss": 0.5933, + "step": 4598 + }, + { + "epoch": 1.2208947298553032, + "grad_norm": 0.4167065395080637, + "learning_rate": 4.522583527596696e-06, + "loss": 0.6148, + "step": 4599 + }, + { + "epoch": 1.221160228328687, + "grad_norm": 0.4144098222409626, + "learning_rate": 4.522378303522566e-06, + "loss": 0.5566, + "step": 4600 + }, + { + "epoch": 1.221425726802071, + "grad_norm": 0.3998625675016481, + "learning_rate": 4.522173040007104e-06, + "loss": 0.5745, + "step": 4601 + }, + { + "epoch": 1.2216912252754546, + "grad_norm": 0.39702683324226407, + "learning_rate": 4.521967737054319e-06, + "loss": 0.5604, + "step": 4602 + }, + { + "epoch": 1.2219567237488385, + "grad_norm": 0.41059343956295374, + "learning_rate": 4.521762394668212e-06, + "loss": 0.6153, + "step": 4603 + }, + { + "epoch": 1.2222222222222223, + "grad_norm": 0.4009489866586967, + "learning_rate": 4.521557012852787e-06, + "loss": 0.6155, + "step": 4604 + }, + { + "epoch": 1.222487720695606, + "grad_norm": 0.41834760473973975, + "learning_rate": 4.521351591612052e-06, + "loss": 0.6347, + "step": 4605 + }, + { + "epoch": 1.2227532191689898, + "grad_norm": 0.4079427954416437, + "learning_rate": 4.521146130950011e-06, + "loss": 0.589, + "step": 4606 + }, + { + "epoch": 1.2230187176423735, + "grad_norm": 0.3954929039564125, + "learning_rate": 4.520940630870672e-06, + "loss": 0.629, + "step": 4607 + }, + { + "epoch": 1.2232842161157573, + "grad_norm": 0.4085706251219328, + "learning_rate": 4.520735091378042e-06, + "loss": 0.5406, + "step": 4608 + }, + { + "epoch": 1.2235497145891412, + "grad_norm": 0.4110493967941192, + "learning_rate": 4.520529512476132e-06, + "loss": 0.616, + "step": 4609 + }, + { + "epoch": 1.2238152130625248, + "grad_norm": 0.3983089461016142, + "learning_rate": 4.520323894168948e-06, + "loss": 0.5994, + "step": 4610 + }, + { + "epoch": 1.2240807115359087, + "grad_norm": 0.40665554619408245, + "learning_rate": 4.520118236460501e-06, + "loss": 0.6026, + "step": 4611 + }, + { + "epoch": 1.2243462100092923, + "grad_norm": 0.4403881455444084, + "learning_rate": 4.519912539354803e-06, + "loss": 0.558, + "step": 4612 + }, + { + "epoch": 1.2246117084826762, + "grad_norm": 0.42054227170701386, + "learning_rate": 4.519706802855866e-06, + "loss": 0.5818, + "step": 4613 + }, + { + "epoch": 1.22487720695606, + "grad_norm": 0.41161084844160967, + "learning_rate": 4.519501026967699e-06, + "loss": 0.5766, + "step": 4614 + }, + { + "epoch": 1.2251427054294437, + "grad_norm": 0.39775434452622377, + "learning_rate": 4.519295211694319e-06, + "loss": 0.6018, + "step": 4615 + }, + { + "epoch": 1.2254082039028276, + "grad_norm": 0.4171902766664939, + "learning_rate": 4.519089357039738e-06, + "loss": 0.5896, + "step": 4616 + }, + { + "epoch": 1.2256737023762114, + "grad_norm": 0.4255779402347447, + "learning_rate": 4.5188834630079705e-06, + "loss": 0.6361, + "step": 4617 + }, + { + "epoch": 1.225939200849595, + "grad_norm": 0.4100721490623379, + "learning_rate": 4.518677529603032e-06, + "loss": 0.603, + "step": 4618 + }, + { + "epoch": 1.226204699322979, + "grad_norm": 0.4062782809134659, + "learning_rate": 4.518471556828939e-06, + "loss": 0.6254, + "step": 4619 + }, + { + "epoch": 1.2264701977963628, + "grad_norm": 0.4091156502775502, + "learning_rate": 4.51826554468971e-06, + "loss": 0.6022, + "step": 4620 + }, + { + "epoch": 1.2267356962697464, + "grad_norm": 0.41707692683216596, + "learning_rate": 4.5180594931893605e-06, + "loss": 0.6062, + "step": 4621 + }, + { + "epoch": 1.2270011947431303, + "grad_norm": 0.4056566322567737, + "learning_rate": 4.5178534023319096e-06, + "loss": 0.6122, + "step": 4622 + }, + { + "epoch": 1.227266693216514, + "grad_norm": 0.41111474368024437, + "learning_rate": 4.517647272121377e-06, + "loss": 0.6156, + "step": 4623 + }, + { + "epoch": 1.2275321916898978, + "grad_norm": 0.41366505204896453, + "learning_rate": 4.517441102561783e-06, + "loss": 0.5562, + "step": 4624 + }, + { + "epoch": 1.2277976901632814, + "grad_norm": 0.405011097796717, + "learning_rate": 4.517234893657148e-06, + "loss": 0.5697, + "step": 4625 + }, + { + "epoch": 1.2280631886366653, + "grad_norm": 0.4013322854132762, + "learning_rate": 4.5170286454114934e-06, + "loss": 0.5999, + "step": 4626 + }, + { + "epoch": 1.2283286871100492, + "grad_norm": 0.4131120973257043, + "learning_rate": 4.516822357828841e-06, + "loss": 0.5804, + "step": 4627 + }, + { + "epoch": 1.2285941855834328, + "grad_norm": 0.4046586142140177, + "learning_rate": 4.516616030913214e-06, + "loss": 0.6332, + "step": 4628 + }, + { + "epoch": 1.2288596840568167, + "grad_norm": 0.4001701879071022, + "learning_rate": 4.5164096646686385e-06, + "loss": 0.5561, + "step": 4629 + }, + { + "epoch": 1.2291251825302005, + "grad_norm": 0.40580430907131293, + "learning_rate": 4.516203259099137e-06, + "loss": 0.6, + "step": 4630 + }, + { + "epoch": 1.2293906810035842, + "grad_norm": 0.40016235897519387, + "learning_rate": 4.515996814208735e-06, + "loss": 0.5774, + "step": 4631 + }, + { + "epoch": 1.229656179476968, + "grad_norm": 0.3934421227732901, + "learning_rate": 4.51579033000146e-06, + "loss": 0.5922, + "step": 4632 + }, + { + "epoch": 1.229921677950352, + "grad_norm": 0.4024494322905055, + "learning_rate": 4.515583806481337e-06, + "loss": 0.6046, + "step": 4633 + }, + { + "epoch": 1.2301871764237355, + "grad_norm": 0.4048726091100463, + "learning_rate": 4.5153772436523955e-06, + "loss": 0.6374, + "step": 4634 + }, + { + "epoch": 1.2304526748971194, + "grad_norm": 0.4167136721780543, + "learning_rate": 4.515170641518663e-06, + "loss": 0.5964, + "step": 4635 + }, + { + "epoch": 1.230718173370503, + "grad_norm": 0.4107196437185372, + "learning_rate": 4.514964000084169e-06, + "loss": 0.5774, + "step": 4636 + }, + { + "epoch": 1.230983671843887, + "grad_norm": 0.3949272655823079, + "learning_rate": 4.514757319352944e-06, + "loss": 0.612, + "step": 4637 + }, + { + "epoch": 1.2312491703172708, + "grad_norm": 0.42061832336763316, + "learning_rate": 4.514550599329017e-06, + "loss": 0.6127, + "step": 4638 + }, + { + "epoch": 1.2315146687906544, + "grad_norm": 0.40004602168281184, + "learning_rate": 4.514343840016422e-06, + "loss": 0.6193, + "step": 4639 + }, + { + "epoch": 1.2317801672640383, + "grad_norm": 0.41111003842226984, + "learning_rate": 4.5141370414191895e-06, + "loss": 0.6096, + "step": 4640 + }, + { + "epoch": 1.232045665737422, + "grad_norm": 0.4069227657412981, + "learning_rate": 4.513930203541354e-06, + "loss": 0.6078, + "step": 4641 + }, + { + "epoch": 1.2323111642108058, + "grad_norm": 0.3902501374962566, + "learning_rate": 4.513723326386948e-06, + "loss": 0.5724, + "step": 4642 + }, + { + "epoch": 1.2325766626841896, + "grad_norm": 0.4082978881839397, + "learning_rate": 4.5135164099600074e-06, + "loss": 0.5692, + "step": 4643 + }, + { + "epoch": 1.2328421611575733, + "grad_norm": 0.41004955212249095, + "learning_rate": 4.513309454264566e-06, + "loss": 0.5849, + "step": 4644 + }, + { + "epoch": 1.2331076596309571, + "grad_norm": 0.40945634756922294, + "learning_rate": 4.5131024593046606e-06, + "loss": 0.5997, + "step": 4645 + }, + { + "epoch": 1.233373158104341, + "grad_norm": 0.4057072883324477, + "learning_rate": 4.512895425084329e-06, + "loss": 0.6041, + "step": 4646 + }, + { + "epoch": 1.2336386565777246, + "grad_norm": 0.40020944826483784, + "learning_rate": 4.512688351607608e-06, + "loss": 0.5559, + "step": 4647 + }, + { + "epoch": 1.2339041550511085, + "grad_norm": 0.3992843360135841, + "learning_rate": 4.512481238878536e-06, + "loss": 0.5671, + "step": 4648 + }, + { + "epoch": 1.2341696535244921, + "grad_norm": 0.413861323136047, + "learning_rate": 4.512274086901153e-06, + "loss": 0.5934, + "step": 4649 + }, + { + "epoch": 1.234435151997876, + "grad_norm": 0.4131061998339838, + "learning_rate": 4.512066895679498e-06, + "loss": 0.613, + "step": 4650 + }, + { + "epoch": 1.2347006504712599, + "grad_norm": 0.411082721705838, + "learning_rate": 4.511859665217612e-06, + "loss": 0.5991, + "step": 4651 + }, + { + "epoch": 1.2349661489446435, + "grad_norm": 0.4129553397487316, + "learning_rate": 4.511652395519537e-06, + "loss": 0.6307, + "step": 4652 + }, + { + "epoch": 1.2352316474180274, + "grad_norm": 0.4005651475073838, + "learning_rate": 4.511445086589315e-06, + "loss": 0.5705, + "step": 4653 + }, + { + "epoch": 1.235497145891411, + "grad_norm": 0.40416961106595234, + "learning_rate": 4.511237738430988e-06, + "loss": 0.632, + "step": 4654 + }, + { + "epoch": 1.2357626443647949, + "grad_norm": 0.4094538232150745, + "learning_rate": 4.511030351048602e-06, + "loss": 0.621, + "step": 4655 + }, + { + "epoch": 1.2360281428381787, + "grad_norm": 0.41203220445226707, + "learning_rate": 4.5108229244462e-06, + "loss": 0.5656, + "step": 4656 + }, + { + "epoch": 1.2362936413115624, + "grad_norm": 0.3974064204139351, + "learning_rate": 4.510615458627827e-06, + "loss": 0.624, + "step": 4657 + }, + { + "epoch": 1.2365591397849462, + "grad_norm": 0.4053512805953914, + "learning_rate": 4.51040795359753e-06, + "loss": 0.5914, + "step": 4658 + }, + { + "epoch": 1.2368246382583301, + "grad_norm": 0.38385587230019147, + "learning_rate": 4.510200409359356e-06, + "loss": 0.6076, + "step": 4659 + }, + { + "epoch": 1.2370901367317138, + "grad_norm": 0.39302575054313876, + "learning_rate": 4.509992825917352e-06, + "loss": 0.6034, + "step": 4660 + }, + { + "epoch": 1.2373556352050976, + "grad_norm": 0.38631989647523873, + "learning_rate": 4.509785203275566e-06, + "loss": 0.5872, + "step": 4661 + }, + { + "epoch": 1.2376211336784813, + "grad_norm": 0.3947814513041379, + "learning_rate": 4.509577541438048e-06, + "loss": 0.6012, + "step": 4662 + }, + { + "epoch": 1.2378866321518651, + "grad_norm": 0.4049232620209587, + "learning_rate": 4.509369840408848e-06, + "loss": 0.6128, + "step": 4663 + }, + { + "epoch": 1.238152130625249, + "grad_norm": 0.3963694798022003, + "learning_rate": 4.5091621001920166e-06, + "loss": 0.5964, + "step": 4664 + }, + { + "epoch": 1.2384176290986326, + "grad_norm": 0.4114874854497047, + "learning_rate": 4.508954320791605e-06, + "loss": 0.5651, + "step": 4665 + }, + { + "epoch": 1.2386831275720165, + "grad_norm": 0.395159427292642, + "learning_rate": 4.508746502211664e-06, + "loss": 0.5826, + "step": 4666 + }, + { + "epoch": 1.2389486260454001, + "grad_norm": 0.4036992794775691, + "learning_rate": 4.508538644456249e-06, + "loss": 0.6182, + "step": 4667 + }, + { + "epoch": 1.239214124518784, + "grad_norm": 0.3969933140920868, + "learning_rate": 4.508330747529413e-06, + "loss": 0.6273, + "step": 4668 + }, + { + "epoch": 1.2394796229921679, + "grad_norm": 0.40312899372428723, + "learning_rate": 4.50812281143521e-06, + "loss": 0.6178, + "step": 4669 + }, + { + "epoch": 1.2397451214655515, + "grad_norm": 0.4043137724542079, + "learning_rate": 4.5079148361776945e-06, + "loss": 0.585, + "step": 4670 + }, + { + "epoch": 1.2400106199389354, + "grad_norm": 0.45024355811210065, + "learning_rate": 4.5077068217609246e-06, + "loss": 0.6055, + "step": 4671 + }, + { + "epoch": 1.2402761184123192, + "grad_norm": 0.41368168926650717, + "learning_rate": 4.5074987681889555e-06, + "loss": 0.565, + "step": 4672 + }, + { + "epoch": 1.2405416168857029, + "grad_norm": 0.42062874365501995, + "learning_rate": 4.5072906754658454e-06, + "loss": 0.601, + "step": 4673 + }, + { + "epoch": 1.2408071153590867, + "grad_norm": 0.5015529124767025, + "learning_rate": 4.5070825435956525e-06, + "loss": 0.6023, + "step": 4674 + }, + { + "epoch": 1.2410726138324706, + "grad_norm": 0.398246879631349, + "learning_rate": 4.506874372582435e-06, + "loss": 0.6171, + "step": 4675 + }, + { + "epoch": 1.2413381123058542, + "grad_norm": 0.41955983213594866, + "learning_rate": 4.506666162430254e-06, + "loss": 0.5944, + "step": 4676 + }, + { + "epoch": 1.241603610779238, + "grad_norm": 0.5200679786373662, + "learning_rate": 4.50645791314317e-06, + "loss": 0.5587, + "step": 4677 + }, + { + "epoch": 1.2418691092526217, + "grad_norm": 0.41630508089961454, + "learning_rate": 4.506249624725244e-06, + "loss": 0.6151, + "step": 4678 + }, + { + "epoch": 1.2421346077260056, + "grad_norm": 0.40955551028286946, + "learning_rate": 4.506041297180538e-06, + "loss": 0.5864, + "step": 4679 + }, + { + "epoch": 1.2424001061993892, + "grad_norm": 0.43170902491721946, + "learning_rate": 4.505832930513115e-06, + "loss": 0.5926, + "step": 4680 + }, + { + "epoch": 1.242665604672773, + "grad_norm": 0.4083531845328568, + "learning_rate": 4.505624524727039e-06, + "loss": 0.5993, + "step": 4681 + }, + { + "epoch": 1.242931103146157, + "grad_norm": 0.4386420105478396, + "learning_rate": 4.5054160798263735e-06, + "loss": 0.5785, + "step": 4682 + }, + { + "epoch": 1.2431966016195406, + "grad_norm": 0.3951430743520727, + "learning_rate": 4.5052075958151856e-06, + "loss": 0.576, + "step": 4683 + }, + { + "epoch": 1.2434621000929245, + "grad_norm": 0.40548669423749983, + "learning_rate": 4.504999072697539e-06, + "loss": 0.596, + "step": 4684 + }, + { + "epoch": 1.2437275985663083, + "grad_norm": 0.4588289431459512, + "learning_rate": 4.504790510477501e-06, + "loss": 0.6228, + "step": 4685 + }, + { + "epoch": 1.243993097039692, + "grad_norm": 0.4090080408112996, + "learning_rate": 4.50458190915914e-06, + "loss": 0.593, + "step": 4686 + }, + { + "epoch": 1.2442585955130758, + "grad_norm": 0.41576918542041946, + "learning_rate": 4.5043732687465245e-06, + "loss": 0.5503, + "step": 4687 + }, + { + "epoch": 1.2445240939864597, + "grad_norm": 0.4008314511939019, + "learning_rate": 4.504164589243721e-06, + "loss": 0.5777, + "step": 4688 + }, + { + "epoch": 1.2447895924598433, + "grad_norm": 0.4073114351363417, + "learning_rate": 4.503955870654803e-06, + "loss": 0.6074, + "step": 4689 + }, + { + "epoch": 1.2450550909332272, + "grad_norm": 0.4140349938233657, + "learning_rate": 4.503747112983838e-06, + "loss": 0.6218, + "step": 4690 + }, + { + "epoch": 1.2453205894066108, + "grad_norm": 0.42281289291644614, + "learning_rate": 4.503538316234898e-06, + "loss": 0.6192, + "step": 4691 + }, + { + "epoch": 1.2455860878799947, + "grad_norm": 0.4067179694581884, + "learning_rate": 4.503329480412055e-06, + "loss": 0.6112, + "step": 4692 + }, + { + "epoch": 1.2458515863533786, + "grad_norm": 0.4128378596988542, + "learning_rate": 4.503120605519383e-06, + "loss": 0.621, + "step": 4693 + }, + { + "epoch": 1.2461170848267622, + "grad_norm": 0.4210900128113515, + "learning_rate": 4.502911691560954e-06, + "loss": 0.5675, + "step": 4694 + }, + { + "epoch": 1.246382583300146, + "grad_norm": 0.40758204383424324, + "learning_rate": 4.502702738540844e-06, + "loss": 0.594, + "step": 4695 + }, + { + "epoch": 1.2466480817735297, + "grad_norm": 0.39190155252852305, + "learning_rate": 4.502493746463126e-06, + "loss": 0.6229, + "step": 4696 + }, + { + "epoch": 1.2469135802469136, + "grad_norm": 0.3956725827925965, + "learning_rate": 4.502284715331878e-06, + "loss": 0.5718, + "step": 4697 + }, + { + "epoch": 1.2471790787202974, + "grad_norm": 0.39501824097640387, + "learning_rate": 4.502075645151175e-06, + "loss": 0.5612, + "step": 4698 + }, + { + "epoch": 1.247444577193681, + "grad_norm": 0.39803292433372905, + "learning_rate": 4.501866535925096e-06, + "loss": 0.5822, + "step": 4699 + }, + { + "epoch": 1.247710075667065, + "grad_norm": 0.3984842903381202, + "learning_rate": 4.501657387657716e-06, + "loss": 0.6264, + "step": 4700 + }, + { + "epoch": 1.2479755741404488, + "grad_norm": 0.3967076500272702, + "learning_rate": 4.501448200353118e-06, + "loss": 0.5803, + "step": 4701 + }, + { + "epoch": 1.2482410726138324, + "grad_norm": 0.3976296019879218, + "learning_rate": 4.501238974015379e-06, + "loss": 0.6121, + "step": 4702 + }, + { + "epoch": 1.2485065710872163, + "grad_norm": 0.4145519121192118, + "learning_rate": 4.50102970864858e-06, + "loss": 0.6072, + "step": 4703 + }, + { + "epoch": 1.2487720695606, + "grad_norm": 0.39764640397743245, + "learning_rate": 4.500820404256803e-06, + "loss": 0.6249, + "step": 4704 + }, + { + "epoch": 1.2490375680339838, + "grad_norm": 0.41122615024325393, + "learning_rate": 4.500611060844129e-06, + "loss": 0.6099, + "step": 4705 + }, + { + "epoch": 1.2493030665073677, + "grad_norm": 0.3890616770628102, + "learning_rate": 4.50040167841464e-06, + "loss": 0.6027, + "step": 4706 + }, + { + "epoch": 1.2495685649807513, + "grad_norm": 0.4058409505074949, + "learning_rate": 4.500192256972421e-06, + "loss": 0.576, + "step": 4707 + }, + { + "epoch": 1.2498340634541352, + "grad_norm": 0.40915680828716045, + "learning_rate": 4.499982796521556e-06, + "loss": 0.6163, + "step": 4708 + }, + { + "epoch": 1.2500995619275188, + "grad_norm": 0.4053049845103103, + "learning_rate": 4.49977329706613e-06, + "loss": 0.6318, + "step": 4709 + }, + { + "epoch": 1.2503650604009027, + "grad_norm": 0.41696705173949994, + "learning_rate": 4.499563758610228e-06, + "loss": 0.568, + "step": 4710 + }, + { + "epoch": 1.2506305588742865, + "grad_norm": 0.415157862556153, + "learning_rate": 4.499354181157936e-06, + "loss": 0.5802, + "step": 4711 + }, + { + "epoch": 1.2508960573476702, + "grad_norm": 0.40249290308362196, + "learning_rate": 4.499144564713343e-06, + "loss": 0.5692, + "step": 4712 + }, + { + "epoch": 1.251161555821054, + "grad_norm": 0.39639656091276615, + "learning_rate": 4.498934909280537e-06, + "loss": 0.6373, + "step": 4713 + }, + { + "epoch": 1.251427054294438, + "grad_norm": 0.4021407436121963, + "learning_rate": 4.498725214863605e-06, + "loss": 0.6079, + "step": 4714 + }, + { + "epoch": 1.2516925527678215, + "grad_norm": 0.4063696027207103, + "learning_rate": 4.498515481466639e-06, + "loss": 0.5864, + "step": 4715 + }, + { + "epoch": 1.2519580512412054, + "grad_norm": 0.4025966927960479, + "learning_rate": 4.498305709093726e-06, + "loss": 0.5816, + "step": 4716 + }, + { + "epoch": 1.2522235497145893, + "grad_norm": 0.41638340641391497, + "learning_rate": 4.49809589774896e-06, + "loss": 0.6278, + "step": 4717 + }, + { + "epoch": 1.252489048187973, + "grad_norm": 0.40860106293670095, + "learning_rate": 4.497886047436432e-06, + "loss": 0.6149, + "step": 4718 + }, + { + "epoch": 1.2527545466613568, + "grad_norm": 0.4062239507832847, + "learning_rate": 4.4976761581602335e-06, + "loss": 0.5809, + "step": 4719 + }, + { + "epoch": 1.2530200451347404, + "grad_norm": 0.40019229522876015, + "learning_rate": 4.497466229924459e-06, + "loss": 0.608, + "step": 4720 + }, + { + "epoch": 1.2532855436081243, + "grad_norm": 0.40499221575238076, + "learning_rate": 4.497256262733203e-06, + "loss": 0.5626, + "step": 4721 + }, + { + "epoch": 1.253551042081508, + "grad_norm": 0.4083037706323773, + "learning_rate": 4.497046256590559e-06, + "loss": 0.5813, + "step": 4722 + }, + { + "epoch": 1.2538165405548918, + "grad_norm": 0.3988592553494657, + "learning_rate": 4.4968362115006245e-06, + "loss": 0.5634, + "step": 4723 + }, + { + "epoch": 1.2540820390282756, + "grad_norm": 0.41194910918504213, + "learning_rate": 4.4966261274674934e-06, + "loss": 0.5874, + "step": 4724 + }, + { + "epoch": 1.2543475375016593, + "grad_norm": 0.4098327613785817, + "learning_rate": 4.496416004495265e-06, + "loss": 0.5952, + "step": 4725 + }, + { + "epoch": 1.2546130359750431, + "grad_norm": 0.40154345997679974, + "learning_rate": 4.496205842588036e-06, + "loss": 0.5944, + "step": 4726 + }, + { + "epoch": 1.254878534448427, + "grad_norm": 0.4047970712517502, + "learning_rate": 4.495995641749907e-06, + "loss": 0.6039, + "step": 4727 + }, + { + "epoch": 1.2551440329218106, + "grad_norm": 0.5090464261553893, + "learning_rate": 4.495785401984975e-06, + "loss": 0.623, + "step": 4728 + }, + { + "epoch": 1.2554095313951945, + "grad_norm": 0.40659137936963125, + "learning_rate": 4.49557512329734e-06, + "loss": 0.5657, + "step": 4729 + }, + { + "epoch": 1.2556750298685784, + "grad_norm": 0.37876860421427405, + "learning_rate": 4.495364805691105e-06, + "loss": 0.5948, + "step": 4730 + }, + { + "epoch": 1.255940528341962, + "grad_norm": 0.4097370145139313, + "learning_rate": 4.495154449170371e-06, + "loss": 0.5725, + "step": 4731 + }, + { + "epoch": 1.2562060268153459, + "grad_norm": 0.4016076593630768, + "learning_rate": 4.49494405373924e-06, + "loss": 0.5599, + "step": 4732 + }, + { + "epoch": 1.2564715252887295, + "grad_norm": 0.41112434906347095, + "learning_rate": 4.494733619401815e-06, + "loss": 0.5867, + "step": 4733 + }, + { + "epoch": 1.2567370237621134, + "grad_norm": 0.41230364602881003, + "learning_rate": 4.4945231461622015e-06, + "loss": 0.6118, + "step": 4734 + }, + { + "epoch": 1.257002522235497, + "grad_norm": 0.3939069021295258, + "learning_rate": 4.494312634024502e-06, + "loss": 0.5986, + "step": 4735 + }, + { + "epoch": 1.2572680207088809, + "grad_norm": 0.3974222344781593, + "learning_rate": 4.494102082992825e-06, + "loss": 0.5912, + "step": 4736 + }, + { + "epoch": 1.2575335191822647, + "grad_norm": 0.4137566663149835, + "learning_rate": 4.493891493071274e-06, + "loss": 0.6081, + "step": 4737 + }, + { + "epoch": 1.2577990176556484, + "grad_norm": 0.39977547756883264, + "learning_rate": 4.4936808642639565e-06, + "loss": 0.5977, + "step": 4738 + }, + { + "epoch": 1.2580645161290323, + "grad_norm": 0.416984841331229, + "learning_rate": 4.493470196574981e-06, + "loss": 0.5927, + "step": 4739 + }, + { + "epoch": 1.2583300146024161, + "grad_norm": 0.3934255723016643, + "learning_rate": 4.493259490008457e-06, + "loss": 0.5806, + "step": 4740 + }, + { + "epoch": 1.2585955130757998, + "grad_norm": 0.4151678926674046, + "learning_rate": 4.493048744568491e-06, + "loss": 0.5773, + "step": 4741 + }, + { + "epoch": 1.2588610115491836, + "grad_norm": 0.4085956237297976, + "learning_rate": 4.4928379602591955e-06, + "loss": 0.5093, + "step": 4742 + }, + { + "epoch": 1.2591265100225675, + "grad_norm": 0.3979179667069224, + "learning_rate": 4.49262713708468e-06, + "loss": 0.5915, + "step": 4743 + }, + { + "epoch": 1.2593920084959511, + "grad_norm": 0.4055209220212614, + "learning_rate": 4.492416275049057e-06, + "loss": 0.631, + "step": 4744 + }, + { + "epoch": 1.259657506969335, + "grad_norm": 0.4099440898415954, + "learning_rate": 4.492205374156438e-06, + "loss": 0.5806, + "step": 4745 + }, + { + "epoch": 1.2599230054427186, + "grad_norm": 0.4244296166019007, + "learning_rate": 4.491994434410937e-06, + "loss": 0.5447, + "step": 4746 + }, + { + "epoch": 1.2601885039161025, + "grad_norm": 0.40926102246167423, + "learning_rate": 4.4917834558166665e-06, + "loss": 0.6181, + "step": 4747 + }, + { + "epoch": 1.2604540023894861, + "grad_norm": 0.41411071143827854, + "learning_rate": 4.4915724383777425e-06, + "loss": 0.5786, + "step": 4748 + }, + { + "epoch": 1.26071950086287, + "grad_norm": 0.431605614908769, + "learning_rate": 4.491361382098279e-06, + "loss": 0.5808, + "step": 4749 + }, + { + "epoch": 1.2609849993362539, + "grad_norm": 0.4126530945020734, + "learning_rate": 4.491150286982394e-06, + "loss": 0.6124, + "step": 4750 + }, + { + "epoch": 1.2612504978096375, + "grad_norm": 0.41152550735466387, + "learning_rate": 4.490939153034203e-06, + "loss": 0.597, + "step": 4751 + }, + { + "epoch": 1.2615159962830214, + "grad_norm": 0.42812263358301217, + "learning_rate": 4.490727980257823e-06, + "loss": 0.6013, + "step": 4752 + }, + { + "epoch": 1.2617814947564052, + "grad_norm": 0.4087351316592823, + "learning_rate": 4.490516768657373e-06, + "loss": 0.5725, + "step": 4753 + }, + { + "epoch": 1.2620469932297889, + "grad_norm": 0.4222095885754993, + "learning_rate": 4.4903055182369735e-06, + "loss": 0.5928, + "step": 4754 + }, + { + "epoch": 1.2623124917031727, + "grad_norm": 0.41816055562302085, + "learning_rate": 4.490094229000743e-06, + "loss": 0.6029, + "step": 4755 + }, + { + "epoch": 1.2625779901765566, + "grad_norm": 0.38859145446784293, + "learning_rate": 4.489882900952802e-06, + "loss": 0.5834, + "step": 4756 + }, + { + "epoch": 1.2628434886499402, + "grad_norm": 0.4920456543950286, + "learning_rate": 4.489671534097273e-06, + "loss": 0.5923, + "step": 4757 + }, + { + "epoch": 1.263108987123324, + "grad_norm": 0.38975299000872626, + "learning_rate": 4.489460128438276e-06, + "loss": 0.5658, + "step": 4758 + }, + { + "epoch": 1.263374485596708, + "grad_norm": 0.4086697348407216, + "learning_rate": 4.489248683979937e-06, + "loss": 0.5668, + "step": 4759 + }, + { + "epoch": 1.2636399840700916, + "grad_norm": 0.4091965681306532, + "learning_rate": 4.489037200726378e-06, + "loss": 0.6212, + "step": 4760 + }, + { + "epoch": 1.2639054825434755, + "grad_norm": 0.44470399037483943, + "learning_rate": 4.488825678681724e-06, + "loss": 0.5685, + "step": 4761 + }, + { + "epoch": 1.264170981016859, + "grad_norm": 0.42906645057804543, + "learning_rate": 4.488614117850098e-06, + "loss": 0.6041, + "step": 4762 + }, + { + "epoch": 1.264436479490243, + "grad_norm": 0.4030515200006183, + "learning_rate": 4.48840251823563e-06, + "loss": 0.5984, + "step": 4763 + }, + { + "epoch": 1.2647019779636266, + "grad_norm": 0.41071995572515413, + "learning_rate": 4.488190879842443e-06, + "loss": 0.5844, + "step": 4764 + }, + { + "epoch": 1.2649674764370105, + "grad_norm": 0.41879161853034597, + "learning_rate": 4.487979202674667e-06, + "loss": 0.5673, + "step": 4765 + }, + { + "epoch": 1.2652329749103943, + "grad_norm": 0.4073524914173789, + "learning_rate": 4.487767486736428e-06, + "loss": 0.5909, + "step": 4766 + }, + { + "epoch": 1.265498473383778, + "grad_norm": 0.4310325096706651, + "learning_rate": 4.487555732031856e-06, + "loss": 0.5652, + "step": 4767 + }, + { + "epoch": 1.2657639718571618, + "grad_norm": 0.4088101291713759, + "learning_rate": 4.487343938565082e-06, + "loss": 0.5817, + "step": 4768 + }, + { + "epoch": 1.2660294703305457, + "grad_norm": 0.40114012494791745, + "learning_rate": 4.4871321063402345e-06, + "loss": 0.6111, + "step": 4769 + }, + { + "epoch": 1.2662949688039293, + "grad_norm": 0.4390390891353396, + "learning_rate": 4.486920235361446e-06, + "loss": 0.5953, + "step": 4770 + }, + { + "epoch": 1.2665604672773132, + "grad_norm": 0.4107415947510306, + "learning_rate": 4.486708325632847e-06, + "loss": 0.5918, + "step": 4771 + }, + { + "epoch": 1.266825965750697, + "grad_norm": 0.4041596575008259, + "learning_rate": 4.486496377158572e-06, + "loss": 0.5535, + "step": 4772 + }, + { + "epoch": 1.2670914642240807, + "grad_norm": 0.3952041202089783, + "learning_rate": 4.486284389942755e-06, + "loss": 0.6087, + "step": 4773 + }, + { + "epoch": 1.2673569626974646, + "grad_norm": 0.4069445701758252, + "learning_rate": 4.486072363989528e-06, + "loss": 0.5532, + "step": 4774 + }, + { + "epoch": 1.2676224611708482, + "grad_norm": 0.46798041103016264, + "learning_rate": 4.4858602993030275e-06, + "loss": 0.5893, + "step": 4775 + }, + { + "epoch": 1.267887959644232, + "grad_norm": 0.39629517895838134, + "learning_rate": 4.4856481958873885e-06, + "loss": 0.5718, + "step": 4776 + }, + { + "epoch": 1.2681534581176157, + "grad_norm": 0.4088267200524566, + "learning_rate": 4.485436053746749e-06, + "loss": 0.6095, + "step": 4777 + }, + { + "epoch": 1.2684189565909996, + "grad_norm": 0.4206337488572424, + "learning_rate": 4.4852238728852435e-06, + "loss": 0.5816, + "step": 4778 + }, + { + "epoch": 1.2686844550643834, + "grad_norm": 0.4248247975976921, + "learning_rate": 4.485011653307013e-06, + "loss": 0.5917, + "step": 4779 + }, + { + "epoch": 1.268949953537767, + "grad_norm": 0.4065412804836255, + "learning_rate": 4.4847993950161955e-06, + "loss": 0.627, + "step": 4780 + }, + { + "epoch": 1.269215452011151, + "grad_norm": 0.411719771782179, + "learning_rate": 4.4845870980169294e-06, + "loss": 0.546, + "step": 4781 + }, + { + "epoch": 1.2694809504845348, + "grad_norm": 0.4191350242941418, + "learning_rate": 4.484374762313356e-06, + "loss": 0.5656, + "step": 4782 + }, + { + "epoch": 1.2697464489579184, + "grad_norm": 0.4050154944718186, + "learning_rate": 4.484162387909616e-06, + "loss": 0.6091, + "step": 4783 + }, + { + "epoch": 1.2700119474313023, + "grad_norm": 0.41182283144403287, + "learning_rate": 4.483949974809852e-06, + "loss": 0.6256, + "step": 4784 + }, + { + "epoch": 1.2702774459046862, + "grad_norm": 0.41344406733221284, + "learning_rate": 4.483737523018206e-06, + "loss": 0.6095, + "step": 4785 + }, + { + "epoch": 1.2705429443780698, + "grad_norm": 0.401132951206163, + "learning_rate": 4.48352503253882e-06, + "loss": 0.5828, + "step": 4786 + }, + { + "epoch": 1.2708084428514537, + "grad_norm": 0.40306996783221555, + "learning_rate": 4.4833125033758404e-06, + "loss": 0.5794, + "step": 4787 + }, + { + "epoch": 1.2710739413248373, + "grad_norm": 0.40600983052603384, + "learning_rate": 4.48309993553341e-06, + "loss": 0.5955, + "step": 4788 + }, + { + "epoch": 1.2713394397982212, + "grad_norm": 0.39566826374726843, + "learning_rate": 4.482887329015677e-06, + "loss": 0.5818, + "step": 4789 + }, + { + "epoch": 1.2716049382716048, + "grad_norm": 0.40271988676629156, + "learning_rate": 4.4826746838267855e-06, + "loss": 0.5918, + "step": 4790 + }, + { + "epoch": 1.2718704367449887, + "grad_norm": 0.3869110575668333, + "learning_rate": 4.4824619999708825e-06, + "loss": 0.6297, + "step": 4791 + }, + { + "epoch": 1.2721359352183725, + "grad_norm": 0.3988670578931698, + "learning_rate": 4.482249277452117e-06, + "loss": 0.5732, + "step": 4792 + }, + { + "epoch": 1.2724014336917562, + "grad_norm": 0.40337604236371577, + "learning_rate": 4.482036516274637e-06, + "loss": 0.5877, + "step": 4793 + }, + { + "epoch": 1.27266693216514, + "grad_norm": 0.3965852107471716, + "learning_rate": 4.4818237164425926e-06, + "loss": 0.6058, + "step": 4794 + }, + { + "epoch": 1.272932430638524, + "grad_norm": 0.39467361845498866, + "learning_rate": 4.481610877960133e-06, + "loss": 0.5946, + "step": 4795 + }, + { + "epoch": 1.2731979291119075, + "grad_norm": 0.41405476701881155, + "learning_rate": 4.48139800083141e-06, + "loss": 0.5655, + "step": 4796 + }, + { + "epoch": 1.2734634275852914, + "grad_norm": 0.39727062147722964, + "learning_rate": 4.481185085060574e-06, + "loss": 0.5744, + "step": 4797 + }, + { + "epoch": 1.2737289260586753, + "grad_norm": 0.4051504859500576, + "learning_rate": 4.480972130651779e-06, + "loss": 0.5939, + "step": 4798 + }, + { + "epoch": 1.273994424532059, + "grad_norm": 0.4110274549838971, + "learning_rate": 4.480759137609177e-06, + "loss": 0.5798, + "step": 4799 + }, + { + "epoch": 1.2742599230054428, + "grad_norm": 0.4041875531959028, + "learning_rate": 4.480546105936921e-06, + "loss": 0.6093, + "step": 4800 + }, + { + "epoch": 1.2745254214788264, + "grad_norm": 0.3970489992123183, + "learning_rate": 4.4803330356391675e-06, + "loss": 0.599, + "step": 4801 + }, + { + "epoch": 1.2747909199522103, + "grad_norm": 0.40995236191142226, + "learning_rate": 4.4801199267200705e-06, + "loss": 0.6095, + "step": 4802 + }, + { + "epoch": 1.275056418425594, + "grad_norm": 0.4015925545741892, + "learning_rate": 4.479906779183788e-06, + "loss": 0.6153, + "step": 4803 + }, + { + "epoch": 1.2753219168989778, + "grad_norm": 0.4093549856330689, + "learning_rate": 4.479693593034475e-06, + "loss": 0.5716, + "step": 4804 + }, + { + "epoch": 1.2755874153723616, + "grad_norm": 0.38151222400251716, + "learning_rate": 4.479480368276289e-06, + "loss": 0.6065, + "step": 4805 + }, + { + "epoch": 1.2758529138457453, + "grad_norm": 0.40350724378341696, + "learning_rate": 4.47926710491339e-06, + "loss": 0.6121, + "step": 4806 + }, + { + "epoch": 1.2761184123191291, + "grad_norm": 0.4075373516368296, + "learning_rate": 4.479053802949936e-06, + "loss": 0.6211, + "step": 4807 + }, + { + "epoch": 1.276383910792513, + "grad_norm": 0.41150806857921224, + "learning_rate": 4.478840462390088e-06, + "loss": 0.5906, + "step": 4808 + }, + { + "epoch": 1.2766494092658967, + "grad_norm": 0.40762919214732546, + "learning_rate": 4.478627083238005e-06, + "loss": 0.5958, + "step": 4809 + }, + { + "epoch": 1.2769149077392805, + "grad_norm": 0.3930084019369108, + "learning_rate": 4.4784136654978505e-06, + "loss": 0.5839, + "step": 4810 + }, + { + "epoch": 1.2771804062126644, + "grad_norm": 0.41015098535345956, + "learning_rate": 4.478200209173785e-06, + "loss": 0.5903, + "step": 4811 + }, + { + "epoch": 1.277445904686048, + "grad_norm": 0.41187644322477424, + "learning_rate": 4.4779867142699715e-06, + "loss": 0.5507, + "step": 4812 + }, + { + "epoch": 1.2777114031594319, + "grad_norm": 0.40852280586238554, + "learning_rate": 4.477773180790574e-06, + "loss": 0.5943, + "step": 4813 + }, + { + "epoch": 1.2779769016328157, + "grad_norm": 0.4030066484287729, + "learning_rate": 4.477559608739758e-06, + "loss": 0.5891, + "step": 4814 + }, + { + "epoch": 1.2782424001061994, + "grad_norm": 0.40570442453337285, + "learning_rate": 4.477345998121687e-06, + "loss": 0.5892, + "step": 4815 + }, + { + "epoch": 1.2785078985795832, + "grad_norm": 0.40639671976606295, + "learning_rate": 4.477132348940528e-06, + "loss": 0.6073, + "step": 4816 + }, + { + "epoch": 1.2787733970529669, + "grad_norm": 0.41492400827927484, + "learning_rate": 4.476918661200447e-06, + "loss": 0.583, + "step": 4817 + }, + { + "epoch": 1.2790388955263507, + "grad_norm": 0.41355270732709376, + "learning_rate": 4.476704934905612e-06, + "loss": 0.5681, + "step": 4818 + }, + { + "epoch": 1.2793043939997344, + "grad_norm": 0.40671603678749557, + "learning_rate": 4.476491170060191e-06, + "loss": 0.5812, + "step": 4819 + }, + { + "epoch": 1.2795698924731183, + "grad_norm": 0.40960390021168025, + "learning_rate": 4.476277366668353e-06, + "loss": 0.6126, + "step": 4820 + }, + { + "epoch": 1.2798353909465021, + "grad_norm": 0.3989253870870453, + "learning_rate": 4.476063524734268e-06, + "loss": 0.5956, + "step": 4821 + }, + { + "epoch": 1.2801008894198858, + "grad_norm": 0.3959694536636935, + "learning_rate": 4.4758496442621055e-06, + "loss": 0.565, + "step": 4822 + }, + { + "epoch": 1.2803663878932696, + "grad_norm": 0.4050948502117333, + "learning_rate": 4.475635725256038e-06, + "loss": 0.5907, + "step": 4823 + }, + { + "epoch": 1.2806318863666535, + "grad_norm": 0.3985265120580581, + "learning_rate": 4.475421767720235e-06, + "loss": 0.5825, + "step": 4824 + }, + { + "epoch": 1.2808973848400371, + "grad_norm": 0.3984796997375991, + "learning_rate": 4.4752077716588725e-06, + "loss": 0.6085, + "step": 4825 + }, + { + "epoch": 1.281162883313421, + "grad_norm": 0.38902982064431796, + "learning_rate": 4.474993737076121e-06, + "loss": 0.5868, + "step": 4826 + }, + { + "epoch": 1.2814283817868048, + "grad_norm": 0.4055795435691279, + "learning_rate": 4.474779663976157e-06, + "loss": 0.5712, + "step": 4827 + }, + { + "epoch": 1.2816938802601885, + "grad_norm": 0.39817241335539527, + "learning_rate": 4.474565552363153e-06, + "loss": 0.5937, + "step": 4828 + }, + { + "epoch": 1.2819593787335724, + "grad_norm": 0.39704626074569316, + "learning_rate": 4.474351402241288e-06, + "loss": 0.5746, + "step": 4829 + }, + { + "epoch": 1.282224877206956, + "grad_norm": 0.42386760846994787, + "learning_rate": 4.474137213614735e-06, + "loss": 0.5539, + "step": 4830 + }, + { + "epoch": 1.2824903756803399, + "grad_norm": 0.41060071853683094, + "learning_rate": 4.473922986487674e-06, + "loss": 0.594, + "step": 4831 + }, + { + "epoch": 1.2827558741537235, + "grad_norm": 0.39749026576463337, + "learning_rate": 4.473708720864281e-06, + "loss": 0.6217, + "step": 4832 + }, + { + "epoch": 1.2830213726271074, + "grad_norm": 0.4205963811592871, + "learning_rate": 4.473494416748736e-06, + "loss": 0.5945, + "step": 4833 + }, + { + "epoch": 1.2832868711004912, + "grad_norm": 0.4213602542027498, + "learning_rate": 4.473280074145219e-06, + "loss": 0.5607, + "step": 4834 + }, + { + "epoch": 1.2835523695738749, + "grad_norm": 0.40042092281192254, + "learning_rate": 4.473065693057908e-06, + "loss": 0.6233, + "step": 4835 + }, + { + "epoch": 1.2838178680472587, + "grad_norm": 0.41131366887398746, + "learning_rate": 4.472851273490985e-06, + "loss": 0.5799, + "step": 4836 + }, + { + "epoch": 1.2840833665206426, + "grad_norm": 0.4023613343319866, + "learning_rate": 4.472636815448631e-06, + "loss": 0.5956, + "step": 4837 + }, + { + "epoch": 1.2843488649940262, + "grad_norm": 0.4149846313082392, + "learning_rate": 4.47242231893503e-06, + "loss": 0.5907, + "step": 4838 + }, + { + "epoch": 1.28461436346741, + "grad_norm": 0.41737786085224055, + "learning_rate": 4.472207783954365e-06, + "loss": 0.6129, + "step": 4839 + }, + { + "epoch": 1.284879861940794, + "grad_norm": 0.4089426490047283, + "learning_rate": 4.471993210510819e-06, + "loss": 0.5601, + "step": 4840 + }, + { + "epoch": 1.2851453604141776, + "grad_norm": 0.40589244994955054, + "learning_rate": 4.471778598608577e-06, + "loss": 0.6024, + "step": 4841 + }, + { + "epoch": 1.2854108588875615, + "grad_norm": 0.4130906605683283, + "learning_rate": 4.471563948251824e-06, + "loss": 0.5867, + "step": 4842 + }, + { + "epoch": 1.285676357360945, + "grad_norm": 0.40454329764588115, + "learning_rate": 4.471349259444748e-06, + "loss": 0.6083, + "step": 4843 + }, + { + "epoch": 1.285941855834329, + "grad_norm": 0.40150588487408206, + "learning_rate": 4.471134532191534e-06, + "loss": 0.5751, + "step": 4844 + }, + { + "epoch": 1.2862073543077126, + "grad_norm": 0.39357217461506994, + "learning_rate": 4.470919766496371e-06, + "loss": 0.5676, + "step": 4845 + }, + { + "epoch": 1.2864728527810965, + "grad_norm": 0.3974261930678769, + "learning_rate": 4.470704962363446e-06, + "loss": 0.5918, + "step": 4846 + }, + { + "epoch": 1.2867383512544803, + "grad_norm": 0.4216397648345008, + "learning_rate": 4.47049011979695e-06, + "loss": 0.5696, + "step": 4847 + }, + { + "epoch": 1.287003849727864, + "grad_norm": 0.3991238213415633, + "learning_rate": 4.470275238801072e-06, + "loss": 0.5609, + "step": 4848 + }, + { + "epoch": 1.2872693482012478, + "grad_norm": 0.39755954531684756, + "learning_rate": 4.470060319380002e-06, + "loss": 0.5833, + "step": 4849 + }, + { + "epoch": 1.2875348466746317, + "grad_norm": 0.410318689773213, + "learning_rate": 4.4698453615379325e-06, + "loss": 0.6427, + "step": 4850 + }, + { + "epoch": 1.2878003451480153, + "grad_norm": 0.4119998414391094, + "learning_rate": 4.469630365279056e-06, + "loss": 0.6049, + "step": 4851 + }, + { + "epoch": 1.2880658436213992, + "grad_norm": 0.415713469107909, + "learning_rate": 4.469415330607565e-06, + "loss": 0.579, + "step": 4852 + }, + { + "epoch": 1.288331342094783, + "grad_norm": 0.4008093507352382, + "learning_rate": 4.469200257527652e-06, + "loss": 0.5921, + "step": 4853 + }, + { + "epoch": 1.2885968405681667, + "grad_norm": 0.4204746050607293, + "learning_rate": 4.468985146043514e-06, + "loss": 0.6103, + "step": 4854 + }, + { + "epoch": 1.2888623390415506, + "grad_norm": 0.4144420026529189, + "learning_rate": 4.468769996159343e-06, + "loss": 0.6077, + "step": 4855 + }, + { + "epoch": 1.2891278375149344, + "grad_norm": 0.41346886902265734, + "learning_rate": 4.468554807879339e-06, + "loss": 0.5892, + "step": 4856 + }, + { + "epoch": 1.289393335988318, + "grad_norm": 0.4130070518978181, + "learning_rate": 4.468339581207694e-06, + "loss": 0.608, + "step": 4857 + }, + { + "epoch": 1.2896588344617017, + "grad_norm": 0.41133939447795226, + "learning_rate": 4.468124316148609e-06, + "loss": 0.6089, + "step": 4858 + }, + { + "epoch": 1.2899243329350856, + "grad_norm": 0.3990111320182523, + "learning_rate": 4.467909012706281e-06, + "loss": 0.6017, + "step": 4859 + }, + { + "epoch": 1.2901898314084694, + "grad_norm": 0.39591399252536497, + "learning_rate": 4.467693670884909e-06, + "loss": 0.6099, + "step": 4860 + }, + { + "epoch": 1.290455329881853, + "grad_norm": 0.4245395063765497, + "learning_rate": 4.467478290688693e-06, + "loss": 0.6037, + "step": 4861 + }, + { + "epoch": 1.290720828355237, + "grad_norm": 0.4131157545958495, + "learning_rate": 4.4672628721218325e-06, + "loss": 0.5918, + "step": 4862 + }, + { + "epoch": 1.2909863268286208, + "grad_norm": 0.4241970294972353, + "learning_rate": 4.467047415188529e-06, + "loss": 0.6079, + "step": 4863 + }, + { + "epoch": 1.2912518253020044, + "grad_norm": 0.40836007528589496, + "learning_rate": 4.466831919892986e-06, + "loss": 0.6031, + "step": 4864 + }, + { + "epoch": 1.2915173237753883, + "grad_norm": 0.40460961855468014, + "learning_rate": 4.466616386239403e-06, + "loss": 0.6057, + "step": 4865 + }, + { + "epoch": 1.2917828222487722, + "grad_norm": 0.3983367622313922, + "learning_rate": 4.466400814231987e-06, + "loss": 0.6104, + "step": 4866 + }, + { + "epoch": 1.2920483207221558, + "grad_norm": 0.44122615558805117, + "learning_rate": 4.46618520387494e-06, + "loss": 0.5715, + "step": 4867 + }, + { + "epoch": 1.2923138191955397, + "grad_norm": 0.39461543050577097, + "learning_rate": 4.465969555172468e-06, + "loss": 0.6042, + "step": 4868 + }, + { + "epoch": 1.2925793176689235, + "grad_norm": 0.4008858405893098, + "learning_rate": 4.465753868128775e-06, + "loss": 0.5769, + "step": 4869 + }, + { + "epoch": 1.2928448161423072, + "grad_norm": 0.41956821905745156, + "learning_rate": 4.465538142748069e-06, + "loss": 0.5602, + "step": 4870 + }, + { + "epoch": 1.293110314615691, + "grad_norm": 0.39954694162905713, + "learning_rate": 4.4653223790345575e-06, + "loss": 0.6066, + "step": 4871 + }, + { + "epoch": 1.2933758130890747, + "grad_norm": 0.392956173024187, + "learning_rate": 4.465106576992447e-06, + "loss": 0.5755, + "step": 4872 + }, + { + "epoch": 1.2936413115624585, + "grad_norm": 0.39229841630418266, + "learning_rate": 4.464890736625947e-06, + "loss": 0.5887, + "step": 4873 + }, + { + "epoch": 1.2939068100358422, + "grad_norm": 0.4020361585980307, + "learning_rate": 4.464674857939267e-06, + "loss": 0.5925, + "step": 4874 + }, + { + "epoch": 1.294172308509226, + "grad_norm": 0.40567876691463367, + "learning_rate": 4.464458940936617e-06, + "loss": 0.5922, + "step": 4875 + }, + { + "epoch": 1.29443780698261, + "grad_norm": 0.4013564549878668, + "learning_rate": 4.464242985622207e-06, + "loss": 0.6106, + "step": 4876 + }, + { + "epoch": 1.2947033054559935, + "grad_norm": 0.4038216727943326, + "learning_rate": 4.46402699200025e-06, + "loss": 0.5887, + "step": 4877 + }, + { + "epoch": 1.2949688039293774, + "grad_norm": 0.3978614122089539, + "learning_rate": 4.463810960074958e-06, + "loss": 0.6061, + "step": 4878 + }, + { + "epoch": 1.2952343024027613, + "grad_norm": 0.3977618233500997, + "learning_rate": 4.463594889850543e-06, + "loss": 0.6316, + "step": 4879 + }, + { + "epoch": 1.295499800876145, + "grad_norm": 0.4218751705702764, + "learning_rate": 4.4633787813312216e-06, + "loss": 0.6167, + "step": 4880 + }, + { + "epoch": 1.2957652993495288, + "grad_norm": 0.4058847872883744, + "learning_rate": 4.463162634521206e-06, + "loss": 0.6037, + "step": 4881 + }, + { + "epoch": 1.2960307978229126, + "grad_norm": 0.42217733066166013, + "learning_rate": 4.462946449424713e-06, + "loss": 0.5571, + "step": 4882 + }, + { + "epoch": 1.2962962962962963, + "grad_norm": 0.4098362634149795, + "learning_rate": 4.462730226045957e-06, + "loss": 0.5729, + "step": 4883 + }, + { + "epoch": 1.2965617947696801, + "grad_norm": 0.4232138768948184, + "learning_rate": 4.462513964389157e-06, + "loss": 0.5639, + "step": 4884 + }, + { + "epoch": 1.2968272932430638, + "grad_norm": 0.39686602524610415, + "learning_rate": 4.462297664458529e-06, + "loss": 0.6282, + "step": 4885 + }, + { + "epoch": 1.2970927917164476, + "grad_norm": 0.4138818690910932, + "learning_rate": 4.462081326258292e-06, + "loss": 0.5851, + "step": 4886 + }, + { + "epoch": 1.2973582901898313, + "grad_norm": 0.4194989149152848, + "learning_rate": 4.4618649497926655e-06, + "loss": 0.581, + "step": 4887 + }, + { + "epoch": 1.2976237886632151, + "grad_norm": 0.40558023529005793, + "learning_rate": 4.461648535065869e-06, + "loss": 0.5991, + "step": 4888 + }, + { + "epoch": 1.297889287136599, + "grad_norm": 0.41331403939075745, + "learning_rate": 4.461432082082123e-06, + "loss": 0.6123, + "step": 4889 + }, + { + "epoch": 1.2981547856099827, + "grad_norm": 0.4174597695987421, + "learning_rate": 4.461215590845649e-06, + "loss": 0.6309, + "step": 4890 + }, + { + "epoch": 1.2984202840833665, + "grad_norm": 0.4111726198608028, + "learning_rate": 4.460999061360668e-06, + "loss": 0.6279, + "step": 4891 + }, + { + "epoch": 1.2986857825567504, + "grad_norm": 0.43028637961062544, + "learning_rate": 4.460782493631405e-06, + "loss": 0.6217, + "step": 4892 + }, + { + "epoch": 1.298951281030134, + "grad_norm": 0.41265769455231094, + "learning_rate": 4.460565887662083e-06, + "loss": 0.5778, + "step": 4893 + }, + { + "epoch": 1.2992167795035179, + "grad_norm": 0.4202122052221382, + "learning_rate": 4.4603492434569255e-06, + "loss": 0.5451, + "step": 4894 + }, + { + "epoch": 1.2994822779769017, + "grad_norm": 0.43001248873158865, + "learning_rate": 4.460132561020158e-06, + "loss": 0.5641, + "step": 4895 + }, + { + "epoch": 1.2997477764502854, + "grad_norm": 0.4372149661039394, + "learning_rate": 4.459915840356006e-06, + "loss": 0.5949, + "step": 4896 + }, + { + "epoch": 1.3000132749236692, + "grad_norm": 0.4021836231967466, + "learning_rate": 4.4596990814686966e-06, + "loss": 0.5682, + "step": 4897 + }, + { + "epoch": 1.300278773397053, + "grad_norm": 0.4057342123872635, + "learning_rate": 4.459482284362457e-06, + "loss": 0.5711, + "step": 4898 + }, + { + "epoch": 1.3005442718704368, + "grad_norm": 0.41845207031620923, + "learning_rate": 4.459265449041516e-06, + "loss": 0.5919, + "step": 4899 + }, + { + "epoch": 1.3008097703438204, + "grad_norm": 0.4048730667788804, + "learning_rate": 4.4590485755101e-06, + "loss": 0.5988, + "step": 4900 + }, + { + "epoch": 1.3010752688172043, + "grad_norm": 0.3970598634688899, + "learning_rate": 4.458831663772441e-06, + "loss": 0.5992, + "step": 4901 + }, + { + "epoch": 1.3013407672905881, + "grad_norm": 0.3907628960114115, + "learning_rate": 4.458614713832768e-06, + "loss": 0.6216, + "step": 4902 + }, + { + "epoch": 1.3016062657639718, + "grad_norm": 0.40237661235405225, + "learning_rate": 4.458397725695314e-06, + "loss": 0.5517, + "step": 4903 + }, + { + "epoch": 1.3018717642373556, + "grad_norm": 0.4042654953387462, + "learning_rate": 4.458180699364308e-06, + "loss": 0.5938, + "step": 4904 + }, + { + "epoch": 1.3021372627107395, + "grad_norm": 0.4148204077744537, + "learning_rate": 4.457963634843985e-06, + "loss": 0.6185, + "step": 4905 + }, + { + "epoch": 1.3024027611841231, + "grad_norm": 0.4157144792489109, + "learning_rate": 4.457746532138576e-06, + "loss": 0.5866, + "step": 4906 + }, + { + "epoch": 1.302668259657507, + "grad_norm": 0.4103066982869338, + "learning_rate": 4.457529391252317e-06, + "loss": 0.5722, + "step": 4907 + }, + { + "epoch": 1.3029337581308909, + "grad_norm": 0.3958075038029031, + "learning_rate": 4.457312212189442e-06, + "loss": 0.6335, + "step": 4908 + }, + { + "epoch": 1.3031992566042745, + "grad_norm": 0.4118756023541191, + "learning_rate": 4.457094994954186e-06, + "loss": 0.5822, + "step": 4909 + }, + { + "epoch": 1.3034647550776584, + "grad_norm": 0.40902517113752285, + "learning_rate": 4.456877739550787e-06, + "loss": 0.6125, + "step": 4910 + }, + { + "epoch": 1.3037302535510422, + "grad_norm": 0.4024089878684346, + "learning_rate": 4.456660445983479e-06, + "loss": 0.6175, + "step": 4911 + }, + { + "epoch": 1.3039957520244259, + "grad_norm": 0.4007104200870501, + "learning_rate": 4.456443114256503e-06, + "loss": 0.5798, + "step": 4912 + }, + { + "epoch": 1.3042612504978095, + "grad_norm": 0.40492257298770873, + "learning_rate": 4.4562257443740955e-06, + "loss": 0.5647, + "step": 4913 + }, + { + "epoch": 1.3045267489711934, + "grad_norm": 0.41318478381146634, + "learning_rate": 4.456008336340497e-06, + "loss": 0.6189, + "step": 4914 + }, + { + "epoch": 1.3047922474445772, + "grad_norm": 0.42449931760978266, + "learning_rate": 4.455790890159946e-06, + "loss": 0.5838, + "step": 4915 + }, + { + "epoch": 1.3050577459179609, + "grad_norm": 0.41283085691953864, + "learning_rate": 4.455573405836684e-06, + "loss": 0.6002, + "step": 4916 + }, + { + "epoch": 1.3053232443913447, + "grad_norm": 0.42067519724069574, + "learning_rate": 4.455355883374953e-06, + "loss": 0.5906, + "step": 4917 + }, + { + "epoch": 1.3055887428647286, + "grad_norm": 0.3966990758298903, + "learning_rate": 4.455138322778995e-06, + "loss": 0.6081, + "step": 4918 + }, + { + "epoch": 1.3058542413381122, + "grad_norm": 0.39692347523808585, + "learning_rate": 4.454920724053052e-06, + "loss": 0.5817, + "step": 4919 + }, + { + "epoch": 1.306119739811496, + "grad_norm": 0.40854683486509485, + "learning_rate": 4.454703087201368e-06, + "loss": 0.581, + "step": 4920 + }, + { + "epoch": 1.30638523828488, + "grad_norm": 0.40358569446086373, + "learning_rate": 4.4544854122281885e-06, + "loss": 0.5978, + "step": 4921 + }, + { + "epoch": 1.3066507367582636, + "grad_norm": 0.39539448833647567, + "learning_rate": 4.454267699137759e-06, + "loss": 0.5813, + "step": 4922 + }, + { + "epoch": 1.3069162352316475, + "grad_norm": 0.4178932170650583, + "learning_rate": 4.454049947934323e-06, + "loss": 0.602, + "step": 4923 + }, + { + "epoch": 1.3071817337050313, + "grad_norm": 0.4087833790590675, + "learning_rate": 4.453832158622129e-06, + "loss": 0.58, + "step": 4924 + }, + { + "epoch": 1.307447232178415, + "grad_norm": 0.4315773571172743, + "learning_rate": 4.4536143312054246e-06, + "loss": 0.6021, + "step": 4925 + }, + { + "epoch": 1.3077127306517988, + "grad_norm": 0.43304482672326006, + "learning_rate": 4.453396465688458e-06, + "loss": 0.5878, + "step": 4926 + }, + { + "epoch": 1.3079782291251825, + "grad_norm": 0.40648341954478884, + "learning_rate": 4.4531785620754765e-06, + "loss": 0.6036, + "step": 4927 + }, + { + "epoch": 1.3082437275985663, + "grad_norm": 0.4104017402563507, + "learning_rate": 4.452960620370731e-06, + "loss": 0.5634, + "step": 4928 + }, + { + "epoch": 1.30850922607195, + "grad_norm": 0.4215156051170924, + "learning_rate": 4.452742640578472e-06, + "loss": 0.5798, + "step": 4929 + }, + { + "epoch": 1.3087747245453338, + "grad_norm": 0.4293369353442428, + "learning_rate": 4.45252462270295e-06, + "loss": 0.595, + "step": 4930 + }, + { + "epoch": 1.3090402230187177, + "grad_norm": 0.40963147419785767, + "learning_rate": 4.452306566748418e-06, + "loss": 0.5794, + "step": 4931 + }, + { + "epoch": 1.3093057214921013, + "grad_norm": 0.40718378492322854, + "learning_rate": 4.452088472719127e-06, + "loss": 0.5882, + "step": 4932 + }, + { + "epoch": 1.3095712199654852, + "grad_norm": 0.41386040653446554, + "learning_rate": 4.451870340619331e-06, + "loss": 0.5912, + "step": 4933 + }, + { + "epoch": 1.309836718438869, + "grad_norm": 0.43873713573961204, + "learning_rate": 4.4516521704532855e-06, + "loss": 0.5983, + "step": 4934 + }, + { + "epoch": 1.3101022169122527, + "grad_norm": 0.41559467475908973, + "learning_rate": 4.4514339622252434e-06, + "loss": 0.5746, + "step": 4935 + }, + { + "epoch": 1.3103677153856366, + "grad_norm": 0.3894541434040422, + "learning_rate": 4.451215715939461e-06, + "loss": 0.5968, + "step": 4936 + }, + { + "epoch": 1.3106332138590204, + "grad_norm": 0.4036207110914407, + "learning_rate": 4.450997431600196e-06, + "loss": 0.5797, + "step": 4937 + }, + { + "epoch": 1.310898712332404, + "grad_norm": 0.41484750551669364, + "learning_rate": 4.450779109211703e-06, + "loss": 0.5812, + "step": 4938 + }, + { + "epoch": 1.311164210805788, + "grad_norm": 0.4103433857138673, + "learning_rate": 4.450560748778242e-06, + "loss": 0.5606, + "step": 4939 + }, + { + "epoch": 1.3114297092791716, + "grad_norm": 0.40268329385275714, + "learning_rate": 4.45034235030407e-06, + "loss": 0.566, + "step": 4940 + }, + { + "epoch": 1.3116952077525554, + "grad_norm": 0.4195615678342612, + "learning_rate": 4.450123913793446e-06, + "loss": 0.5839, + "step": 4941 + }, + { + "epoch": 1.311960706225939, + "grad_norm": 0.4075118580038868, + "learning_rate": 4.449905439250632e-06, + "loss": 0.5909, + "step": 4942 + }, + { + "epoch": 1.312226204699323, + "grad_norm": 0.4077988711345542, + "learning_rate": 4.449686926679888e-06, + "loss": 0.584, + "step": 4943 + }, + { + "epoch": 1.3124917031727068, + "grad_norm": 0.39843862997833607, + "learning_rate": 4.449468376085475e-06, + "loss": 0.6331, + "step": 4944 + }, + { + "epoch": 1.3127572016460904, + "grad_norm": 0.40758777115073547, + "learning_rate": 4.449249787471655e-06, + "loss": 0.6179, + "step": 4945 + }, + { + "epoch": 1.3130227001194743, + "grad_norm": 0.4170488038253358, + "learning_rate": 4.4490311608426915e-06, + "loss": 0.6002, + "step": 4946 + }, + { + "epoch": 1.3132881985928582, + "grad_norm": 0.4137659366462854, + "learning_rate": 4.448812496202849e-06, + "loss": 0.5762, + "step": 4947 + }, + { + "epoch": 1.3135536970662418, + "grad_norm": 0.41173212727099967, + "learning_rate": 4.448593793556391e-06, + "loss": 0.5936, + "step": 4948 + }, + { + "epoch": 1.3138191955396257, + "grad_norm": 0.41220219762569343, + "learning_rate": 4.448375052907583e-06, + "loss": 0.6073, + "step": 4949 + }, + { + "epoch": 1.3140846940130095, + "grad_norm": 0.41164444333141387, + "learning_rate": 4.448156274260692e-06, + "loss": 0.5952, + "step": 4950 + }, + { + "epoch": 1.3143501924863932, + "grad_norm": 0.41336092340573943, + "learning_rate": 4.447937457619982e-06, + "loss": 0.5786, + "step": 4951 + }, + { + "epoch": 1.314615690959777, + "grad_norm": 0.4132578017693204, + "learning_rate": 4.447718602989723e-06, + "loss": 0.5486, + "step": 4952 + }, + { + "epoch": 1.3148811894331607, + "grad_norm": 0.40171552999094035, + "learning_rate": 4.447499710374184e-06, + "loss": 0.5641, + "step": 4953 + }, + { + "epoch": 1.3151466879065445, + "grad_norm": 0.3990453125849152, + "learning_rate": 4.447280779777629e-06, + "loss": 0.5577, + "step": 4954 + }, + { + "epoch": 1.3154121863799282, + "grad_norm": 0.4196073706130943, + "learning_rate": 4.447061811204334e-06, + "loss": 0.5552, + "step": 4955 + }, + { + "epoch": 1.315677684853312, + "grad_norm": 0.40245043178713646, + "learning_rate": 4.446842804658566e-06, + "loss": 0.5824, + "step": 4956 + }, + { + "epoch": 1.315943183326696, + "grad_norm": 0.4074182932135894, + "learning_rate": 4.446623760144595e-06, + "loss": 0.5883, + "step": 4957 + }, + { + "epoch": 1.3162086818000795, + "grad_norm": 0.41514255806711453, + "learning_rate": 4.446404677666696e-06, + "loss": 0.5774, + "step": 4958 + }, + { + "epoch": 1.3164741802734634, + "grad_norm": 0.41226552621481066, + "learning_rate": 4.446185557229139e-06, + "loss": 0.6007, + "step": 4959 + }, + { + "epoch": 1.3167396787468473, + "grad_norm": 0.4174646477327288, + "learning_rate": 4.445966398836199e-06, + "loss": 0.5885, + "step": 4960 + }, + { + "epoch": 1.317005177220231, + "grad_norm": 0.4175165796360001, + "learning_rate": 4.44574720249215e-06, + "loss": 0.6381, + "step": 4961 + }, + { + "epoch": 1.3172706756936148, + "grad_norm": 0.4179511888457544, + "learning_rate": 4.445527968201267e-06, + "loss": 0.5988, + "step": 4962 + }, + { + "epoch": 1.3175361741669986, + "grad_norm": 0.4145385412129239, + "learning_rate": 4.445308695967824e-06, + "loss": 0.5728, + "step": 4963 + }, + { + "epoch": 1.3178016726403823, + "grad_norm": 0.42404094443982626, + "learning_rate": 4.445089385796099e-06, + "loss": 0.6115, + "step": 4964 + }, + { + "epoch": 1.3180671711137661, + "grad_norm": 0.4031924679634836, + "learning_rate": 4.444870037690368e-06, + "loss": 0.5817, + "step": 4965 + }, + { + "epoch": 1.31833266958715, + "grad_norm": 0.40885603633534584, + "learning_rate": 4.444650651654911e-06, + "loss": 0.6037, + "step": 4966 + }, + { + "epoch": 1.3185981680605336, + "grad_norm": 0.45789135321367413, + "learning_rate": 4.444431227694004e-06, + "loss": 0.6013, + "step": 4967 + }, + { + "epoch": 1.3188636665339175, + "grad_norm": 0.409986144554615, + "learning_rate": 4.444211765811927e-06, + "loss": 0.5808, + "step": 4968 + }, + { + "epoch": 1.3191291650073012, + "grad_norm": 0.4080317409770435, + "learning_rate": 4.443992266012961e-06, + "loss": 0.6323, + "step": 4969 + }, + { + "epoch": 1.319394663480685, + "grad_norm": 0.40962624701312145, + "learning_rate": 4.443772728301385e-06, + "loss": 0.5617, + "step": 4970 + }, + { + "epoch": 1.3196601619540687, + "grad_norm": 0.4184075206212948, + "learning_rate": 4.4435531526814825e-06, + "loss": 0.5714, + "step": 4971 + }, + { + "epoch": 1.3199256604274525, + "grad_norm": 0.40541038067775204, + "learning_rate": 4.443333539157535e-06, + "loss": 0.6065, + "step": 4972 + }, + { + "epoch": 1.3201911589008364, + "grad_norm": 0.3979943404378569, + "learning_rate": 4.443113887733824e-06, + "loss": 0.561, + "step": 4973 + }, + { + "epoch": 1.32045665737422, + "grad_norm": 0.40468721169834854, + "learning_rate": 4.442894198414636e-06, + "loss": 0.5863, + "step": 4974 + }, + { + "epoch": 1.3207221558476039, + "grad_norm": 0.41360532501049635, + "learning_rate": 4.4426744712042545e-06, + "loss": 0.5477, + "step": 4975 + }, + { + "epoch": 1.3209876543209877, + "grad_norm": 0.3980882953464089, + "learning_rate": 4.442454706106963e-06, + "loss": 0.5848, + "step": 4976 + }, + { + "epoch": 1.3212531527943714, + "grad_norm": 0.39465377928410006, + "learning_rate": 4.4422349031270495e-06, + "loss": 0.5909, + "step": 4977 + }, + { + "epoch": 1.3215186512677553, + "grad_norm": 0.41398691594786546, + "learning_rate": 4.4420150622688e-06, + "loss": 0.6067, + "step": 4978 + }, + { + "epoch": 1.3217841497411391, + "grad_norm": 0.4169460212147881, + "learning_rate": 4.441795183536502e-06, + "loss": 0.5964, + "step": 4979 + }, + { + "epoch": 1.3220496482145228, + "grad_norm": 0.4032021127670633, + "learning_rate": 4.441575266934444e-06, + "loss": 0.5971, + "step": 4980 + }, + { + "epoch": 1.3223151466879066, + "grad_norm": 0.4114418771584214, + "learning_rate": 4.4413553124669126e-06, + "loss": 0.5583, + "step": 4981 + }, + { + "epoch": 1.3225806451612903, + "grad_norm": 0.39697519598617365, + "learning_rate": 4.441135320138201e-06, + "loss": 0.59, + "step": 4982 + }, + { + "epoch": 1.3228461436346741, + "grad_norm": 0.3990293758275327, + "learning_rate": 4.440915289952598e-06, + "loss": 0.6205, + "step": 4983 + }, + { + "epoch": 1.3231116421080578, + "grad_norm": 0.4020691031854009, + "learning_rate": 4.440695221914394e-06, + "loss": 0.5803, + "step": 4984 + }, + { + "epoch": 1.3233771405814416, + "grad_norm": 0.4102046584337675, + "learning_rate": 4.440475116027882e-06, + "loss": 0.595, + "step": 4985 + }, + { + "epoch": 1.3236426390548255, + "grad_norm": 0.39274017988004467, + "learning_rate": 4.440254972297354e-06, + "loss": 0.5708, + "step": 4986 + }, + { + "epoch": 1.3239081375282091, + "grad_norm": 0.3920600022470943, + "learning_rate": 4.4400347907271024e-06, + "loss": 0.5414, + "step": 4987 + }, + { + "epoch": 1.324173636001593, + "grad_norm": 0.38421972792100684, + "learning_rate": 4.4398145713214234e-06, + "loss": 0.5774, + "step": 4988 + }, + { + "epoch": 1.3244391344749769, + "grad_norm": 0.41529836990010277, + "learning_rate": 4.439594314084611e-06, + "loss": 0.5571, + "step": 4989 + }, + { + "epoch": 1.3247046329483605, + "grad_norm": 0.4060323850989439, + "learning_rate": 4.43937401902096e-06, + "loss": 0.6157, + "step": 4990 + }, + { + "epoch": 1.3249701314217444, + "grad_norm": 0.40949474269586444, + "learning_rate": 4.439153686134768e-06, + "loss": 0.5963, + "step": 4991 + }, + { + "epoch": 1.3252356298951282, + "grad_norm": 0.4029031408444375, + "learning_rate": 4.438933315430331e-06, + "loss": 0.5712, + "step": 4992 + }, + { + "epoch": 1.3255011283685119, + "grad_norm": 0.4043024787791668, + "learning_rate": 4.438712906911946e-06, + "loss": 0.5846, + "step": 4993 + }, + { + "epoch": 1.3257666268418957, + "grad_norm": 0.40919937957356767, + "learning_rate": 4.438492460583913e-06, + "loss": 0.602, + "step": 4994 + }, + { + "epoch": 1.3260321253152794, + "grad_norm": 0.41866175490329155, + "learning_rate": 4.438271976450531e-06, + "loss": 0.5969, + "step": 4995 + }, + { + "epoch": 1.3262976237886632, + "grad_norm": 0.4040783534475649, + "learning_rate": 4.4380514545161e-06, + "loss": 0.649, + "step": 4996 + }, + { + "epoch": 1.3265631222620469, + "grad_norm": 0.4070636098216918, + "learning_rate": 4.437830894784921e-06, + "loss": 0.5958, + "step": 4997 + }, + { + "epoch": 1.3268286207354307, + "grad_norm": 0.405725607606133, + "learning_rate": 4.437610297261294e-06, + "loss": 0.6041, + "step": 4998 + }, + { + "epoch": 1.3270941192088146, + "grad_norm": 0.41216796382968224, + "learning_rate": 4.437389661949522e-06, + "loss": 0.5959, + "step": 4999 + }, + { + "epoch": 1.3273596176821982, + "grad_norm": 0.42580833447783706, + "learning_rate": 4.4371689888539085e-06, + "loss": 0.5918, + "step": 5000 + }, + { + "epoch": 1.327625116155582, + "grad_norm": 0.4022898238282934, + "learning_rate": 4.436948277978756e-06, + "loss": 0.5387, + "step": 5001 + }, + { + "epoch": 1.327890614628966, + "grad_norm": 0.4243664722750062, + "learning_rate": 4.4367275293283706e-06, + "loss": 0.6421, + "step": 5002 + }, + { + "epoch": 1.3281561131023496, + "grad_norm": 0.40452875420533846, + "learning_rate": 4.436506742907056e-06, + "loss": 0.6072, + "step": 5003 + }, + { + "epoch": 1.3284216115757335, + "grad_norm": 0.4006932644712524, + "learning_rate": 4.436285918719118e-06, + "loss": 0.6004, + "step": 5004 + }, + { + "epoch": 1.3286871100491173, + "grad_norm": 0.41270472684735265, + "learning_rate": 4.4360650567688646e-06, + "loss": 0.6129, + "step": 5005 + }, + { + "epoch": 1.328952608522501, + "grad_norm": 0.41761017984070553, + "learning_rate": 4.435844157060602e-06, + "loss": 0.5781, + "step": 5006 + }, + { + "epoch": 1.3292181069958848, + "grad_norm": 0.4014641418120313, + "learning_rate": 4.435623219598638e-06, + "loss": 0.6019, + "step": 5007 + }, + { + "epoch": 1.3294836054692685, + "grad_norm": 0.4158333117016189, + "learning_rate": 4.435402244387284e-06, + "loss": 0.5686, + "step": 5008 + }, + { + "epoch": 1.3297491039426523, + "grad_norm": 0.39589892370960766, + "learning_rate": 4.435181231430845e-06, + "loss": 0.6102, + "step": 5009 + }, + { + "epoch": 1.330014602416036, + "grad_norm": 0.41354345770561685, + "learning_rate": 4.4349601807336354e-06, + "loss": 0.6024, + "step": 5010 + }, + { + "epoch": 1.3302801008894198, + "grad_norm": 0.39990629929221533, + "learning_rate": 4.434739092299964e-06, + "loss": 0.5865, + "step": 5011 + }, + { + "epoch": 1.3305455993628037, + "grad_norm": 0.3960230668392498, + "learning_rate": 4.434517966134143e-06, + "loss": 0.6226, + "step": 5012 + }, + { + "epoch": 1.3308110978361873, + "grad_norm": 0.4163560407375716, + "learning_rate": 4.4342968022404855e-06, + "loss": 0.599, + "step": 5013 + }, + { + "epoch": 1.3310765963095712, + "grad_norm": 0.410894748073801, + "learning_rate": 4.4340756006233045e-06, + "loss": 0.5679, + "step": 5014 + }, + { + "epoch": 1.331342094782955, + "grad_norm": 0.39352752492836857, + "learning_rate": 4.4338543612869146e-06, + "loss": 0.5641, + "step": 5015 + }, + { + "epoch": 1.3316075932563387, + "grad_norm": 0.41061439661062427, + "learning_rate": 4.433633084235629e-06, + "loss": 0.5435, + "step": 5016 + }, + { + "epoch": 1.3318730917297226, + "grad_norm": 0.39333883740131115, + "learning_rate": 4.433411769473763e-06, + "loss": 0.5366, + "step": 5017 + }, + { + "epoch": 1.3321385902031064, + "grad_norm": 0.3977195000961838, + "learning_rate": 4.433190417005635e-06, + "loss": 0.5777, + "step": 5018 + }, + { + "epoch": 1.33240408867649, + "grad_norm": 0.39975439408090524, + "learning_rate": 4.4329690268355606e-06, + "loss": 0.6084, + "step": 5019 + }, + { + "epoch": 1.332669587149874, + "grad_norm": 0.40345153781685766, + "learning_rate": 4.432747598967857e-06, + "loss": 0.561, + "step": 5020 + }, + { + "epoch": 1.3329350856232578, + "grad_norm": 0.4031615410430819, + "learning_rate": 4.432526133406843e-06, + "loss": 0.5165, + "step": 5021 + }, + { + "epoch": 1.3332005840966414, + "grad_norm": 0.4126099355864836, + "learning_rate": 4.4323046301568374e-06, + "loss": 0.5437, + "step": 5022 + }, + { + "epoch": 1.3334660825700253, + "grad_norm": 0.42318155647259525, + "learning_rate": 4.432083089222161e-06, + "loss": 0.589, + "step": 5023 + }, + { + "epoch": 1.333731581043409, + "grad_norm": 0.42853829159879614, + "learning_rate": 4.431861510607134e-06, + "loss": 0.5966, + "step": 5024 + }, + { + "epoch": 1.3339970795167928, + "grad_norm": 0.4108754486994434, + "learning_rate": 4.431639894316076e-06, + "loss": 0.5928, + "step": 5025 + }, + { + "epoch": 1.3342625779901764, + "grad_norm": 0.4028532830464729, + "learning_rate": 4.431418240353312e-06, + "loss": 0.6179, + "step": 5026 + }, + { + "epoch": 1.3345280764635603, + "grad_norm": 0.41811562947826264, + "learning_rate": 4.431196548723164e-06, + "loss": 0.6042, + "step": 5027 + }, + { + "epoch": 1.3347935749369442, + "grad_norm": 0.4118665234306777, + "learning_rate": 4.430974819429954e-06, + "loss": 0.5901, + "step": 5028 + }, + { + "epoch": 1.3350590734103278, + "grad_norm": 0.4049832608414355, + "learning_rate": 4.430753052478007e-06, + "loss": 0.5831, + "step": 5029 + }, + { + "epoch": 1.3353245718837117, + "grad_norm": 0.41570376109355267, + "learning_rate": 4.430531247871649e-06, + "loss": 0.6395, + "step": 5030 + }, + { + "epoch": 1.3355900703570955, + "grad_norm": 0.4134855453681411, + "learning_rate": 4.430309405615204e-06, + "loss": 0.6081, + "step": 5031 + }, + { + "epoch": 1.3358555688304792, + "grad_norm": 0.4195997411148105, + "learning_rate": 4.430087525712999e-06, + "loss": 0.6097, + "step": 5032 + }, + { + "epoch": 1.336121067303863, + "grad_norm": 0.39883310142252193, + "learning_rate": 4.429865608169363e-06, + "loss": 0.5666, + "step": 5033 + }, + { + "epoch": 1.336386565777247, + "grad_norm": 0.3978944358633572, + "learning_rate": 4.4296436529886216e-06, + "loss": 0.5701, + "step": 5034 + }, + { + "epoch": 1.3366520642506305, + "grad_norm": 0.4281636841120658, + "learning_rate": 4.429421660175105e-06, + "loss": 0.5882, + "step": 5035 + }, + { + "epoch": 1.3369175627240144, + "grad_norm": 0.3975590630072414, + "learning_rate": 4.429199629733141e-06, + "loss": 0.5765, + "step": 5036 + }, + { + "epoch": 1.337183061197398, + "grad_norm": 0.4062196905582909, + "learning_rate": 4.428977561667062e-06, + "loss": 0.5673, + "step": 5037 + }, + { + "epoch": 1.337448559670782, + "grad_norm": 0.39482637350780103, + "learning_rate": 4.428755455981196e-06, + "loss": 0.5857, + "step": 5038 + }, + { + "epoch": 1.3377140581441656, + "grad_norm": 0.4134638636681826, + "learning_rate": 4.428533312679878e-06, + "loss": 0.6154, + "step": 5039 + }, + { + "epoch": 1.3379795566175494, + "grad_norm": 0.4026259070920607, + "learning_rate": 4.4283111317674375e-06, + "loss": 0.6218, + "step": 5040 + }, + { + "epoch": 1.3382450550909333, + "grad_norm": 0.40904344092777983, + "learning_rate": 4.4280889132482085e-06, + "loss": 0.5972, + "step": 5041 + }, + { + "epoch": 1.338510553564317, + "grad_norm": 0.42274703242304396, + "learning_rate": 4.427866657126525e-06, + "loss": 0.6346, + "step": 5042 + }, + { + "epoch": 1.3387760520377008, + "grad_norm": 0.39950268110867015, + "learning_rate": 4.427644363406721e-06, + "loss": 0.5905, + "step": 5043 + }, + { + "epoch": 1.3390415505110846, + "grad_norm": 0.40976211167144805, + "learning_rate": 4.4274220320931334e-06, + "loss": 0.6348, + "step": 5044 + }, + { + "epoch": 1.3393070489844683, + "grad_norm": 0.4193727489853488, + "learning_rate": 4.427199663190097e-06, + "loss": 0.5917, + "step": 5045 + }, + { + "epoch": 1.3395725474578521, + "grad_norm": 0.41160967623833183, + "learning_rate": 4.426977256701948e-06, + "loss": 0.5844, + "step": 5046 + }, + { + "epoch": 1.339838045931236, + "grad_norm": 0.4356531804738442, + "learning_rate": 4.426754812633024e-06, + "loss": 0.597, + "step": 5047 + }, + { + "epoch": 1.3401035444046197, + "grad_norm": 0.4324168009310734, + "learning_rate": 4.426532330987664e-06, + "loss": 0.5648, + "step": 5048 + }, + { + "epoch": 1.3403690428780035, + "grad_norm": 0.39502224872010605, + "learning_rate": 4.4263098117702075e-06, + "loss": 0.5821, + "step": 5049 + }, + { + "epoch": 1.3406345413513872, + "grad_norm": 0.4133142833839653, + "learning_rate": 4.426087254984992e-06, + "loss": 0.6146, + "step": 5050 + }, + { + "epoch": 1.340900039824771, + "grad_norm": 0.45347721868925234, + "learning_rate": 4.42586466063636e-06, + "loss": 0.5783, + "step": 5051 + }, + { + "epoch": 1.3411655382981547, + "grad_norm": 0.44019193437927234, + "learning_rate": 4.425642028728652e-06, + "loss": 0.6004, + "step": 5052 + }, + { + "epoch": 1.3414310367715385, + "grad_norm": 0.41238131588091564, + "learning_rate": 4.4254193592662085e-06, + "loss": 0.6087, + "step": 5053 + }, + { + "epoch": 1.3416965352449224, + "grad_norm": 0.4083800972192628, + "learning_rate": 4.425196652253374e-06, + "loss": 0.5711, + "step": 5054 + }, + { + "epoch": 1.341962033718306, + "grad_norm": 0.4251367749526427, + "learning_rate": 4.424973907694491e-06, + "loss": 0.6202, + "step": 5055 + }, + { + "epoch": 1.3422275321916899, + "grad_norm": 0.4134648872232151, + "learning_rate": 4.424751125593903e-06, + "loss": 0.5515, + "step": 5056 + }, + { + "epoch": 1.3424930306650737, + "grad_norm": 0.4391411207469455, + "learning_rate": 4.4245283059559565e-06, + "loss": 0.6309, + "step": 5057 + }, + { + "epoch": 1.3427585291384574, + "grad_norm": 0.4114403926087078, + "learning_rate": 4.424305448784995e-06, + "loss": 0.5553, + "step": 5058 + }, + { + "epoch": 1.3430240276118413, + "grad_norm": 0.48383354083900076, + "learning_rate": 4.424082554085366e-06, + "loss": 0.5618, + "step": 5059 + }, + { + "epoch": 1.3432895260852251, + "grad_norm": 0.4000636094710507, + "learning_rate": 4.423859621861417e-06, + "loss": 0.5891, + "step": 5060 + }, + { + "epoch": 1.3435550245586088, + "grad_norm": 0.40022518286199704, + "learning_rate": 4.423636652117494e-06, + "loss": 0.5351, + "step": 5061 + }, + { + "epoch": 1.3438205230319926, + "grad_norm": 0.4221018952087181, + "learning_rate": 4.423413644857947e-06, + "loss": 0.5786, + "step": 5062 + }, + { + "epoch": 1.3440860215053765, + "grad_norm": 0.4124671380774633, + "learning_rate": 4.423190600087124e-06, + "loss": 0.6412, + "step": 5063 + }, + { + "epoch": 1.3443515199787601, + "grad_norm": 0.4258627339237869, + "learning_rate": 4.422967517809377e-06, + "loss": 0.6627, + "step": 5064 + }, + { + "epoch": 1.3446170184521438, + "grad_norm": 0.4131704741384072, + "learning_rate": 4.422744398029054e-06, + "loss": 0.6202, + "step": 5065 + }, + { + "epoch": 1.3448825169255276, + "grad_norm": 0.39329717059421276, + "learning_rate": 4.422521240750507e-06, + "loss": 0.584, + "step": 5066 + }, + { + "epoch": 1.3451480153989115, + "grad_norm": 0.41269799981370214, + "learning_rate": 4.42229804597809e-06, + "loss": 0.5749, + "step": 5067 + }, + { + "epoch": 1.3454135138722951, + "grad_norm": 0.4212370407606538, + "learning_rate": 4.422074813716153e-06, + "loss": 0.579, + "step": 5068 + }, + { + "epoch": 1.345679012345679, + "grad_norm": 0.4210303616633975, + "learning_rate": 4.421851543969052e-06, + "loss": 0.5801, + "step": 5069 + }, + { + "epoch": 1.3459445108190629, + "grad_norm": 0.38243691527271945, + "learning_rate": 4.42162823674114e-06, + "loss": 0.576, + "step": 5070 + }, + { + "epoch": 1.3462100092924465, + "grad_norm": 0.4131632383889058, + "learning_rate": 4.421404892036773e-06, + "loss": 0.556, + "step": 5071 + }, + { + "epoch": 1.3464755077658304, + "grad_norm": 0.39017119518826265, + "learning_rate": 4.421181509860306e-06, + "loss": 0.6197, + "step": 5072 + }, + { + "epoch": 1.3467410062392142, + "grad_norm": 0.40189399659200686, + "learning_rate": 4.420958090216095e-06, + "loss": 0.5918, + "step": 5073 + }, + { + "epoch": 1.3470065047125979, + "grad_norm": 0.4143724895933705, + "learning_rate": 4.420734633108499e-06, + "loss": 0.5741, + "step": 5074 + }, + { + "epoch": 1.3472720031859817, + "grad_norm": 0.4080873981595074, + "learning_rate": 4.420511138541874e-06, + "loss": 0.5559, + "step": 5075 + }, + { + "epoch": 1.3475375016593656, + "grad_norm": 0.4056024414237227, + "learning_rate": 4.4202876065205805e-06, + "loss": 0.5988, + "step": 5076 + }, + { + "epoch": 1.3478030001327492, + "grad_norm": 0.403401918130962, + "learning_rate": 4.4200640370489764e-06, + "loss": 0.6079, + "step": 5077 + }, + { + "epoch": 1.348068498606133, + "grad_norm": 0.39958445292036104, + "learning_rate": 4.419840430131422e-06, + "loss": 0.6197, + "step": 5078 + }, + { + "epoch": 1.3483339970795167, + "grad_norm": 0.41459340641791764, + "learning_rate": 4.419616785772279e-06, + "loss": 0.5922, + "step": 5079 + }, + { + "epoch": 1.3485994955529006, + "grad_norm": 0.43002389749473247, + "learning_rate": 4.4193931039759095e-06, + "loss": 0.6362, + "step": 5080 + }, + { + "epoch": 1.3488649940262842, + "grad_norm": 0.4050876883830849, + "learning_rate": 4.419169384746675e-06, + "loss": 0.624, + "step": 5081 + }, + { + "epoch": 1.349130492499668, + "grad_norm": 0.4051989824459654, + "learning_rate": 4.418945628088937e-06, + "loss": 0.5596, + "step": 5082 + }, + { + "epoch": 1.349395990973052, + "grad_norm": 0.3948948661453604, + "learning_rate": 4.418721834007062e-06, + "loss": 0.5623, + "step": 5083 + }, + { + "epoch": 1.3496614894464356, + "grad_norm": 0.39304582750769324, + "learning_rate": 4.418498002505413e-06, + "loss": 0.5945, + "step": 5084 + }, + { + "epoch": 1.3499269879198195, + "grad_norm": 0.4044159936110484, + "learning_rate": 4.418274133588356e-06, + "loss": 0.5872, + "step": 5085 + }, + { + "epoch": 1.3501924863932033, + "grad_norm": 0.4075175743729038, + "learning_rate": 4.418050227260257e-06, + "loss": 0.5534, + "step": 5086 + }, + { + "epoch": 1.350457984866587, + "grad_norm": 0.40925555442412337, + "learning_rate": 4.417826283525481e-06, + "loss": 0.5932, + "step": 5087 + }, + { + "epoch": 1.3507234833399708, + "grad_norm": 0.38691779675751636, + "learning_rate": 4.417602302388398e-06, + "loss": 0.5976, + "step": 5088 + }, + { + "epoch": 1.3509889818133547, + "grad_norm": 0.41979258821905574, + "learning_rate": 4.417378283853374e-06, + "loss": 0.5946, + "step": 5089 + }, + { + "epoch": 1.3512544802867383, + "grad_norm": 0.39871698278174694, + "learning_rate": 4.4171542279247795e-06, + "loss": 0.5715, + "step": 5090 + }, + { + "epoch": 1.3515199787601222, + "grad_norm": 0.40728571312735545, + "learning_rate": 4.416930134606984e-06, + "loss": 0.62, + "step": 5091 + }, + { + "epoch": 1.3517854772335058, + "grad_norm": 0.4130698278443358, + "learning_rate": 4.416706003904357e-06, + "loss": 0.5696, + "step": 5092 + }, + { + "epoch": 1.3520509757068897, + "grad_norm": 0.459513594535085, + "learning_rate": 4.41648183582127e-06, + "loss": 0.5671, + "step": 5093 + }, + { + "epoch": 1.3523164741802733, + "grad_norm": 0.39243996224763655, + "learning_rate": 4.416257630362095e-06, + "loss": 0.588, + "step": 5094 + }, + { + "epoch": 1.3525819726536572, + "grad_norm": 0.42102018332415797, + "learning_rate": 4.416033387531204e-06, + "loss": 0.5552, + "step": 5095 + }, + { + "epoch": 1.352847471127041, + "grad_norm": 0.4231024813055664, + "learning_rate": 4.4158091073329714e-06, + "loss": 0.6053, + "step": 5096 + }, + { + "epoch": 1.3531129696004247, + "grad_norm": 0.41444342980929527, + "learning_rate": 4.415584789771769e-06, + "loss": 0.5758, + "step": 5097 + }, + { + "epoch": 1.3533784680738086, + "grad_norm": 0.410161314351242, + "learning_rate": 4.415360434851975e-06, + "loss": 0.615, + "step": 5098 + }, + { + "epoch": 1.3536439665471924, + "grad_norm": 0.4054365677030745, + "learning_rate": 4.4151360425779615e-06, + "loss": 0.6066, + "step": 5099 + }, + { + "epoch": 1.353909465020576, + "grad_norm": 0.43503654516844664, + "learning_rate": 4.414911612954107e-06, + "loss": 0.5993, + "step": 5100 + }, + { + "epoch": 1.35417496349396, + "grad_norm": 0.4332953614553006, + "learning_rate": 4.414687145984787e-06, + "loss": 0.6001, + "step": 5101 + }, + { + "epoch": 1.3544404619673438, + "grad_norm": 0.407582719266609, + "learning_rate": 4.41446264167438e-06, + "loss": 0.6012, + "step": 5102 + }, + { + "epoch": 1.3547059604407274, + "grad_norm": 0.4008306934284124, + "learning_rate": 4.414238100027264e-06, + "loss": 0.5881, + "step": 5103 + }, + { + "epoch": 1.3549714589141113, + "grad_norm": 0.4208095853075956, + "learning_rate": 4.414013521047817e-06, + "loss": 0.5485, + "step": 5104 + }, + { + "epoch": 1.355236957387495, + "grad_norm": 0.43405413338415744, + "learning_rate": 4.413788904740422e-06, + "loss": 0.6103, + "step": 5105 + }, + { + "epoch": 1.3555024558608788, + "grad_norm": 0.4024702046819154, + "learning_rate": 4.4135642511094566e-06, + "loss": 0.5844, + "step": 5106 + }, + { + "epoch": 1.3557679543342624, + "grad_norm": 0.3964452290433775, + "learning_rate": 4.413339560159303e-06, + "loss": 0.5837, + "step": 5107 + }, + { + "epoch": 1.3560334528076463, + "grad_norm": 0.4488612374596151, + "learning_rate": 4.413114831894344e-06, + "loss": 0.553, + "step": 5108 + }, + { + "epoch": 1.3562989512810302, + "grad_norm": 0.42676583857580014, + "learning_rate": 4.412890066318961e-06, + "loss": 0.5698, + "step": 5109 + }, + { + "epoch": 1.3565644497544138, + "grad_norm": 0.39670193834007356, + "learning_rate": 4.412665263437538e-06, + "loss": 0.5554, + "step": 5110 + }, + { + "epoch": 1.3568299482277977, + "grad_norm": 0.38910047758471217, + "learning_rate": 4.41244042325446e-06, + "loss": 0.6195, + "step": 5111 + }, + { + "epoch": 1.3570954467011815, + "grad_norm": 0.4736034088407191, + "learning_rate": 4.412215545774111e-06, + "loss": 0.5712, + "step": 5112 + }, + { + "epoch": 1.3573609451745652, + "grad_norm": 0.43170925827530643, + "learning_rate": 4.4119906310008765e-06, + "loss": 0.628, + "step": 5113 + }, + { + "epoch": 1.357626443647949, + "grad_norm": 0.41175074062244466, + "learning_rate": 4.411765678939144e-06, + "loss": 0.5905, + "step": 5114 + }, + { + "epoch": 1.357891942121333, + "grad_norm": 0.41182863150885163, + "learning_rate": 4.4115406895933e-06, + "loss": 0.6135, + "step": 5115 + }, + { + "epoch": 1.3581574405947165, + "grad_norm": 0.39681398882634666, + "learning_rate": 4.411315662967732e-06, + "loss": 0.5708, + "step": 5116 + }, + { + "epoch": 1.3584229390681004, + "grad_norm": 0.4029371644670817, + "learning_rate": 4.411090599066829e-06, + "loss": 0.6264, + "step": 5117 + }, + { + "epoch": 1.3586884375414843, + "grad_norm": 0.42340763942115595, + "learning_rate": 4.410865497894979e-06, + "loss": 0.6171, + "step": 5118 + }, + { + "epoch": 1.358953936014868, + "grad_norm": 0.41988373333397255, + "learning_rate": 4.410640359456575e-06, + "loss": 0.573, + "step": 5119 + }, + { + "epoch": 1.3592194344882516, + "grad_norm": 0.40488482694374284, + "learning_rate": 4.410415183756005e-06, + "loss": 0.5962, + "step": 5120 + }, + { + "epoch": 1.3594849329616354, + "grad_norm": 0.4153589898806118, + "learning_rate": 4.4101899707976615e-06, + "loss": 0.5836, + "step": 5121 + }, + { + "epoch": 1.3597504314350193, + "grad_norm": 0.4108910256101074, + "learning_rate": 4.409964720585937e-06, + "loss": 0.578, + "step": 5122 + }, + { + "epoch": 1.360015929908403, + "grad_norm": 0.40191295331559923, + "learning_rate": 4.409739433125223e-06, + "loss": 0.5719, + "step": 5123 + }, + { + "epoch": 1.3602814283817868, + "grad_norm": 0.4098072327025475, + "learning_rate": 4.4095141084199155e-06, + "loss": 0.5847, + "step": 5124 + }, + { + "epoch": 1.3605469268551706, + "grad_norm": 0.4142253200508539, + "learning_rate": 4.409288746474407e-06, + "loss": 0.6066, + "step": 5125 + }, + { + "epoch": 1.3608124253285543, + "grad_norm": 0.39511170463658823, + "learning_rate": 4.409063347293093e-06, + "loss": 0.5791, + "step": 5126 + }, + { + "epoch": 1.3610779238019381, + "grad_norm": 0.423496425849535, + "learning_rate": 4.40883791088037e-06, + "loss": 0.6097, + "step": 5127 + }, + { + "epoch": 1.361343422275322, + "grad_norm": 0.40114354325139334, + "learning_rate": 4.408612437240633e-06, + "loss": 0.5823, + "step": 5128 + }, + { + "epoch": 1.3616089207487057, + "grad_norm": 0.40411237735566014, + "learning_rate": 4.408386926378282e-06, + "loss": 0.6187, + "step": 5129 + }, + { + "epoch": 1.3618744192220895, + "grad_norm": 0.4004335352018841, + "learning_rate": 4.4081613782977125e-06, + "loss": 0.6027, + "step": 5130 + }, + { + "epoch": 1.3621399176954734, + "grad_norm": 0.41121731232989656, + "learning_rate": 4.407935793003324e-06, + "loss": 0.6067, + "step": 5131 + }, + { + "epoch": 1.362405416168857, + "grad_norm": 0.40741065321744613, + "learning_rate": 4.407710170499517e-06, + "loss": 0.5947, + "step": 5132 + }, + { + "epoch": 1.3626709146422409, + "grad_norm": 0.40662582732184704, + "learning_rate": 4.40748451079069e-06, + "loss": 0.5945, + "step": 5133 + }, + { + "epoch": 1.3629364131156245, + "grad_norm": 0.40149074464183626, + "learning_rate": 4.407258813881244e-06, + "loss": 0.5979, + "step": 5134 + }, + { + "epoch": 1.3632019115890084, + "grad_norm": 0.39889904614270105, + "learning_rate": 4.407033079775583e-06, + "loss": 0.6051, + "step": 5135 + }, + { + "epoch": 1.363467410062392, + "grad_norm": 0.4001566859196351, + "learning_rate": 4.4068073084781075e-06, + "loss": 0.5835, + "step": 5136 + }, + { + "epoch": 1.363732908535776, + "grad_norm": 0.4043102328551009, + "learning_rate": 4.40658149999322e-06, + "loss": 0.5957, + "step": 5137 + }, + { + "epoch": 1.3639984070091598, + "grad_norm": 0.3980156532579141, + "learning_rate": 4.406355654325325e-06, + "loss": 0.5925, + "step": 5138 + }, + { + "epoch": 1.3642639054825434, + "grad_norm": 0.41100335983671743, + "learning_rate": 4.4061297714788275e-06, + "loss": 0.6124, + "step": 5139 + }, + { + "epoch": 1.3645294039559273, + "grad_norm": 0.40137178260222345, + "learning_rate": 4.405903851458133e-06, + "loss": 0.6285, + "step": 5140 + }, + { + "epoch": 1.3647949024293111, + "grad_norm": 0.41479490289086995, + "learning_rate": 4.405677894267646e-06, + "loss": 0.5526, + "step": 5141 + }, + { + "epoch": 1.3650604009026948, + "grad_norm": 0.40607094222167517, + "learning_rate": 4.405451899911775e-06, + "loss": 0.5956, + "step": 5142 + }, + { + "epoch": 1.3653258993760786, + "grad_norm": 0.4124953942465148, + "learning_rate": 4.405225868394926e-06, + "loss": 0.6073, + "step": 5143 + }, + { + "epoch": 1.3655913978494625, + "grad_norm": 0.4122150155113502, + "learning_rate": 4.404999799721508e-06, + "loss": 0.5639, + "step": 5144 + }, + { + "epoch": 1.3658568963228461, + "grad_norm": 0.4088992678580032, + "learning_rate": 4.4047736938959295e-06, + "loss": 0.5794, + "step": 5145 + }, + { + "epoch": 1.36612239479623, + "grad_norm": 0.39888140573982966, + "learning_rate": 4.404547550922601e-06, + "loss": 0.6099, + "step": 5146 + }, + { + "epoch": 1.3663878932696136, + "grad_norm": 0.4042769177880839, + "learning_rate": 4.404321370805931e-06, + "loss": 0.5796, + "step": 5147 + }, + { + "epoch": 1.3666533917429975, + "grad_norm": 0.3997113478974893, + "learning_rate": 4.404095153550332e-06, + "loss": 0.5715, + "step": 5148 + }, + { + "epoch": 1.3669188902163811, + "grad_norm": 0.40237218998174806, + "learning_rate": 4.4038688991602165e-06, + "loss": 0.5729, + "step": 5149 + }, + { + "epoch": 1.367184388689765, + "grad_norm": 0.3963382812760406, + "learning_rate": 4.403642607639995e-06, + "loss": 0.5365, + "step": 5150 + }, + { + "epoch": 1.3674498871631489, + "grad_norm": 0.4024347914345515, + "learning_rate": 4.403416278994081e-06, + "loss": 0.5909, + "step": 5151 + }, + { + "epoch": 1.3677153856365325, + "grad_norm": 0.41489547589206194, + "learning_rate": 4.403189913226891e-06, + "loss": 0.6142, + "step": 5152 + }, + { + "epoch": 1.3679808841099164, + "grad_norm": 0.4144344201689968, + "learning_rate": 4.402963510342837e-06, + "loss": 0.562, + "step": 5153 + }, + { + "epoch": 1.3682463825833002, + "grad_norm": 0.4035158287419469, + "learning_rate": 4.402737070346335e-06, + "loss": 0.5604, + "step": 5154 + }, + { + "epoch": 1.3685118810566839, + "grad_norm": 0.4018852196902614, + "learning_rate": 4.402510593241802e-06, + "loss": 0.6218, + "step": 5155 + }, + { + "epoch": 1.3687773795300677, + "grad_norm": 0.4359921044630224, + "learning_rate": 4.402284079033654e-06, + "loss": 0.5748, + "step": 5156 + }, + { + "epoch": 1.3690428780034516, + "grad_norm": 0.4237437493228621, + "learning_rate": 4.402057527726309e-06, + "loss": 0.5933, + "step": 5157 + }, + { + "epoch": 1.3693083764768352, + "grad_norm": 0.41906126999846177, + "learning_rate": 4.4018309393241845e-06, + "loss": 0.594, + "step": 5158 + }, + { + "epoch": 1.369573874950219, + "grad_norm": 0.40130761670913456, + "learning_rate": 4.401604313831701e-06, + "loss": 0.6085, + "step": 5159 + }, + { + "epoch": 1.3698393734236027, + "grad_norm": 0.4375434043708791, + "learning_rate": 4.4013776512532775e-06, + "loss": 0.5894, + "step": 5160 + }, + { + "epoch": 1.3701048718969866, + "grad_norm": 0.44560980520280286, + "learning_rate": 4.401150951593334e-06, + "loss": 0.5901, + "step": 5161 + }, + { + "epoch": 1.3703703703703702, + "grad_norm": 0.41522144859204435, + "learning_rate": 4.400924214856292e-06, + "loss": 0.6097, + "step": 5162 + }, + { + "epoch": 1.370635868843754, + "grad_norm": 0.41000841916376307, + "learning_rate": 4.4006974410465745e-06, + "loss": 0.6107, + "step": 5163 + }, + { + "epoch": 1.370901367317138, + "grad_norm": 0.43472481653103207, + "learning_rate": 4.400470630168602e-06, + "loss": 0.5831, + "step": 5164 + }, + { + "epoch": 1.3711668657905216, + "grad_norm": 0.4860283206483714, + "learning_rate": 4.4002437822267995e-06, + "loss": 0.5841, + "step": 5165 + }, + { + "epoch": 1.3714323642639055, + "grad_norm": 0.4065099474621937, + "learning_rate": 4.400016897225591e-06, + "loss": 0.5832, + "step": 5166 + }, + { + "epoch": 1.3716978627372893, + "grad_norm": 0.39128508150702834, + "learning_rate": 4.399789975169401e-06, + "loss": 0.593, + "step": 5167 + }, + { + "epoch": 1.371963361210673, + "grad_norm": 0.4068634251361246, + "learning_rate": 4.399563016062654e-06, + "loss": 0.6045, + "step": 5168 + }, + { + "epoch": 1.3722288596840568, + "grad_norm": 0.45924541259772284, + "learning_rate": 4.399336019909778e-06, + "loss": 0.5575, + "step": 5169 + }, + { + "epoch": 1.3724943581574407, + "grad_norm": 0.41493654768151256, + "learning_rate": 4.399108986715199e-06, + "loss": 0.6114, + "step": 5170 + }, + { + "epoch": 1.3727598566308243, + "grad_norm": 0.4035868879952453, + "learning_rate": 4.398881916483345e-06, + "loss": 0.5826, + "step": 5171 + }, + { + "epoch": 1.3730253551042082, + "grad_norm": 0.3975413196561161, + "learning_rate": 4.398654809218644e-06, + "loss": 0.606, + "step": 5172 + }, + { + "epoch": 1.373290853577592, + "grad_norm": 0.4245881673087071, + "learning_rate": 4.398427664925526e-06, + "loss": 0.6202, + "step": 5173 + }, + { + "epoch": 1.3735563520509757, + "grad_norm": 0.43707577390240415, + "learning_rate": 4.398200483608421e-06, + "loss": 0.5882, + "step": 5174 + }, + { + "epoch": 1.3738218505243596, + "grad_norm": 0.4221363096323231, + "learning_rate": 4.397973265271758e-06, + "loss": 0.594, + "step": 5175 + }, + { + "epoch": 1.3740873489977432, + "grad_norm": 0.3882554117741956, + "learning_rate": 4.39774600991997e-06, + "loss": 0.5557, + "step": 5176 + }, + { + "epoch": 1.374352847471127, + "grad_norm": 0.4028669981011816, + "learning_rate": 4.397518717557487e-06, + "loss": 0.5828, + "step": 5177 + }, + { + "epoch": 1.3746183459445107, + "grad_norm": 0.41465818363684664, + "learning_rate": 4.397291388188744e-06, + "loss": 0.6095, + "step": 5178 + }, + { + "epoch": 1.3748838444178946, + "grad_norm": 0.4068593304795824, + "learning_rate": 4.397064021818174e-06, + "loss": 0.5993, + "step": 5179 + }, + { + "epoch": 1.3751493428912784, + "grad_norm": 0.395238036353193, + "learning_rate": 4.39683661845021e-06, + "loss": 0.601, + "step": 5180 + }, + { + "epoch": 1.375414841364662, + "grad_norm": 0.40823737926322445, + "learning_rate": 4.396609178089289e-06, + "loss": 0.6143, + "step": 5181 + }, + { + "epoch": 1.375680339838046, + "grad_norm": 0.41417551719357265, + "learning_rate": 4.3963817007398435e-06, + "loss": 0.592, + "step": 5182 + }, + { + "epoch": 1.3759458383114298, + "grad_norm": 0.41741590668108636, + "learning_rate": 4.396154186406313e-06, + "loss": 0.5865, + "step": 5183 + }, + { + "epoch": 1.3762113367848134, + "grad_norm": 0.3911179426916859, + "learning_rate": 4.395926635093133e-06, + "loss": 0.5533, + "step": 5184 + }, + { + "epoch": 1.3764768352581973, + "grad_norm": 0.3999646422309801, + "learning_rate": 4.395699046804741e-06, + "loss": 0.5747, + "step": 5185 + }, + { + "epoch": 1.3767423337315812, + "grad_norm": 0.4150982840876382, + "learning_rate": 4.3954714215455765e-06, + "loss": 0.5868, + "step": 5186 + }, + { + "epoch": 1.3770078322049648, + "grad_norm": 0.4121907759175896, + "learning_rate": 4.395243759320078e-06, + "loss": 0.5915, + "step": 5187 + }, + { + "epoch": 1.3772733306783487, + "grad_norm": 0.40075637999370006, + "learning_rate": 4.3950160601326865e-06, + "loss": 0.5486, + "step": 5188 + }, + { + "epoch": 1.3775388291517323, + "grad_norm": 0.3960970869949558, + "learning_rate": 4.394788323987842e-06, + "loss": 0.5909, + "step": 5189 + }, + { + "epoch": 1.3778043276251162, + "grad_norm": 0.39499631804329505, + "learning_rate": 4.394560550889986e-06, + "loss": 0.6211, + "step": 5190 + }, + { + "epoch": 1.3780698260984998, + "grad_norm": 0.42576540200857177, + "learning_rate": 4.3943327408435595e-06, + "loss": 0.5977, + "step": 5191 + }, + { + "epoch": 1.3783353245718837, + "grad_norm": 0.40123813178607043, + "learning_rate": 4.394104893853007e-06, + "loss": 0.5716, + "step": 5192 + }, + { + "epoch": 1.3786008230452675, + "grad_norm": 0.4084780525420066, + "learning_rate": 4.393877009922772e-06, + "loss": 0.5905, + "step": 5193 + }, + { + "epoch": 1.3788663215186512, + "grad_norm": 0.41111851101974783, + "learning_rate": 4.393649089057298e-06, + "loss": 0.5996, + "step": 5194 + }, + { + "epoch": 1.379131819992035, + "grad_norm": 0.40484709527646406, + "learning_rate": 4.39342113126103e-06, + "loss": 0.5698, + "step": 5195 + }, + { + "epoch": 1.379397318465419, + "grad_norm": 0.402256232823181, + "learning_rate": 4.393193136538414e-06, + "loss": 0.6086, + "step": 5196 + }, + { + "epoch": 1.3796628169388025, + "grad_norm": 0.409708999293741, + "learning_rate": 4.392965104893897e-06, + "loss": 0.561, + "step": 5197 + }, + { + "epoch": 1.3799283154121864, + "grad_norm": 0.40512371843060374, + "learning_rate": 4.3927370363319255e-06, + "loss": 0.5754, + "step": 5198 + }, + { + "epoch": 1.3801938138855703, + "grad_norm": 0.4159106102704585, + "learning_rate": 4.392508930856948e-06, + "loss": 0.5572, + "step": 5199 + }, + { + "epoch": 1.380459312358954, + "grad_norm": 0.40693072266653496, + "learning_rate": 4.392280788473412e-06, + "loss": 0.6428, + "step": 5200 + }, + { + "epoch": 1.3807248108323378, + "grad_norm": 0.43367558769797493, + "learning_rate": 4.392052609185768e-06, + "loss": 0.5888, + "step": 5201 + }, + { + "epoch": 1.3809903093057214, + "grad_norm": 0.41551843956692247, + "learning_rate": 4.391824392998466e-06, + "loss": 0.6278, + "step": 5202 + }, + { + "epoch": 1.3812558077791053, + "grad_norm": 0.4195867693713215, + "learning_rate": 4.391596139915954e-06, + "loss": 0.5902, + "step": 5203 + }, + { + "epoch": 1.381521306252489, + "grad_norm": 0.41489556331257904, + "learning_rate": 4.391367849942689e-06, + "loss": 0.5602, + "step": 5204 + }, + { + "epoch": 1.3817868047258728, + "grad_norm": 0.3963284122015101, + "learning_rate": 4.391139523083118e-06, + "loss": 0.6444, + "step": 5205 + }, + { + "epoch": 1.3820523031992566, + "grad_norm": 0.40405803538589546, + "learning_rate": 4.390911159341697e-06, + "loss": 0.5545, + "step": 5206 + }, + { + "epoch": 1.3823178016726403, + "grad_norm": 0.42129643175926523, + "learning_rate": 4.390682758722879e-06, + "loss": 0.5969, + "step": 5207 + }, + { + "epoch": 1.3825833001460242, + "grad_norm": 0.40080826826494015, + "learning_rate": 4.390454321231116e-06, + "loss": 0.5926, + "step": 5208 + }, + { + "epoch": 1.382848798619408, + "grad_norm": 0.40771326198367613, + "learning_rate": 4.390225846870867e-06, + "loss": 0.5779, + "step": 5209 + }, + { + "epoch": 1.3831142970927917, + "grad_norm": 0.4083610487619787, + "learning_rate": 4.389997335646586e-06, + "loss": 0.6063, + "step": 5210 + }, + { + "epoch": 1.3833797955661755, + "grad_norm": 0.40602251783352133, + "learning_rate": 4.389768787562728e-06, + "loss": 0.6117, + "step": 5211 + }, + { + "epoch": 1.3836452940395594, + "grad_norm": 0.397016564639049, + "learning_rate": 4.3895402026237525e-06, + "loss": 0.6045, + "step": 5212 + }, + { + "epoch": 1.383910792512943, + "grad_norm": 0.398031226496274, + "learning_rate": 4.389311580834116e-06, + "loss": 0.6, + "step": 5213 + }, + { + "epoch": 1.3841762909863269, + "grad_norm": 0.41573413135455856, + "learning_rate": 4.389082922198279e-06, + "loss": 0.5528, + "step": 5214 + }, + { + "epoch": 1.3844417894597105, + "grad_norm": 0.40069920513836227, + "learning_rate": 4.388854226720699e-06, + "loss": 0.6154, + "step": 5215 + }, + { + "epoch": 1.3847072879330944, + "grad_norm": 0.411186195683949, + "learning_rate": 4.388625494405837e-06, + "loss": 0.612, + "step": 5216 + }, + { + "epoch": 1.384972786406478, + "grad_norm": 0.42290654487713875, + "learning_rate": 4.388396725258154e-06, + "loss": 0.6032, + "step": 5217 + }, + { + "epoch": 1.385238284879862, + "grad_norm": 0.4324741418431151, + "learning_rate": 4.388167919282111e-06, + "loss": 0.621, + "step": 5218 + }, + { + "epoch": 1.3855037833532458, + "grad_norm": 0.40589506391927377, + "learning_rate": 4.387939076482171e-06, + "loss": 0.5726, + "step": 5219 + }, + { + "epoch": 1.3857692818266294, + "grad_norm": 0.40004561454762105, + "learning_rate": 4.387710196862796e-06, + "loss": 0.5743, + "step": 5220 + }, + { + "epoch": 1.3860347803000133, + "grad_norm": 0.4185177741539191, + "learning_rate": 4.387481280428449e-06, + "loss": 0.6067, + "step": 5221 + }, + { + "epoch": 1.3863002787733971, + "grad_norm": 0.39380742461438356, + "learning_rate": 4.387252327183598e-06, + "loss": 0.6017, + "step": 5222 + }, + { + "epoch": 1.3865657772467808, + "grad_norm": 0.3992993598117144, + "learning_rate": 4.387023337132705e-06, + "loss": 0.5929, + "step": 5223 + }, + { + "epoch": 1.3868312757201646, + "grad_norm": 0.4190835212476741, + "learning_rate": 4.386794310280236e-06, + "loss": 0.6207, + "step": 5224 + }, + { + "epoch": 1.3870967741935485, + "grad_norm": 0.41231246584060016, + "learning_rate": 4.386565246630659e-06, + "loss": 0.5732, + "step": 5225 + }, + { + "epoch": 1.3873622726669321, + "grad_norm": 0.3981583509387078, + "learning_rate": 4.3863361461884406e-06, + "loss": 0.5723, + "step": 5226 + }, + { + "epoch": 1.387627771140316, + "grad_norm": 0.4045155093797978, + "learning_rate": 4.386107008958048e-06, + "loss": 0.5672, + "step": 5227 + }, + { + "epoch": 1.3878932696136999, + "grad_norm": 0.406821326535879, + "learning_rate": 4.385877834943952e-06, + "loss": 0.5763, + "step": 5228 + }, + { + "epoch": 1.3881587680870835, + "grad_norm": 0.40400137811210296, + "learning_rate": 4.38564862415062e-06, + "loss": 0.5535, + "step": 5229 + }, + { + "epoch": 1.3884242665604674, + "grad_norm": 0.4306187064563657, + "learning_rate": 4.385419376582523e-06, + "loss": 0.5789, + "step": 5230 + }, + { + "epoch": 1.388689765033851, + "grad_norm": 0.4099761592932349, + "learning_rate": 4.385190092244132e-06, + "loss": 0.5859, + "step": 5231 + }, + { + "epoch": 1.3889552635072349, + "grad_norm": 0.42042871322118036, + "learning_rate": 4.384960771139918e-06, + "loss": 0.5796, + "step": 5232 + }, + { + "epoch": 1.3892207619806185, + "grad_norm": 0.4276004321752673, + "learning_rate": 4.384731413274355e-06, + "loss": 0.6205, + "step": 5233 + }, + { + "epoch": 1.3894862604540024, + "grad_norm": 0.41051195787550787, + "learning_rate": 4.3845020186519134e-06, + "loss": 0.5801, + "step": 5234 + }, + { + "epoch": 1.3897517589273862, + "grad_norm": 0.41765827688835866, + "learning_rate": 4.384272587277069e-06, + "loss": 0.6001, + "step": 5235 + }, + { + "epoch": 1.3900172574007699, + "grad_norm": 0.41303481892769406, + "learning_rate": 4.384043119154296e-06, + "loss": 0.5814, + "step": 5236 + }, + { + "epoch": 1.3902827558741537, + "grad_norm": 0.4088262874845921, + "learning_rate": 4.383813614288068e-06, + "loss": 0.5706, + "step": 5237 + }, + { + "epoch": 1.3905482543475376, + "grad_norm": 0.41649243462503616, + "learning_rate": 4.383584072682864e-06, + "loss": 0.6, + "step": 5238 + }, + { + "epoch": 1.3908137528209212, + "grad_norm": 0.39287278721369573, + "learning_rate": 4.383354494343157e-06, + "loss": 0.6184, + "step": 5239 + }, + { + "epoch": 1.391079251294305, + "grad_norm": 0.3986868006834712, + "learning_rate": 4.383124879273427e-06, + "loss": 0.5915, + "step": 5240 + }, + { + "epoch": 1.391344749767689, + "grad_norm": 0.4106235709483954, + "learning_rate": 4.3828952274781515e-06, + "loss": 0.5924, + "step": 5241 + }, + { + "epoch": 1.3916102482410726, + "grad_norm": 0.40646694030420844, + "learning_rate": 4.382665538961809e-06, + "loss": 0.5819, + "step": 5242 + }, + { + "epoch": 1.3918757467144565, + "grad_norm": 0.4112161565798037, + "learning_rate": 4.382435813728878e-06, + "loss": 0.5801, + "step": 5243 + }, + { + "epoch": 1.39214124518784, + "grad_norm": 0.4049221955762099, + "learning_rate": 4.38220605178384e-06, + "loss": 0.5817, + "step": 5244 + }, + { + "epoch": 1.392406743661224, + "grad_norm": 0.40905885815627957, + "learning_rate": 4.381976253131176e-06, + "loss": 0.5804, + "step": 5245 + }, + { + "epoch": 1.3926722421346076, + "grad_norm": 0.40874363693064714, + "learning_rate": 4.381746417775366e-06, + "loss": 0.5631, + "step": 5246 + }, + { + "epoch": 1.3929377406079915, + "grad_norm": 0.4089193276785924, + "learning_rate": 4.381516545720895e-06, + "loss": 0.5817, + "step": 5247 + }, + { + "epoch": 1.3932032390813753, + "grad_norm": 0.43167715992544775, + "learning_rate": 4.381286636972244e-06, + "loss": 0.5838, + "step": 5248 + }, + { + "epoch": 1.393468737554759, + "grad_norm": 0.39500581663931006, + "learning_rate": 4.381056691533897e-06, + "loss": 0.6011, + "step": 5249 + }, + { + "epoch": 1.3937342360281428, + "grad_norm": 0.4028414347896397, + "learning_rate": 4.38082670941034e-06, + "loss": 0.5703, + "step": 5250 + }, + { + "epoch": 1.3939997345015267, + "grad_norm": 0.4136080586054172, + "learning_rate": 4.380596690606056e-06, + "loss": 0.6054, + "step": 5251 + }, + { + "epoch": 1.3942652329749103, + "grad_norm": 0.40149146172991584, + "learning_rate": 4.380366635125533e-06, + "loss": 0.5902, + "step": 5252 + }, + { + "epoch": 1.3945307314482942, + "grad_norm": 0.416163080127752, + "learning_rate": 4.380136542973256e-06, + "loss": 0.5898, + "step": 5253 + }, + { + "epoch": 1.394796229921678, + "grad_norm": 0.396558188045198, + "learning_rate": 4.379906414153713e-06, + "loss": 0.5887, + "step": 5254 + }, + { + "epoch": 1.3950617283950617, + "grad_norm": 0.42481901318735066, + "learning_rate": 4.379676248671392e-06, + "loss": 0.5691, + "step": 5255 + }, + { + "epoch": 1.3953272268684456, + "grad_norm": 0.4145932863931775, + "learning_rate": 4.3794460465307825e-06, + "loss": 0.5956, + "step": 5256 + }, + { + "epoch": 1.3955927253418292, + "grad_norm": 0.3977345376481052, + "learning_rate": 4.379215807736373e-06, + "loss": 0.5933, + "step": 5257 + }, + { + "epoch": 1.395858223815213, + "grad_norm": 0.40879445235316325, + "learning_rate": 4.3789855322926545e-06, + "loss": 0.598, + "step": 5258 + }, + { + "epoch": 1.3961237222885967, + "grad_norm": 0.40717392334343067, + "learning_rate": 4.378755220204117e-06, + "loss": 0.6068, + "step": 5259 + }, + { + "epoch": 1.3963892207619806, + "grad_norm": 0.3869425074525864, + "learning_rate": 4.378524871475253e-06, + "loss": 0.6149, + "step": 5260 + }, + { + "epoch": 1.3966547192353644, + "grad_norm": 0.40452703949006136, + "learning_rate": 4.3782944861105545e-06, + "loss": 0.5633, + "step": 5261 + }, + { + "epoch": 1.396920217708748, + "grad_norm": 0.41214806232486756, + "learning_rate": 4.378064064114515e-06, + "loss": 0.6269, + "step": 5262 + }, + { + "epoch": 1.397185716182132, + "grad_norm": 0.4178515447823688, + "learning_rate": 4.377833605491629e-06, + "loss": 0.6087, + "step": 5263 + }, + { + "epoch": 1.3974512146555158, + "grad_norm": 0.41279575162953963, + "learning_rate": 4.377603110246388e-06, + "loss": 0.5848, + "step": 5264 + }, + { + "epoch": 1.3977167131288994, + "grad_norm": 0.4054764645833888, + "learning_rate": 4.37737257838329e-06, + "loss": 0.6044, + "step": 5265 + }, + { + "epoch": 1.3979822116022833, + "grad_norm": 0.40834561977882916, + "learning_rate": 4.3771420099068295e-06, + "loss": 0.5615, + "step": 5266 + }, + { + "epoch": 1.3982477100756672, + "grad_norm": 0.40490491799432443, + "learning_rate": 4.376911404821504e-06, + "loss": 0.5945, + "step": 5267 + }, + { + "epoch": 1.3985132085490508, + "grad_norm": 0.413022600097997, + "learning_rate": 4.376680763131811e-06, + "loss": 0.6112, + "step": 5268 + }, + { + "epoch": 1.3987787070224347, + "grad_norm": 0.4089796095265697, + "learning_rate": 4.376450084842249e-06, + "loss": 0.6115, + "step": 5269 + }, + { + "epoch": 1.3990442054958183, + "grad_norm": 0.42262412206284095, + "learning_rate": 4.376219369957315e-06, + "loss": 0.5912, + "step": 5270 + }, + { + "epoch": 1.3993097039692022, + "grad_norm": 0.4304380950985587, + "learning_rate": 4.3759886184815096e-06, + "loss": 0.6099, + "step": 5271 + }, + { + "epoch": 1.3995752024425858, + "grad_norm": 0.3979478574613287, + "learning_rate": 4.375757830419332e-06, + "loss": 0.5859, + "step": 5272 + }, + { + "epoch": 1.3998407009159697, + "grad_norm": 0.40454692791838376, + "learning_rate": 4.375527005775285e-06, + "loss": 0.5864, + "step": 5273 + }, + { + "epoch": 1.4001061993893535, + "grad_norm": 0.4405928165143641, + "learning_rate": 4.37529614455387e-06, + "loss": 0.6055, + "step": 5274 + }, + { + "epoch": 1.4003716978627372, + "grad_norm": 0.42149400487582067, + "learning_rate": 4.375065246759588e-06, + "loss": 0.5874, + "step": 5275 + }, + { + "epoch": 1.400637196336121, + "grad_norm": 0.39690124135541516, + "learning_rate": 4.3748343123969425e-06, + "loss": 0.6018, + "step": 5276 + }, + { + "epoch": 1.400902694809505, + "grad_norm": 0.4101053211535445, + "learning_rate": 4.3746033414704385e-06, + "loss": 0.6074, + "step": 5277 + }, + { + "epoch": 1.4011681932828886, + "grad_norm": 0.41938765628301966, + "learning_rate": 4.374372333984578e-06, + "loss": 0.5746, + "step": 5278 + }, + { + "epoch": 1.4014336917562724, + "grad_norm": 0.4139352872300997, + "learning_rate": 4.374141289943869e-06, + "loss": 0.5996, + "step": 5279 + }, + { + "epoch": 1.4016991902296563, + "grad_norm": 0.4168765139329988, + "learning_rate": 4.373910209352816e-06, + "loss": 0.5917, + "step": 5280 + }, + { + "epoch": 1.40196468870304, + "grad_norm": 0.40484974493577286, + "learning_rate": 4.373679092215925e-06, + "loss": 0.5643, + "step": 5281 + }, + { + "epoch": 1.4022301871764238, + "grad_norm": 0.4067886246181902, + "learning_rate": 4.3734479385377045e-06, + "loss": 0.5952, + "step": 5282 + }, + { + "epoch": 1.4024956856498076, + "grad_norm": 0.4071437567241423, + "learning_rate": 4.373216748322663e-06, + "loss": 0.5963, + "step": 5283 + }, + { + "epoch": 1.4027611841231913, + "grad_norm": 0.4130238143307736, + "learning_rate": 4.372985521575307e-06, + "loss": 0.6022, + "step": 5284 + }, + { + "epoch": 1.4030266825965751, + "grad_norm": 0.4035257034397141, + "learning_rate": 4.372754258300148e-06, + "loss": 0.6177, + "step": 5285 + }, + { + "epoch": 1.4032921810699588, + "grad_norm": 0.4074638211158301, + "learning_rate": 4.3725229585016965e-06, + "loss": 0.5848, + "step": 5286 + }, + { + "epoch": 1.4035576795433427, + "grad_norm": 0.4158302905645629, + "learning_rate": 4.372291622184462e-06, + "loss": 0.5869, + "step": 5287 + }, + { + "epoch": 1.4038231780167263, + "grad_norm": 0.39694763723980575, + "learning_rate": 4.372060249352956e-06, + "loss": 0.5652, + "step": 5288 + }, + { + "epoch": 1.4040886764901102, + "grad_norm": 0.39655098340209066, + "learning_rate": 4.371828840011693e-06, + "loss": 0.583, + "step": 5289 + }, + { + "epoch": 1.404354174963494, + "grad_norm": 0.4091042832231682, + "learning_rate": 4.3715973941651836e-06, + "loss": 0.6206, + "step": 5290 + }, + { + "epoch": 1.4046196734368777, + "grad_norm": 0.4053651809520837, + "learning_rate": 4.371365911817943e-06, + "loss": 0.6026, + "step": 5291 + }, + { + "epoch": 1.4048851719102615, + "grad_norm": 0.40941064842613534, + "learning_rate": 4.371134392974486e-06, + "loss": 0.6361, + "step": 5292 + }, + { + "epoch": 1.4051506703836454, + "grad_norm": 0.415346579813271, + "learning_rate": 4.370902837639325e-06, + "loss": 0.6199, + "step": 5293 + }, + { + "epoch": 1.405416168857029, + "grad_norm": 0.40667694688698347, + "learning_rate": 4.3706712458169795e-06, + "loss": 0.591, + "step": 5294 + }, + { + "epoch": 1.4056816673304129, + "grad_norm": 0.40934496579058594, + "learning_rate": 4.370439617511963e-06, + "loss": 0.6115, + "step": 5295 + }, + { + "epoch": 1.4059471658037968, + "grad_norm": 0.40727328604552504, + "learning_rate": 4.370207952728796e-06, + "loss": 0.6051, + "step": 5296 + }, + { + "epoch": 1.4062126642771804, + "grad_norm": 0.408943781476209, + "learning_rate": 4.369976251471995e-06, + "loss": 0.5886, + "step": 5297 + }, + { + "epoch": 1.4064781627505643, + "grad_norm": 0.41548593915236476, + "learning_rate": 4.369744513746077e-06, + "loss": 0.5629, + "step": 5298 + }, + { + "epoch": 1.406743661223948, + "grad_norm": 0.39960240060407004, + "learning_rate": 4.369512739555564e-06, + "loss": 0.5957, + "step": 5299 + }, + { + "epoch": 1.4070091596973318, + "grad_norm": 0.41430665662236077, + "learning_rate": 4.369280928904975e-06, + "loss": 0.5872, + "step": 5300 + }, + { + "epoch": 1.4072746581707154, + "grad_norm": 0.40361720832664233, + "learning_rate": 4.3690490817988305e-06, + "loss": 0.5963, + "step": 5301 + }, + { + "epoch": 1.4075401566440993, + "grad_norm": 0.4098336163375993, + "learning_rate": 4.368817198241654e-06, + "loss": 0.6168, + "step": 5302 + }, + { + "epoch": 1.4078056551174831, + "grad_norm": 0.40695955961714264, + "learning_rate": 4.368585278237966e-06, + "loss": 0.5861, + "step": 5303 + }, + { + "epoch": 1.4080711535908668, + "grad_norm": 0.4088482339140311, + "learning_rate": 4.368353321792289e-06, + "loss": 0.6039, + "step": 5304 + }, + { + "epoch": 1.4083366520642506, + "grad_norm": 0.40111906182858825, + "learning_rate": 4.368121328909149e-06, + "loss": 0.5584, + "step": 5305 + }, + { + "epoch": 1.4086021505376345, + "grad_norm": 0.3995935637648144, + "learning_rate": 4.367889299593069e-06, + "loss": 0.6107, + "step": 5306 + }, + { + "epoch": 1.4088676490110181, + "grad_norm": 0.4027004276828151, + "learning_rate": 4.367657233848574e-06, + "loss": 0.5606, + "step": 5307 + }, + { + "epoch": 1.409133147484402, + "grad_norm": 0.4055271806434678, + "learning_rate": 4.36742513168019e-06, + "loss": 0.5814, + "step": 5308 + }, + { + "epoch": 1.4093986459577859, + "grad_norm": 0.3986650374895936, + "learning_rate": 4.367192993092443e-06, + "loss": 0.608, + "step": 5309 + }, + { + "epoch": 1.4096641444311695, + "grad_norm": 0.4055650011720386, + "learning_rate": 4.366960818089862e-06, + "loss": 0.5736, + "step": 5310 + }, + { + "epoch": 1.4099296429045534, + "grad_norm": 0.40806863755128475, + "learning_rate": 4.366728606676974e-06, + "loss": 0.563, + "step": 5311 + }, + { + "epoch": 1.410195141377937, + "grad_norm": 0.39639167088312977, + "learning_rate": 4.366496358858307e-06, + "loss": 0.5699, + "step": 5312 + }, + { + "epoch": 1.4104606398513209, + "grad_norm": 0.41173710187012796, + "learning_rate": 4.366264074638391e-06, + "loss": 0.5993, + "step": 5313 + }, + { + "epoch": 1.4107261383247045, + "grad_norm": 0.40364060277847263, + "learning_rate": 4.366031754021757e-06, + "loss": 0.5619, + "step": 5314 + }, + { + "epoch": 1.4109916367980884, + "grad_norm": 0.3945677982795281, + "learning_rate": 4.365799397012934e-06, + "loss": 0.584, + "step": 5315 + }, + { + "epoch": 1.4112571352714722, + "grad_norm": 0.4053811827837774, + "learning_rate": 4.365567003616455e-06, + "loss": 0.6125, + "step": 5316 + }, + { + "epoch": 1.4115226337448559, + "grad_norm": 0.4127711039071285, + "learning_rate": 4.365334573836851e-06, + "loss": 0.5948, + "step": 5317 + }, + { + "epoch": 1.4117881322182397, + "grad_norm": 0.41381557860950585, + "learning_rate": 4.365102107678657e-06, + "loss": 0.5808, + "step": 5318 + }, + { + "epoch": 1.4120536306916236, + "grad_norm": 0.39775087318588587, + "learning_rate": 4.364869605146405e-06, + "loss": 0.5871, + "step": 5319 + }, + { + "epoch": 1.4123191291650072, + "grad_norm": 0.4120491014172842, + "learning_rate": 4.364637066244629e-06, + "loss": 0.5882, + "step": 5320 + }, + { + "epoch": 1.412584627638391, + "grad_norm": 0.40479997226818665, + "learning_rate": 4.364404490977865e-06, + "loss": 0.5641, + "step": 5321 + }, + { + "epoch": 1.412850126111775, + "grad_norm": 0.3963891092680847, + "learning_rate": 4.364171879350649e-06, + "loss": 0.5876, + "step": 5322 + }, + { + "epoch": 1.4131156245851586, + "grad_norm": 0.40607422557176354, + "learning_rate": 4.363939231367517e-06, + "loss": 0.611, + "step": 5323 + }, + { + "epoch": 1.4133811230585425, + "grad_norm": 0.4152950605828711, + "learning_rate": 4.363706547033006e-06, + "loss": 0.6077, + "step": 5324 + }, + { + "epoch": 1.4136466215319263, + "grad_norm": 0.4061887548601726, + "learning_rate": 4.363473826351654e-06, + "loss": 0.605, + "step": 5325 + }, + { + "epoch": 1.41391212000531, + "grad_norm": 0.40335911343303776, + "learning_rate": 4.363241069328001e-06, + "loss": 0.6235, + "step": 5326 + }, + { + "epoch": 1.4141776184786936, + "grad_norm": 0.4178917424227247, + "learning_rate": 4.363008275966584e-06, + "loss": 0.5695, + "step": 5327 + }, + { + "epoch": 1.4144431169520775, + "grad_norm": 0.40552455661420167, + "learning_rate": 4.362775446271944e-06, + "loss": 0.6083, + "step": 5328 + }, + { + "epoch": 1.4147086154254613, + "grad_norm": 0.41041762619987465, + "learning_rate": 4.362542580248622e-06, + "loss": 0.5633, + "step": 5329 + }, + { + "epoch": 1.414974113898845, + "grad_norm": 0.411152773868579, + "learning_rate": 4.36230967790116e-06, + "loss": 0.5839, + "step": 5330 + }, + { + "epoch": 1.4152396123722288, + "grad_norm": 0.393379430056764, + "learning_rate": 4.362076739234099e-06, + "loss": 0.5985, + "step": 5331 + }, + { + "epoch": 1.4155051108456127, + "grad_norm": 0.41925976968021955, + "learning_rate": 4.361843764251983e-06, + "loss": 0.6014, + "step": 5332 + }, + { + "epoch": 1.4157706093189963, + "grad_norm": 0.4162452389113423, + "learning_rate": 4.361610752959355e-06, + "loss": 0.5684, + "step": 5333 + }, + { + "epoch": 1.4160361077923802, + "grad_norm": 0.41766182715221006, + "learning_rate": 4.361377705360758e-06, + "loss": 0.5963, + "step": 5334 + }, + { + "epoch": 1.416301606265764, + "grad_norm": 0.40308721273131726, + "learning_rate": 4.361144621460741e-06, + "loss": 0.5735, + "step": 5335 + }, + { + "epoch": 1.4165671047391477, + "grad_norm": 0.416531654864316, + "learning_rate": 4.360911501263845e-06, + "loss": 0.596, + "step": 5336 + }, + { + "epoch": 1.4168326032125316, + "grad_norm": 0.402475051645599, + "learning_rate": 4.360678344774619e-06, + "loss": 0.5922, + "step": 5337 + }, + { + "epoch": 1.4170981016859154, + "grad_norm": 0.4103172913816098, + "learning_rate": 4.360445151997609e-06, + "loss": 0.5866, + "step": 5338 + }, + { + "epoch": 1.417363600159299, + "grad_norm": 0.4067747266708178, + "learning_rate": 4.360211922937364e-06, + "loss": 0.5763, + "step": 5339 + }, + { + "epoch": 1.417629098632683, + "grad_norm": 0.4303578690942572, + "learning_rate": 4.359978657598432e-06, + "loss": 0.5983, + "step": 5340 + }, + { + "epoch": 1.4178945971060666, + "grad_norm": 0.41425841064424457, + "learning_rate": 4.359745355985361e-06, + "loss": 0.5949, + "step": 5341 + }, + { + "epoch": 1.4181600955794504, + "grad_norm": 0.4121203876653073, + "learning_rate": 4.359512018102703e-06, + "loss": 0.574, + "step": 5342 + }, + { + "epoch": 1.418425594052834, + "grad_norm": 0.40959548419616476, + "learning_rate": 4.359278643955008e-06, + "loss": 0.6094, + "step": 5343 + }, + { + "epoch": 1.418691092526218, + "grad_norm": 0.423015636493266, + "learning_rate": 4.359045233546827e-06, + "loss": 0.5818, + "step": 5344 + }, + { + "epoch": 1.4189565909996018, + "grad_norm": 0.401945569769227, + "learning_rate": 4.3588117868827115e-06, + "loss": 0.6068, + "step": 5345 + }, + { + "epoch": 1.4192220894729854, + "grad_norm": 0.40915017011413707, + "learning_rate": 4.358578303967217e-06, + "loss": 0.597, + "step": 5346 + }, + { + "epoch": 1.4194875879463693, + "grad_norm": 0.40799379446061695, + "learning_rate": 4.358344784804892e-06, + "loss": 0.6024, + "step": 5347 + }, + { + "epoch": 1.4197530864197532, + "grad_norm": 0.42379208375936706, + "learning_rate": 4.358111229400296e-06, + "loss": 0.5782, + "step": 5348 + }, + { + "epoch": 1.4200185848931368, + "grad_norm": 0.41992365656012554, + "learning_rate": 4.357877637757981e-06, + "loss": 0.6092, + "step": 5349 + }, + { + "epoch": 1.4202840833665207, + "grad_norm": 0.4094398581320978, + "learning_rate": 4.357644009882503e-06, + "loss": 0.5828, + "step": 5350 + }, + { + "epoch": 1.4205495818399045, + "grad_norm": 0.4134424558294693, + "learning_rate": 4.357410345778419e-06, + "loss": 0.5951, + "step": 5351 + }, + { + "epoch": 1.4208150803132882, + "grad_norm": 0.40875161300908747, + "learning_rate": 4.357176645450285e-06, + "loss": 0.6119, + "step": 5352 + }, + { + "epoch": 1.421080578786672, + "grad_norm": 0.4196709278126285, + "learning_rate": 4.356942908902659e-06, + "loss": 0.5819, + "step": 5353 + }, + { + "epoch": 1.4213460772600557, + "grad_norm": 0.4088574346368782, + "learning_rate": 4.3567091361401e-06, + "loss": 0.5793, + "step": 5354 + }, + { + "epoch": 1.4216115757334395, + "grad_norm": 0.41445792450823365, + "learning_rate": 4.356475327167167e-06, + "loss": 0.5795, + "step": 5355 + }, + { + "epoch": 1.4218770742068232, + "grad_norm": 0.41088643333676433, + "learning_rate": 4.35624148198842e-06, + "loss": 0.561, + "step": 5356 + }, + { + "epoch": 1.422142572680207, + "grad_norm": 0.40542690902190504, + "learning_rate": 4.356007600608419e-06, + "loss": 0.6155, + "step": 5357 + }, + { + "epoch": 1.422408071153591, + "grad_norm": 0.4297874724677386, + "learning_rate": 4.355773683031725e-06, + "loss": 0.6235, + "step": 5358 + }, + { + "epoch": 1.4226735696269746, + "grad_norm": 0.40599526949998954, + "learning_rate": 4.355539729262901e-06, + "loss": 0.563, + "step": 5359 + }, + { + "epoch": 1.4229390681003584, + "grad_norm": 0.40884888221244214, + "learning_rate": 4.355305739306508e-06, + "loss": 0.5671, + "step": 5360 + }, + { + "epoch": 1.4232045665737423, + "grad_norm": 0.3997016891896433, + "learning_rate": 4.355071713167111e-06, + "loss": 0.5475, + "step": 5361 + }, + { + "epoch": 1.423470065047126, + "grad_norm": 0.402385255102974, + "learning_rate": 4.354837650849275e-06, + "loss": 0.5535, + "step": 5362 + }, + { + "epoch": 1.4237355635205098, + "grad_norm": 0.42051078246384344, + "learning_rate": 4.354603552357562e-06, + "loss": 0.6228, + "step": 5363 + }, + { + "epoch": 1.4240010619938936, + "grad_norm": 0.41455231571053097, + "learning_rate": 4.3543694176965394e-06, + "loss": 0.5993, + "step": 5364 + }, + { + "epoch": 1.4242665604672773, + "grad_norm": 0.41806032887970623, + "learning_rate": 4.354135246870773e-06, + "loss": 0.629, + "step": 5365 + }, + { + "epoch": 1.4245320589406612, + "grad_norm": 0.42407248250406526, + "learning_rate": 4.353901039884829e-06, + "loss": 0.6323, + "step": 5366 + }, + { + "epoch": 1.4247975574140448, + "grad_norm": 0.4024201674732508, + "learning_rate": 4.353666796743276e-06, + "loss": 0.5977, + "step": 5367 + }, + { + "epoch": 1.4250630558874287, + "grad_norm": 0.4036344051520522, + "learning_rate": 4.353432517450682e-06, + "loss": 0.5672, + "step": 5368 + }, + { + "epoch": 1.4253285543608123, + "grad_norm": 0.41543869187316923, + "learning_rate": 4.3531982020116155e-06, + "loss": 0.5912, + "step": 5369 + }, + { + "epoch": 1.4255940528341962, + "grad_norm": 0.40188075825548764, + "learning_rate": 4.352963850430647e-06, + "loss": 0.5869, + "step": 5370 + }, + { + "epoch": 1.42585955130758, + "grad_norm": 0.403358019921609, + "learning_rate": 4.3527294627123464e-06, + "loss": 0.5653, + "step": 5371 + }, + { + "epoch": 1.4261250497809637, + "grad_norm": 0.3935130719948093, + "learning_rate": 4.352495038861285e-06, + "loss": 0.6267, + "step": 5372 + }, + { + "epoch": 1.4263905482543475, + "grad_norm": 0.41265078077525935, + "learning_rate": 4.352260578882035e-06, + "loss": 0.5926, + "step": 5373 + }, + { + "epoch": 1.4266560467277314, + "grad_norm": 0.39721978921417833, + "learning_rate": 4.352026082779168e-06, + "loss": 0.5622, + "step": 5374 + }, + { + "epoch": 1.426921545201115, + "grad_norm": 0.3919134791898912, + "learning_rate": 4.351791550557259e-06, + "loss": 0.6206, + "step": 5375 + }, + { + "epoch": 1.427187043674499, + "grad_norm": 0.39853708208547006, + "learning_rate": 4.35155698222088e-06, + "loss": 0.5795, + "step": 5376 + }, + { + "epoch": 1.4274525421478828, + "grad_norm": 0.4007002271504504, + "learning_rate": 4.351322377774606e-06, + "loss": 0.5688, + "step": 5377 + }, + { + "epoch": 1.4277180406212664, + "grad_norm": 0.394594869417455, + "learning_rate": 4.351087737223013e-06, + "loss": 0.6253, + "step": 5378 + }, + { + "epoch": 1.4279835390946503, + "grad_norm": 0.38511800117446227, + "learning_rate": 4.350853060570678e-06, + "loss": 0.5768, + "step": 5379 + }, + { + "epoch": 1.4282490375680341, + "grad_norm": 0.4077100419432239, + "learning_rate": 4.350618347822175e-06, + "loss": 0.6031, + "step": 5380 + }, + { + "epoch": 1.4285145360414178, + "grad_norm": 0.40662903595869726, + "learning_rate": 4.350383598982084e-06, + "loss": 0.6137, + "step": 5381 + }, + { + "epoch": 1.4287800345148016, + "grad_norm": 0.4181155945222628, + "learning_rate": 4.350148814054982e-06, + "loss": 0.6298, + "step": 5382 + }, + { + "epoch": 1.4290455329881853, + "grad_norm": 0.4241685652598452, + "learning_rate": 4.349913993045448e-06, + "loss": 0.617, + "step": 5383 + }, + { + "epoch": 1.4293110314615691, + "grad_norm": 0.41344417837681113, + "learning_rate": 4.349679135958062e-06, + "loss": 0.6182, + "step": 5384 + }, + { + "epoch": 1.4295765299349528, + "grad_norm": 0.4186245162057323, + "learning_rate": 4.349444242797406e-06, + "loss": 0.5878, + "step": 5385 + }, + { + "epoch": 1.4298420284083366, + "grad_norm": 0.4203660862473339, + "learning_rate": 4.349209313568057e-06, + "loss": 0.5846, + "step": 5386 + }, + { + "epoch": 1.4301075268817205, + "grad_norm": 0.4118107730038185, + "learning_rate": 4.348974348274599e-06, + "loss": 0.6418, + "step": 5387 + }, + { + "epoch": 1.4303730253551041, + "grad_norm": 0.4235576926585897, + "learning_rate": 4.348739346921613e-06, + "loss": 0.5852, + "step": 5388 + }, + { + "epoch": 1.430638523828488, + "grad_norm": 0.3964713478502504, + "learning_rate": 4.348504309513685e-06, + "loss": 0.5905, + "step": 5389 + }, + { + "epoch": 1.4309040223018719, + "grad_norm": 0.4072504004544051, + "learning_rate": 4.348269236055396e-06, + "loss": 0.5795, + "step": 5390 + }, + { + "epoch": 1.4311695207752555, + "grad_norm": 0.39577504724901286, + "learning_rate": 4.348034126551333e-06, + "loss": 0.59, + "step": 5391 + }, + { + "epoch": 1.4314350192486394, + "grad_norm": 0.4066690657217721, + "learning_rate": 4.347798981006078e-06, + "loss": 0.6043, + "step": 5392 + }, + { + "epoch": 1.4317005177220232, + "grad_norm": 0.39525308153088334, + "learning_rate": 4.347563799424219e-06, + "loss": 0.563, + "step": 5393 + }, + { + "epoch": 1.4319660161954069, + "grad_norm": 0.3941996342086955, + "learning_rate": 4.347328581810343e-06, + "loss": 0.5973, + "step": 5394 + }, + { + "epoch": 1.4322315146687907, + "grad_norm": 0.3954181035257636, + "learning_rate": 4.347093328169036e-06, + "loss": 0.5797, + "step": 5395 + }, + { + "epoch": 1.4324970131421744, + "grad_norm": 0.40958793499614976, + "learning_rate": 4.346858038504888e-06, + "loss": 0.604, + "step": 5396 + }, + { + "epoch": 1.4327625116155582, + "grad_norm": 0.4125830091648663, + "learning_rate": 4.3466227128224845e-06, + "loss": 0.6023, + "step": 5397 + }, + { + "epoch": 1.4330280100889419, + "grad_norm": 0.41549334230852086, + "learning_rate": 4.346387351126418e-06, + "loss": 0.6011, + "step": 5398 + }, + { + "epoch": 1.4332935085623257, + "grad_norm": 0.3952119580353629, + "learning_rate": 4.3461519534212774e-06, + "loss": 0.5814, + "step": 5399 + }, + { + "epoch": 1.4335590070357096, + "grad_norm": 0.40641000231135405, + "learning_rate": 4.345916519711653e-06, + "loss": 0.5883, + "step": 5400 + }, + { + "epoch": 1.4338245055090932, + "grad_norm": 0.3958097134277731, + "learning_rate": 4.3456810500021365e-06, + "loss": 0.5672, + "step": 5401 + }, + { + "epoch": 1.434090003982477, + "grad_norm": 0.41847262900874216, + "learning_rate": 4.345445544297321e-06, + "loss": 0.5684, + "step": 5402 + }, + { + "epoch": 1.434355502455861, + "grad_norm": 0.40777697246168537, + "learning_rate": 4.3452100026018e-06, + "loss": 0.6054, + "step": 5403 + }, + { + "epoch": 1.4346210009292446, + "grad_norm": 0.41588291312127035, + "learning_rate": 4.344974424920164e-06, + "loss": 0.5764, + "step": 5404 + }, + { + "epoch": 1.4348864994026285, + "grad_norm": 0.3930706260191455, + "learning_rate": 4.344738811257011e-06, + "loss": 0.5686, + "step": 5405 + }, + { + "epoch": 1.4351519978760123, + "grad_norm": 0.40908641818940145, + "learning_rate": 4.344503161616934e-06, + "loss": 0.5711, + "step": 5406 + }, + { + "epoch": 1.435417496349396, + "grad_norm": 0.399902104508832, + "learning_rate": 4.344267476004529e-06, + "loss": 0.5829, + "step": 5407 + }, + { + "epoch": 1.4356829948227798, + "grad_norm": 0.40441707037823643, + "learning_rate": 4.344031754424394e-06, + "loss": 0.5973, + "step": 5408 + }, + { + "epoch": 1.4359484932961635, + "grad_norm": 0.3972826785019991, + "learning_rate": 4.343795996881123e-06, + "loss": 0.5508, + "step": 5409 + }, + { + "epoch": 1.4362139917695473, + "grad_norm": 0.41285746563624176, + "learning_rate": 4.3435602033793166e-06, + "loss": 0.5814, + "step": 5410 + }, + { + "epoch": 1.436479490242931, + "grad_norm": 0.4113427960837754, + "learning_rate": 4.343324373923572e-06, + "loss": 0.5656, + "step": 5411 + }, + { + "epoch": 1.4367449887163148, + "grad_norm": 0.3936720183865051, + "learning_rate": 4.343088508518489e-06, + "loss": 0.5762, + "step": 5412 + }, + { + "epoch": 1.4370104871896987, + "grad_norm": 0.40742200774348103, + "learning_rate": 4.342852607168668e-06, + "loss": 0.5954, + "step": 5413 + }, + { + "epoch": 1.4372759856630823, + "grad_norm": 0.4080860404982624, + "learning_rate": 4.3426166698787085e-06, + "loss": 0.5944, + "step": 5414 + }, + { + "epoch": 1.4375414841364662, + "grad_norm": 0.40244449202505966, + "learning_rate": 4.342380696653212e-06, + "loss": 0.5952, + "step": 5415 + }, + { + "epoch": 1.43780698260985, + "grad_norm": 0.4039381803811445, + "learning_rate": 4.342144687496782e-06, + "loss": 0.5686, + "step": 5416 + }, + { + "epoch": 1.4380724810832337, + "grad_norm": 0.41499291111739744, + "learning_rate": 4.34190864241402e-06, + "loss": 0.5945, + "step": 5417 + }, + { + "epoch": 1.4383379795566176, + "grad_norm": 0.4177973178536254, + "learning_rate": 4.34167256140953e-06, + "loss": 0.5785, + "step": 5418 + }, + { + "epoch": 1.4386034780300014, + "grad_norm": 0.4128393275701462, + "learning_rate": 4.341436444487915e-06, + "loss": 0.6134, + "step": 5419 + }, + { + "epoch": 1.438868976503385, + "grad_norm": 0.41300285253860947, + "learning_rate": 4.341200291653781e-06, + "loss": 0.5908, + "step": 5420 + }, + { + "epoch": 1.439134474976769, + "grad_norm": 0.40712441379318276, + "learning_rate": 4.340964102911734e-06, + "loss": 0.6207, + "step": 5421 + }, + { + "epoch": 1.4393999734501526, + "grad_norm": 0.4185747047257086, + "learning_rate": 4.340727878266378e-06, + "loss": 0.6129, + "step": 5422 + }, + { + "epoch": 1.4396654719235364, + "grad_norm": 0.40830765602805963, + "learning_rate": 4.340491617722322e-06, + "loss": 0.636, + "step": 5423 + }, + { + "epoch": 1.43993097039692, + "grad_norm": 0.4026988027973634, + "learning_rate": 4.340255321284174e-06, + "loss": 0.5891, + "step": 5424 + }, + { + "epoch": 1.440196468870304, + "grad_norm": 0.405317914422975, + "learning_rate": 4.340018988956541e-06, + "loss": 0.5918, + "step": 5425 + }, + { + "epoch": 1.4404619673436878, + "grad_norm": 0.41181399368716043, + "learning_rate": 4.339782620744033e-06, + "loss": 0.5947, + "step": 5426 + }, + { + "epoch": 1.4407274658170715, + "grad_norm": 0.4039368767437816, + "learning_rate": 4.3395462166512596e-06, + "loss": 0.5927, + "step": 5427 + }, + { + "epoch": 1.4409929642904553, + "grad_norm": 0.40637221806875323, + "learning_rate": 4.33930977668283e-06, + "loss": 0.598, + "step": 5428 + }, + { + "epoch": 1.4412584627638392, + "grad_norm": 0.4178281724738597, + "learning_rate": 4.339073300843356e-06, + "loss": 0.6057, + "step": 5429 + }, + { + "epoch": 1.4415239612372228, + "grad_norm": 0.39391549625748806, + "learning_rate": 4.338836789137451e-06, + "loss": 0.5723, + "step": 5430 + }, + { + "epoch": 1.4417894597106067, + "grad_norm": 0.40899828343454736, + "learning_rate": 4.338600241569726e-06, + "loss": 0.608, + "step": 5431 + }, + { + "epoch": 1.4420549581839905, + "grad_norm": 0.40565685463235096, + "learning_rate": 4.338363658144794e-06, + "loss": 0.595, + "step": 5432 + }, + { + "epoch": 1.4423204566573742, + "grad_norm": 0.4118582770191195, + "learning_rate": 4.33812703886727e-06, + "loss": 0.6299, + "step": 5433 + }, + { + "epoch": 1.442585955130758, + "grad_norm": 0.42700345287573316, + "learning_rate": 4.337890383741769e-06, + "loss": 0.6084, + "step": 5434 + }, + { + "epoch": 1.442851453604142, + "grad_norm": 0.40277256222908836, + "learning_rate": 4.337653692772904e-06, + "loss": 0.5807, + "step": 5435 + }, + { + "epoch": 1.4431169520775255, + "grad_norm": 0.40443995077301803, + "learning_rate": 4.337416965965294e-06, + "loss": 0.6309, + "step": 5436 + }, + { + "epoch": 1.4433824505509094, + "grad_norm": 0.3951468780229571, + "learning_rate": 4.337180203323553e-06, + "loss": 0.5886, + "step": 5437 + }, + { + "epoch": 1.443647949024293, + "grad_norm": 0.41984358426849466, + "learning_rate": 4.336943404852301e-06, + "loss": 0.6055, + "step": 5438 + }, + { + "epoch": 1.443913447497677, + "grad_norm": 0.40774160937653453, + "learning_rate": 4.336706570556155e-06, + "loss": 0.5817, + "step": 5439 + }, + { + "epoch": 1.4441789459710606, + "grad_norm": 0.407572516765919, + "learning_rate": 4.3364697004397334e-06, + "loss": 0.5862, + "step": 5440 + }, + { + "epoch": 1.4444444444444444, + "grad_norm": 0.43177515256798255, + "learning_rate": 4.336232794507657e-06, + "loss": 0.6095, + "step": 5441 + }, + { + "epoch": 1.4447099429178283, + "grad_norm": 0.4173660347566688, + "learning_rate": 4.335995852764544e-06, + "loss": 0.6126, + "step": 5442 + }, + { + "epoch": 1.444975441391212, + "grad_norm": 0.4181341854904306, + "learning_rate": 4.335758875215018e-06, + "loss": 0.5978, + "step": 5443 + }, + { + "epoch": 1.4452409398645958, + "grad_norm": 0.4133611250793086, + "learning_rate": 4.335521861863698e-06, + "loss": 0.5723, + "step": 5444 + }, + { + "epoch": 1.4455064383379796, + "grad_norm": 0.411445085282278, + "learning_rate": 4.335284812715208e-06, + "loss": 0.5939, + "step": 5445 + }, + { + "epoch": 1.4457719368113633, + "grad_norm": 0.409531687595904, + "learning_rate": 4.335047727774171e-06, + "loss": 0.6025, + "step": 5446 + }, + { + "epoch": 1.4460374352847472, + "grad_norm": 0.4041967861213368, + "learning_rate": 4.334810607045209e-06, + "loss": 0.5762, + "step": 5447 + }, + { + "epoch": 1.446302933758131, + "grad_norm": 0.3916044838431704, + "learning_rate": 4.334573450532949e-06, + "loss": 0.5749, + "step": 5448 + }, + { + "epoch": 1.4465684322315147, + "grad_norm": 0.39981302592637735, + "learning_rate": 4.3343362582420144e-06, + "loss": 0.5651, + "step": 5449 + }, + { + "epoch": 1.4468339307048985, + "grad_norm": 0.4107400754193761, + "learning_rate": 4.334099030177032e-06, + "loss": 0.6287, + "step": 5450 + }, + { + "epoch": 1.4470994291782822, + "grad_norm": 0.4104590770130155, + "learning_rate": 4.333861766342627e-06, + "loss": 0.6355, + "step": 5451 + }, + { + "epoch": 1.447364927651666, + "grad_norm": 0.4166844764771257, + "learning_rate": 4.333624466743429e-06, + "loss": 0.5715, + "step": 5452 + }, + { + "epoch": 1.4476304261250497, + "grad_norm": 0.39199725133534785, + "learning_rate": 4.333387131384063e-06, + "loss": 0.5926, + "step": 5453 + }, + { + "epoch": 1.4478959245984335, + "grad_norm": 0.3911890903739129, + "learning_rate": 4.333149760269159e-06, + "loss": 0.582, + "step": 5454 + }, + { + "epoch": 1.4481614230718174, + "grad_norm": 0.3961386406296229, + "learning_rate": 4.332912353403348e-06, + "loss": 0.5566, + "step": 5455 + }, + { + "epoch": 1.448426921545201, + "grad_norm": 0.4071531075080342, + "learning_rate": 4.3326749107912565e-06, + "loss": 0.6022, + "step": 5456 + }, + { + "epoch": 1.448692420018585, + "grad_norm": 0.416994450285505, + "learning_rate": 4.332437432437518e-06, + "loss": 0.5732, + "step": 5457 + }, + { + "epoch": 1.4489579184919688, + "grad_norm": 0.41562233857098557, + "learning_rate": 4.332199918346763e-06, + "loss": 0.5871, + "step": 5458 + }, + { + "epoch": 1.4492234169653524, + "grad_norm": 0.42038990433640216, + "learning_rate": 4.331962368523623e-06, + "loss": 0.6306, + "step": 5459 + }, + { + "epoch": 1.4494889154387363, + "grad_norm": 0.4024305790045775, + "learning_rate": 4.331724782972732e-06, + "loss": 0.563, + "step": 5460 + }, + { + "epoch": 1.4497544139121201, + "grad_norm": 0.4188294426436788, + "learning_rate": 4.331487161698723e-06, + "loss": 0.6077, + "step": 5461 + }, + { + "epoch": 1.4500199123855038, + "grad_norm": 0.392663338312878, + "learning_rate": 4.33124950470623e-06, + "loss": 0.5897, + "step": 5462 + }, + { + "epoch": 1.4502854108588876, + "grad_norm": 0.4125620926323638, + "learning_rate": 4.331011811999887e-06, + "loss": 0.5943, + "step": 5463 + }, + { + "epoch": 1.4505509093322713, + "grad_norm": 0.4097834564106724, + "learning_rate": 4.330774083584332e-06, + "loss": 0.5922, + "step": 5464 + }, + { + "epoch": 1.4508164078056551, + "grad_norm": 0.42159881845540426, + "learning_rate": 4.330536319464198e-06, + "loss": 0.5948, + "step": 5465 + }, + { + "epoch": 1.4510819062790388, + "grad_norm": 0.4230387523286175, + "learning_rate": 4.330298519644126e-06, + "loss": 0.5959, + "step": 5466 + }, + { + "epoch": 1.4513474047524226, + "grad_norm": 0.4168478177364035, + "learning_rate": 4.330060684128751e-06, + "loss": 0.5926, + "step": 5467 + }, + { + "epoch": 1.4516129032258065, + "grad_norm": 0.41229228265412776, + "learning_rate": 4.32982281292271e-06, + "loss": 0.6202, + "step": 5468 + }, + { + "epoch": 1.4518784016991901, + "grad_norm": 0.40178209280550553, + "learning_rate": 4.329584906030646e-06, + "loss": 0.5662, + "step": 5469 + }, + { + "epoch": 1.452143900172574, + "grad_norm": 0.3980591501056759, + "learning_rate": 4.329346963457196e-06, + "loss": 0.593, + "step": 5470 + }, + { + "epoch": 1.4524093986459579, + "grad_norm": 0.40434182895856163, + "learning_rate": 4.329108985207002e-06, + "loss": 0.5754, + "step": 5471 + }, + { + "epoch": 1.4526748971193415, + "grad_norm": 0.39006608758269024, + "learning_rate": 4.328870971284703e-06, + "loss": 0.561, + "step": 5472 + }, + { + "epoch": 1.4529403955927254, + "grad_norm": 0.39959009725792033, + "learning_rate": 4.328632921694944e-06, + "loss": 0.5972, + "step": 5473 + }, + { + "epoch": 1.4532058940661092, + "grad_norm": 0.4138017002470512, + "learning_rate": 4.328394836442364e-06, + "loss": 0.5417, + "step": 5474 + }, + { + "epoch": 1.4534713925394929, + "grad_norm": 0.40348164169717077, + "learning_rate": 4.328156715531608e-06, + "loss": 0.5726, + "step": 5475 + }, + { + "epoch": 1.4537368910128767, + "grad_norm": 0.39324190531619213, + "learning_rate": 4.327918558967321e-06, + "loss": 0.5981, + "step": 5476 + }, + { + "epoch": 1.4540023894862604, + "grad_norm": 0.4144204931628961, + "learning_rate": 4.327680366754147e-06, + "loss": 0.5924, + "step": 5477 + }, + { + "epoch": 1.4542678879596442, + "grad_norm": 0.41712205163051763, + "learning_rate": 4.3274421388967305e-06, + "loss": 0.6086, + "step": 5478 + }, + { + "epoch": 1.4545333864330279, + "grad_norm": 0.42511867942034154, + "learning_rate": 4.327203875399717e-06, + "loss": 0.6086, + "step": 5479 + }, + { + "epoch": 1.4547988849064117, + "grad_norm": 0.39937049794595736, + "learning_rate": 4.326965576267755e-06, + "loss": 0.5834, + "step": 5480 + }, + { + "epoch": 1.4550643833797956, + "grad_norm": 0.3973577042092701, + "learning_rate": 4.32672724150549e-06, + "loss": 0.596, + "step": 5481 + }, + { + "epoch": 1.4553298818531792, + "grad_norm": 0.4013106328113882, + "learning_rate": 4.326488871117572e-06, + "loss": 0.6089, + "step": 5482 + }, + { + "epoch": 1.455595380326563, + "grad_norm": 0.4039276485466298, + "learning_rate": 4.3262504651086495e-06, + "loss": 0.5913, + "step": 5483 + }, + { + "epoch": 1.455860878799947, + "grad_norm": 0.3995375908847534, + "learning_rate": 4.326012023483371e-06, + "loss": 0.5787, + "step": 5484 + }, + { + "epoch": 1.4561263772733306, + "grad_norm": 0.4263769137357294, + "learning_rate": 4.325773546246387e-06, + "loss": 0.5842, + "step": 5485 + }, + { + "epoch": 1.4563918757467145, + "grad_norm": 0.41494019247186775, + "learning_rate": 4.325535033402349e-06, + "loss": 0.6301, + "step": 5486 + }, + { + "epoch": 1.4566573742200983, + "grad_norm": 0.404144692901755, + "learning_rate": 4.325296484955908e-06, + "loss": 0.5614, + "step": 5487 + }, + { + "epoch": 1.456922872693482, + "grad_norm": 0.41933221855008823, + "learning_rate": 4.325057900911716e-06, + "loss": 0.5875, + "step": 5488 + }, + { + "epoch": 1.4571883711668658, + "grad_norm": 0.4167875042132777, + "learning_rate": 4.324819281274427e-06, + "loss": 0.5988, + "step": 5489 + }, + { + "epoch": 1.4574538696402497, + "grad_norm": 0.39476948305622933, + "learning_rate": 4.324580626048694e-06, + "loss": 0.6044, + "step": 5490 + }, + { + "epoch": 1.4577193681136333, + "grad_norm": 0.40857762026441824, + "learning_rate": 4.324341935239171e-06, + "loss": 0.6044, + "step": 5491 + }, + { + "epoch": 1.4579848665870172, + "grad_norm": 0.4049614863481209, + "learning_rate": 4.324103208850514e-06, + "loss": 0.5639, + "step": 5492 + }, + { + "epoch": 1.4582503650604008, + "grad_norm": 0.41736952126280746, + "learning_rate": 4.323864446887378e-06, + "loss": 0.5811, + "step": 5493 + }, + { + "epoch": 1.4585158635337847, + "grad_norm": 0.4160546325409148, + "learning_rate": 4.32362564935442e-06, + "loss": 0.5786, + "step": 5494 + }, + { + "epoch": 1.4587813620071683, + "grad_norm": 0.4030609696339856, + "learning_rate": 4.323386816256297e-06, + "loss": 0.6011, + "step": 5495 + }, + { + "epoch": 1.4590468604805522, + "grad_norm": 0.4117288046483035, + "learning_rate": 4.323147947597667e-06, + "loss": 0.5981, + "step": 5496 + }, + { + "epoch": 1.459312358953936, + "grad_norm": 0.4067457796775077, + "learning_rate": 4.322909043383186e-06, + "loss": 0.5866, + "step": 5497 + }, + { + "epoch": 1.4595778574273197, + "grad_norm": 0.4077089464034527, + "learning_rate": 4.322670103617518e-06, + "loss": 0.6138, + "step": 5498 + }, + { + "epoch": 1.4598433559007036, + "grad_norm": 0.41988084076100224, + "learning_rate": 4.322431128305319e-06, + "loss": 0.5707, + "step": 5499 + }, + { + "epoch": 1.4601088543740874, + "grad_norm": 0.4036926688403312, + "learning_rate": 4.322192117451252e-06, + "loss": 0.5925, + "step": 5500 + }, + { + "epoch": 1.460374352847471, + "grad_norm": 0.40916019952040755, + "learning_rate": 4.321953071059976e-06, + "loss": 0.582, + "step": 5501 + }, + { + "epoch": 1.460639851320855, + "grad_norm": 0.4110460054913804, + "learning_rate": 4.321713989136154e-06, + "loss": 0.5838, + "step": 5502 + }, + { + "epoch": 1.4609053497942388, + "grad_norm": 0.41757126431902813, + "learning_rate": 4.321474871684449e-06, + "loss": 0.5882, + "step": 5503 + }, + { + "epoch": 1.4611708482676224, + "grad_norm": 0.42378763710163636, + "learning_rate": 4.321235718709525e-06, + "loss": 0.6033, + "step": 5504 + }, + { + "epoch": 1.4614363467410063, + "grad_norm": 0.4086764788524873, + "learning_rate": 4.320996530216045e-06, + "loss": 0.5869, + "step": 5505 + }, + { + "epoch": 1.46170184521439, + "grad_norm": 0.4049745795585868, + "learning_rate": 4.320757306208674e-06, + "loss": 0.5654, + "step": 5506 + }, + { + "epoch": 1.4619673436877738, + "grad_norm": 0.3941631829128186, + "learning_rate": 4.3205180466920784e-06, + "loss": 0.5845, + "step": 5507 + }, + { + "epoch": 1.4622328421611575, + "grad_norm": 0.411320031751828, + "learning_rate": 4.320278751670922e-06, + "loss": 0.5668, + "step": 5508 + }, + { + "epoch": 1.4624983406345413, + "grad_norm": 0.4240298495466126, + "learning_rate": 4.320039421149874e-06, + "loss": 0.6176, + "step": 5509 + }, + { + "epoch": 1.4627638391079252, + "grad_norm": 0.42734390919595755, + "learning_rate": 4.319800055133602e-06, + "loss": 0.5973, + "step": 5510 + }, + { + "epoch": 1.4630293375813088, + "grad_norm": 0.4011125372527518, + "learning_rate": 4.319560653626772e-06, + "loss": 0.5421, + "step": 5511 + }, + { + "epoch": 1.4632948360546927, + "grad_norm": 0.40815814109542436, + "learning_rate": 4.3193212166340535e-06, + "loss": 0.5808, + "step": 5512 + }, + { + "epoch": 1.4635603345280765, + "grad_norm": 0.40871228740343035, + "learning_rate": 4.319081744160119e-06, + "loss": 0.5859, + "step": 5513 + }, + { + "epoch": 1.4638258330014602, + "grad_norm": 0.4074011815085495, + "learning_rate": 4.318842236209635e-06, + "loss": 0.6056, + "step": 5514 + }, + { + "epoch": 1.464091331474844, + "grad_norm": 0.4052406086605638, + "learning_rate": 4.3186026927872736e-06, + "loss": 0.6182, + "step": 5515 + }, + { + "epoch": 1.464356829948228, + "grad_norm": 0.4008152565339012, + "learning_rate": 4.318363113897709e-06, + "loss": 0.5776, + "step": 5516 + }, + { + "epoch": 1.4646223284216116, + "grad_norm": 0.4180290683632047, + "learning_rate": 4.31812349954561e-06, + "loss": 0.5684, + "step": 5517 + }, + { + "epoch": 1.4648878268949954, + "grad_norm": 0.4151488415050085, + "learning_rate": 4.3178838497356515e-06, + "loss": 0.5506, + "step": 5518 + }, + { + "epoch": 1.465153325368379, + "grad_norm": 0.40188978367624584, + "learning_rate": 4.317644164472508e-06, + "loss": 0.6391, + "step": 5519 + }, + { + "epoch": 1.465418823841763, + "grad_norm": 0.40669239146877795, + "learning_rate": 4.317404443760851e-06, + "loss": 0.6353, + "step": 5520 + }, + { + "epoch": 1.4656843223151466, + "grad_norm": 0.42104072195555886, + "learning_rate": 4.317164687605359e-06, + "loss": 0.6036, + "step": 5521 + }, + { + "epoch": 1.4659498207885304, + "grad_norm": 0.399939047938581, + "learning_rate": 4.3169248960107054e-06, + "loss": 0.5626, + "step": 5522 + }, + { + "epoch": 1.4662153192619143, + "grad_norm": 0.4260673896839088, + "learning_rate": 4.316685068981569e-06, + "loss": 0.5985, + "step": 5523 + }, + { + "epoch": 1.466480817735298, + "grad_norm": 0.40033047321187815, + "learning_rate": 4.316445206522625e-06, + "loss": 0.5901, + "step": 5524 + }, + { + "epoch": 1.4667463162086818, + "grad_norm": 0.4163852826881481, + "learning_rate": 4.3162053086385515e-06, + "loss": 0.6048, + "step": 5525 + }, + { + "epoch": 1.4670118146820657, + "grad_norm": 0.41807279093668753, + "learning_rate": 4.315965375334029e-06, + "loss": 0.6199, + "step": 5526 + }, + { + "epoch": 1.4672773131554493, + "grad_norm": 0.39260821331302076, + "learning_rate": 4.315725406613735e-06, + "loss": 0.5873, + "step": 5527 + }, + { + "epoch": 1.4675428116288332, + "grad_norm": 0.4226695052785415, + "learning_rate": 4.315485402482349e-06, + "loss": 0.5824, + "step": 5528 + }, + { + "epoch": 1.467808310102217, + "grad_norm": 0.4210150617194616, + "learning_rate": 4.315245362944553e-06, + "loss": 0.6075, + "step": 5529 + }, + { + "epoch": 1.4680738085756007, + "grad_norm": 0.4140100363396281, + "learning_rate": 4.315005288005029e-06, + "loss": 0.5696, + "step": 5530 + }, + { + "epoch": 1.4683393070489845, + "grad_norm": 0.402729836590436, + "learning_rate": 4.314765177668456e-06, + "loss": 0.5956, + "step": 5531 + }, + { + "epoch": 1.4686048055223684, + "grad_norm": 0.4068745389019842, + "learning_rate": 4.31452503193952e-06, + "loss": 0.5878, + "step": 5532 + }, + { + "epoch": 1.468870303995752, + "grad_norm": 0.41315535902769135, + "learning_rate": 4.314284850822904e-06, + "loss": 0.5948, + "step": 5533 + }, + { + "epoch": 1.4691358024691357, + "grad_norm": 0.3973011188192901, + "learning_rate": 4.31404463432329e-06, + "loss": 0.5965, + "step": 5534 + }, + { + "epoch": 1.4694013009425195, + "grad_norm": 0.4007996448295039, + "learning_rate": 4.313804382445364e-06, + "loss": 0.601, + "step": 5535 + }, + { + "epoch": 1.4696667994159034, + "grad_norm": 0.41312856893792915, + "learning_rate": 4.313564095193812e-06, + "loss": 0.5947, + "step": 5536 + }, + { + "epoch": 1.469932297889287, + "grad_norm": 0.41232622730959095, + "learning_rate": 4.31332377257332e-06, + "loss": 0.5871, + "step": 5537 + }, + { + "epoch": 1.470197796362671, + "grad_norm": 0.41991647379237185, + "learning_rate": 4.313083414588574e-06, + "loss": 0.5644, + "step": 5538 + }, + { + "epoch": 1.4704632948360548, + "grad_norm": 0.3912478548717292, + "learning_rate": 4.312843021244263e-06, + "loss": 0.6015, + "step": 5539 + }, + { + "epoch": 1.4707287933094384, + "grad_norm": 0.40103338117562687, + "learning_rate": 4.312602592545073e-06, + "loss": 0.5993, + "step": 5540 + }, + { + "epoch": 1.4709942917828223, + "grad_norm": 0.41101252231516816, + "learning_rate": 4.312362128495697e-06, + "loss": 0.5663, + "step": 5541 + }, + { + "epoch": 1.4712597902562061, + "grad_norm": 0.40098559661083966, + "learning_rate": 4.31212162910082e-06, + "loss": 0.5907, + "step": 5542 + }, + { + "epoch": 1.4715252887295898, + "grad_norm": 0.4074430144974285, + "learning_rate": 4.311881094365134e-06, + "loss": 0.5553, + "step": 5543 + }, + { + "epoch": 1.4717907872029736, + "grad_norm": 0.39736619126303796, + "learning_rate": 4.311640524293332e-06, + "loss": 0.5693, + "step": 5544 + }, + { + "epoch": 1.4720562856763575, + "grad_norm": 0.41380392173836616, + "learning_rate": 4.3113999188901036e-06, + "loss": 0.5846, + "step": 5545 + }, + { + "epoch": 1.4723217841497411, + "grad_norm": 0.408249144675621, + "learning_rate": 4.311159278160142e-06, + "loss": 0.541, + "step": 5546 + }, + { + "epoch": 1.472587282623125, + "grad_norm": 0.4062292276377927, + "learning_rate": 4.31091860210814e-06, + "loss": 0.5312, + "step": 5547 + }, + { + "epoch": 1.4728527810965086, + "grad_norm": 0.4076363580452722, + "learning_rate": 4.31067789073879e-06, + "loss": 0.6293, + "step": 5548 + }, + { + "epoch": 1.4731182795698925, + "grad_norm": 0.4171891999021032, + "learning_rate": 4.31043714405679e-06, + "loss": 0.5758, + "step": 5549 + }, + { + "epoch": 1.4733837780432761, + "grad_norm": 0.40277192134787976, + "learning_rate": 4.310196362066831e-06, + "loss": 0.5979, + "step": 5550 + }, + { + "epoch": 1.47364927651666, + "grad_norm": 0.4177233212049592, + "learning_rate": 4.309955544773612e-06, + "loss": 0.5682, + "step": 5551 + }, + { + "epoch": 1.4739147749900439, + "grad_norm": 0.4102520481735466, + "learning_rate": 4.309714692181827e-06, + "loss": 0.623, + "step": 5552 + }, + { + "epoch": 1.4741802734634275, + "grad_norm": 0.39713997465951567, + "learning_rate": 4.3094738042961755e-06, + "loss": 0.5825, + "step": 5553 + }, + { + "epoch": 1.4744457719368114, + "grad_norm": 0.40795216533220846, + "learning_rate": 4.309232881121355e-06, + "loss": 0.5722, + "step": 5554 + }, + { + "epoch": 1.4747112704101952, + "grad_norm": 0.41206610467356236, + "learning_rate": 4.308991922662063e-06, + "loss": 0.5992, + "step": 5555 + }, + { + "epoch": 1.4749767688835789, + "grad_norm": 0.40942208276612935, + "learning_rate": 4.308750928922999e-06, + "loss": 0.5878, + "step": 5556 + }, + { + "epoch": 1.4752422673569627, + "grad_norm": 0.3881866358538866, + "learning_rate": 4.308509899908864e-06, + "loss": 0.6051, + "step": 5557 + }, + { + "epoch": 1.4755077658303466, + "grad_norm": 0.4133789310269571, + "learning_rate": 4.308268835624358e-06, + "loss": 0.6081, + "step": 5558 + }, + { + "epoch": 1.4757732643037302, + "grad_norm": 0.3975243911554642, + "learning_rate": 4.3080277360741805e-06, + "loss": 0.6048, + "step": 5559 + }, + { + "epoch": 1.476038762777114, + "grad_norm": 0.41559021125329, + "learning_rate": 4.307786601263037e-06, + "loss": 0.5606, + "step": 5560 + }, + { + "epoch": 1.4763042612504977, + "grad_norm": 0.405777975127508, + "learning_rate": 4.307545431195629e-06, + "loss": 0.6047, + "step": 5561 + }, + { + "epoch": 1.4765697597238816, + "grad_norm": 0.39733511192242815, + "learning_rate": 4.3073042258766586e-06, + "loss": 0.6069, + "step": 5562 + }, + { + "epoch": 1.4768352581972652, + "grad_norm": 0.41274048030382415, + "learning_rate": 4.30706298531083e-06, + "loss": 0.5628, + "step": 5563 + }, + { + "epoch": 1.477100756670649, + "grad_norm": 0.39774215958369075, + "learning_rate": 4.30682170950285e-06, + "loss": 0.5775, + "step": 5564 + }, + { + "epoch": 1.477366255144033, + "grad_norm": 0.4076207794860899, + "learning_rate": 4.306580398457423e-06, + "loss": 0.5623, + "step": 5565 + }, + { + "epoch": 1.4776317536174166, + "grad_norm": 0.39960304642648214, + "learning_rate": 4.306339052179254e-06, + "loss": 0.5978, + "step": 5566 + }, + { + "epoch": 1.4778972520908005, + "grad_norm": 0.41066224982186994, + "learning_rate": 4.306097670673051e-06, + "loss": 0.5464, + "step": 5567 + }, + { + "epoch": 1.4781627505641843, + "grad_norm": 0.41006629351145496, + "learning_rate": 4.305856253943522e-06, + "loss": 0.5546, + "step": 5568 + }, + { + "epoch": 1.478428249037568, + "grad_norm": 0.4015994292117701, + "learning_rate": 4.3056148019953735e-06, + "loss": 0.608, + "step": 5569 + }, + { + "epoch": 1.4786937475109518, + "grad_norm": 0.40961989751249295, + "learning_rate": 4.305373314833317e-06, + "loss": 0.5865, + "step": 5570 + }, + { + "epoch": 1.4789592459843357, + "grad_norm": 0.39969620914788334, + "learning_rate": 4.305131792462058e-06, + "loss": 0.5941, + "step": 5571 + }, + { + "epoch": 1.4792247444577193, + "grad_norm": 0.41959562060935424, + "learning_rate": 4.3048902348863116e-06, + "loss": 0.592, + "step": 5572 + }, + { + "epoch": 1.4794902429311032, + "grad_norm": 0.39824942342312053, + "learning_rate": 4.3046486421107854e-06, + "loss": 0.5705, + "step": 5573 + }, + { + "epoch": 1.4797557414044868, + "grad_norm": 0.41327026109964204, + "learning_rate": 4.3044070141401925e-06, + "loss": 0.5912, + "step": 5574 + }, + { + "epoch": 1.4800212398778707, + "grad_norm": 0.39486956605967033, + "learning_rate": 4.304165350979246e-06, + "loss": 0.6338, + "step": 5575 + }, + { + "epoch": 1.4802867383512543, + "grad_norm": 0.4158632378033139, + "learning_rate": 4.303923652632656e-06, + "loss": 0.6324, + "step": 5576 + }, + { + "epoch": 1.4805522368246382, + "grad_norm": 0.40948179239793003, + "learning_rate": 4.303681919105138e-06, + "loss": 0.5849, + "step": 5577 + }, + { + "epoch": 1.480817735298022, + "grad_norm": 0.41436358661688566, + "learning_rate": 4.3034401504014076e-06, + "loss": 0.6025, + "step": 5578 + }, + { + "epoch": 1.4810832337714057, + "grad_norm": 0.41119008776859284, + "learning_rate": 4.303198346526178e-06, + "loss": 0.5943, + "step": 5579 + }, + { + "epoch": 1.4813487322447896, + "grad_norm": 0.4043209080477481, + "learning_rate": 4.302956507484166e-06, + "loss": 0.5538, + "step": 5580 + }, + { + "epoch": 1.4816142307181734, + "grad_norm": 0.43261837570592837, + "learning_rate": 4.302714633280087e-06, + "loss": 0.5961, + "step": 5581 + }, + { + "epoch": 1.481879729191557, + "grad_norm": 0.40956763973232396, + "learning_rate": 4.30247272391866e-06, + "loss": 0.5865, + "step": 5582 + }, + { + "epoch": 1.482145227664941, + "grad_norm": 0.4113442762061393, + "learning_rate": 4.3022307794046006e-06, + "loss": 0.6032, + "step": 5583 + }, + { + "epoch": 1.4824107261383248, + "grad_norm": 0.4342482374146037, + "learning_rate": 4.301988799742629e-06, + "loss": 0.5536, + "step": 5584 + }, + { + "epoch": 1.4826762246117084, + "grad_norm": 0.41119700625275757, + "learning_rate": 4.301746784937464e-06, + "loss": 0.5671, + "step": 5585 + }, + { + "epoch": 1.4829417230850923, + "grad_norm": 0.4052253487530817, + "learning_rate": 4.3015047349938244e-06, + "loss": 0.5695, + "step": 5586 + }, + { + "epoch": 1.4832072215584762, + "grad_norm": 0.41187273250251494, + "learning_rate": 4.301262649916432e-06, + "loss": 0.5651, + "step": 5587 + }, + { + "epoch": 1.4834727200318598, + "grad_norm": 0.4095504145663514, + "learning_rate": 4.301020529710009e-06, + "loss": 0.5888, + "step": 5588 + }, + { + "epoch": 1.4837382185052437, + "grad_norm": 0.4113066764240779, + "learning_rate": 4.3007783743792735e-06, + "loss": 0.5786, + "step": 5589 + }, + { + "epoch": 1.4840037169786273, + "grad_norm": 0.42666649927641165, + "learning_rate": 4.3005361839289514e-06, + "loss": 0.5807, + "step": 5590 + }, + { + "epoch": 1.4842692154520112, + "grad_norm": 0.4091737005822595, + "learning_rate": 4.300293958363766e-06, + "loss": 0.588, + "step": 5591 + }, + { + "epoch": 1.4845347139253948, + "grad_norm": 0.4026541211913795, + "learning_rate": 4.3000516976884406e-06, + "loss": 0.5903, + "step": 5592 + }, + { + "epoch": 1.4848002123987787, + "grad_norm": 0.4132137671109537, + "learning_rate": 4.2998094019076996e-06, + "loss": 0.6284, + "step": 5593 + }, + { + "epoch": 1.4850657108721625, + "grad_norm": 0.4172679460127461, + "learning_rate": 4.299567071026268e-06, + "loss": 0.6206, + "step": 5594 + }, + { + "epoch": 1.4853312093455462, + "grad_norm": 0.4103474329947075, + "learning_rate": 4.299324705048873e-06, + "loss": 0.5795, + "step": 5595 + }, + { + "epoch": 1.48559670781893, + "grad_norm": 0.4097596880368327, + "learning_rate": 4.299082303980241e-06, + "loss": 0.601, + "step": 5596 + }, + { + "epoch": 1.485862206292314, + "grad_norm": 0.4211418978380807, + "learning_rate": 4.298839867825098e-06, + "loss": 0.598, + "step": 5597 + }, + { + "epoch": 1.4861277047656976, + "grad_norm": 0.39386632711745845, + "learning_rate": 4.298597396588175e-06, + "loss": 0.5865, + "step": 5598 + }, + { + "epoch": 1.4863932032390814, + "grad_norm": 0.43396563815849093, + "learning_rate": 4.298354890274198e-06, + "loss": 0.6054, + "step": 5599 + }, + { + "epoch": 1.4866587017124653, + "grad_norm": 0.41935494872928014, + "learning_rate": 4.298112348887897e-06, + "loss": 0.5791, + "step": 5600 + }, + { + "epoch": 1.486924200185849, + "grad_norm": 0.4075637480750658, + "learning_rate": 4.297869772434004e-06, + "loss": 0.5909, + "step": 5601 + }, + { + "epoch": 1.4871896986592328, + "grad_norm": 0.417287421316621, + "learning_rate": 4.297627160917247e-06, + "loss": 0.5819, + "step": 5602 + }, + { + "epoch": 1.4874551971326164, + "grad_norm": 0.4189200875337072, + "learning_rate": 4.29738451434236e-06, + "loss": 0.5842, + "step": 5603 + }, + { + "epoch": 1.4877206956060003, + "grad_norm": 0.4133173901574044, + "learning_rate": 4.297141832714073e-06, + "loss": 0.5375, + "step": 5604 + }, + { + "epoch": 1.487986194079384, + "grad_norm": 0.4153016247916373, + "learning_rate": 4.296899116037122e-06, + "loss": 0.5975, + "step": 5605 + }, + { + "epoch": 1.4882516925527678, + "grad_norm": 0.3949821939060139, + "learning_rate": 4.296656364316238e-06, + "loss": 0.5388, + "step": 5606 + }, + { + "epoch": 1.4885171910261517, + "grad_norm": 0.40581855738849826, + "learning_rate": 4.296413577556155e-06, + "loss": 0.5976, + "step": 5607 + }, + { + "epoch": 1.4887826894995353, + "grad_norm": 0.4024073387300952, + "learning_rate": 4.296170755761609e-06, + "loss": 0.5855, + "step": 5608 + }, + { + "epoch": 1.4890481879729192, + "grad_norm": 0.41911761598701847, + "learning_rate": 4.295927898937337e-06, + "loss": 0.5958, + "step": 5609 + }, + { + "epoch": 1.489313686446303, + "grad_norm": 0.4240188732259072, + "learning_rate": 4.2956850070880725e-06, + "loss": 0.602, + "step": 5610 + }, + { + "epoch": 1.4895791849196867, + "grad_norm": 0.4084060144334041, + "learning_rate": 4.295442080218554e-06, + "loss": 0.603, + "step": 5611 + }, + { + "epoch": 1.4898446833930705, + "grad_norm": 0.41550603533829994, + "learning_rate": 4.295199118333519e-06, + "loss": 0.577, + "step": 5612 + }, + { + "epoch": 1.4901101818664544, + "grad_norm": 0.4073914364998427, + "learning_rate": 4.294956121437706e-06, + "loss": 0.5901, + "step": 5613 + }, + { + "epoch": 1.490375680339838, + "grad_norm": 0.3979286568584305, + "learning_rate": 4.2947130895358535e-06, + "loss": 0.6033, + "step": 5614 + }, + { + "epoch": 1.490641178813222, + "grad_norm": 0.4065132966537831, + "learning_rate": 4.2944700226327014e-06, + "loss": 0.5816, + "step": 5615 + }, + { + "epoch": 1.4909066772866055, + "grad_norm": 0.41421185621331674, + "learning_rate": 4.29422692073299e-06, + "loss": 0.6022, + "step": 5616 + }, + { + "epoch": 1.4911721757599894, + "grad_norm": 0.41242826486158735, + "learning_rate": 4.293983783841461e-06, + "loss": 0.5619, + "step": 5617 + }, + { + "epoch": 1.491437674233373, + "grad_norm": 0.4019956423160651, + "learning_rate": 4.293740611962856e-06, + "loss": 0.6135, + "step": 5618 + }, + { + "epoch": 1.491703172706757, + "grad_norm": 0.39929011099356776, + "learning_rate": 4.293497405101917e-06, + "loss": 0.5938, + "step": 5619 + }, + { + "epoch": 1.4919686711801408, + "grad_norm": 0.39606208367172996, + "learning_rate": 4.293254163263388e-06, + "loss": 0.5931, + "step": 5620 + }, + { + "epoch": 1.4922341696535244, + "grad_norm": 0.41259615063719823, + "learning_rate": 4.293010886452012e-06, + "loss": 0.5544, + "step": 5621 + }, + { + "epoch": 1.4924996681269083, + "grad_norm": 0.41513473088555203, + "learning_rate": 4.292767574672532e-06, + "loss": 0.5432, + "step": 5622 + }, + { + "epoch": 1.4927651666002921, + "grad_norm": 0.4007579597252526, + "learning_rate": 4.2925242279296965e-06, + "loss": 0.5993, + "step": 5623 + }, + { + "epoch": 1.4930306650736758, + "grad_norm": 0.43662004007940564, + "learning_rate": 4.29228084622825e-06, + "loss": 0.5723, + "step": 5624 + }, + { + "epoch": 1.4932961635470596, + "grad_norm": 0.42985856343764267, + "learning_rate": 4.292037429572937e-06, + "loss": 0.6088, + "step": 5625 + }, + { + "epoch": 1.4935616620204435, + "grad_norm": 0.4215415654202423, + "learning_rate": 4.291793977968508e-06, + "loss": 0.5864, + "step": 5626 + }, + { + "epoch": 1.4938271604938271, + "grad_norm": 0.40658037424266164, + "learning_rate": 4.291550491419709e-06, + "loss": 0.6058, + "step": 5627 + }, + { + "epoch": 1.494092658967211, + "grad_norm": 0.41499367657110897, + "learning_rate": 4.291306969931289e-06, + "loss": 0.5511, + "step": 5628 + }, + { + "epoch": 1.4943581574405946, + "grad_norm": 0.4171361813430919, + "learning_rate": 4.291063413507997e-06, + "loss": 0.5857, + "step": 5629 + }, + { + "epoch": 1.4946236559139785, + "grad_norm": 0.4032400113309654, + "learning_rate": 4.2908198221545824e-06, + "loss": 0.6097, + "step": 5630 + }, + { + "epoch": 1.4948891543873621, + "grad_norm": 0.43116954204269214, + "learning_rate": 4.2905761958757975e-06, + "loss": 0.5851, + "step": 5631 + }, + { + "epoch": 1.495154652860746, + "grad_norm": 0.4099033401462102, + "learning_rate": 4.290332534676393e-06, + "loss": 0.6219, + "step": 5632 + }, + { + "epoch": 1.4954201513341299, + "grad_norm": 0.4225885236505941, + "learning_rate": 4.2900888385611205e-06, + "loss": 0.5766, + "step": 5633 + }, + { + "epoch": 1.4956856498075135, + "grad_norm": 0.4261721246243504, + "learning_rate": 4.289845107534732e-06, + "loss": 0.5972, + "step": 5634 + }, + { + "epoch": 1.4959511482808974, + "grad_norm": 0.43151428229846045, + "learning_rate": 4.289601341601982e-06, + "loss": 0.5847, + "step": 5635 + }, + { + "epoch": 1.4962166467542812, + "grad_norm": 0.397997685381986, + "learning_rate": 4.289357540767625e-06, + "loss": 0.5915, + "step": 5636 + }, + { + "epoch": 1.4964821452276649, + "grad_norm": 0.40833929534906654, + "learning_rate": 4.289113705036413e-06, + "loss": 0.5814, + "step": 5637 + }, + { + "epoch": 1.4967476437010487, + "grad_norm": 0.40920675231347575, + "learning_rate": 4.288869834413104e-06, + "loss": 0.6153, + "step": 5638 + }, + { + "epoch": 1.4970131421744326, + "grad_norm": 0.4001000938239612, + "learning_rate": 4.288625928902454e-06, + "loss": 0.5514, + "step": 5639 + }, + { + "epoch": 1.4972786406478162, + "grad_norm": 0.4003037292230063, + "learning_rate": 4.288381988509219e-06, + "loss": 0.5851, + "step": 5640 + }, + { + "epoch": 1.4975441391212, + "grad_norm": 0.4093169466843017, + "learning_rate": 4.288138013238155e-06, + "loss": 0.6148, + "step": 5641 + }, + { + "epoch": 1.497809637594584, + "grad_norm": 0.41305181203290936, + "learning_rate": 4.287894003094024e-06, + "loss": 0.5959, + "step": 5642 + }, + { + "epoch": 1.4980751360679676, + "grad_norm": 0.40953119504945756, + "learning_rate": 4.2876499580815816e-06, + "loss": 0.6023, + "step": 5643 + }, + { + "epoch": 1.4983406345413515, + "grad_norm": 0.4269553731026164, + "learning_rate": 4.287405878205587e-06, + "loss": 0.5959, + "step": 5644 + }, + { + "epoch": 1.498606133014735, + "grad_norm": 0.4038855634942212, + "learning_rate": 4.287161763470802e-06, + "loss": 0.5732, + "step": 5645 + }, + { + "epoch": 1.498871631488119, + "grad_norm": 0.41406221287188316, + "learning_rate": 4.286917613881987e-06, + "loss": 0.6278, + "step": 5646 + }, + { + "epoch": 1.4991371299615026, + "grad_norm": 0.40379250161406066, + "learning_rate": 4.286673429443904e-06, + "loss": 0.5852, + "step": 5647 + }, + { + "epoch": 1.4994026284348865, + "grad_norm": 0.4069517055843132, + "learning_rate": 4.286429210161313e-06, + "loss": 0.6056, + "step": 5648 + }, + { + "epoch": 1.4996681269082703, + "grad_norm": 0.3965435612616709, + "learning_rate": 4.28618495603898e-06, + "loss": 0.613, + "step": 5649 + }, + { + "epoch": 1.499933625381654, + "grad_norm": 0.40007288627181575, + "learning_rate": 4.285940667081667e-06, + "loss": 0.5757, + "step": 5650 + }, + { + "epoch": 1.5001991238550378, + "grad_norm": 0.39495602156663484, + "learning_rate": 4.2856963432941376e-06, + "loss": 0.5975, + "step": 5651 + }, + { + "epoch": 1.5004646223284217, + "grad_norm": 0.39340035894210856, + "learning_rate": 4.285451984681157e-06, + "loss": 0.5915, + "step": 5652 + }, + { + "epoch": 1.5007301208018053, + "grad_norm": 0.40184427972918263, + "learning_rate": 4.285207591247492e-06, + "loss": 0.5773, + "step": 5653 + }, + { + "epoch": 1.5009956192751892, + "grad_norm": 0.4057439125318073, + "learning_rate": 4.284963162997908e-06, + "loss": 0.5936, + "step": 5654 + }, + { + "epoch": 1.501261117748573, + "grad_norm": 0.3986417073629942, + "learning_rate": 4.284718699937172e-06, + "loss": 0.5497, + "step": 5655 + }, + { + "epoch": 1.5015266162219567, + "grad_norm": 0.415730078997179, + "learning_rate": 4.284474202070052e-06, + "loss": 0.5706, + "step": 5656 + }, + { + "epoch": 1.5017921146953404, + "grad_norm": 0.39952821522491205, + "learning_rate": 4.284229669401314e-06, + "loss": 0.5805, + "step": 5657 + }, + { + "epoch": 1.5020576131687244, + "grad_norm": 0.42519066384807086, + "learning_rate": 4.28398510193573e-06, + "loss": 0.5801, + "step": 5658 + }, + { + "epoch": 1.502323111642108, + "grad_norm": 0.4308340176687171, + "learning_rate": 4.283740499678069e-06, + "loss": 0.5639, + "step": 5659 + }, + { + "epoch": 1.5025886101154917, + "grad_norm": 0.4123144627485876, + "learning_rate": 4.2834958626331006e-06, + "loss": 0.6236, + "step": 5660 + }, + { + "epoch": 1.5028541085888756, + "grad_norm": 0.42368446770308654, + "learning_rate": 4.283251190805596e-06, + "loss": 0.582, + "step": 5661 + }, + { + "epoch": 1.5031196070622594, + "grad_norm": 0.4292458098889344, + "learning_rate": 4.283006484200327e-06, + "loss": 0.618, + "step": 5662 + }, + { + "epoch": 1.503385105535643, + "grad_norm": 0.4420354203264926, + "learning_rate": 4.282761742822066e-06, + "loss": 0.5935, + "step": 5663 + }, + { + "epoch": 1.503650604009027, + "grad_norm": 0.4040574009279775, + "learning_rate": 4.282516966675586e-06, + "loss": 0.5504, + "step": 5664 + }, + { + "epoch": 1.5039161024824108, + "grad_norm": 0.4161243728781349, + "learning_rate": 4.282272155765661e-06, + "loss": 0.582, + "step": 5665 + }, + { + "epoch": 1.5041816009557945, + "grad_norm": 0.42018307719944964, + "learning_rate": 4.282027310097066e-06, + "loss": 0.5947, + "step": 5666 + }, + { + "epoch": 1.5044470994291783, + "grad_norm": 0.41669890656749975, + "learning_rate": 4.281782429674574e-06, + "loss": 0.5719, + "step": 5667 + }, + { + "epoch": 1.5047125979025622, + "grad_norm": 0.43574601468676943, + "learning_rate": 4.281537514502962e-06, + "loss": 0.5602, + "step": 5668 + }, + { + "epoch": 1.5049780963759458, + "grad_norm": 0.4146290532136355, + "learning_rate": 4.281292564587009e-06, + "loss": 0.5518, + "step": 5669 + }, + { + "epoch": 1.5052435948493295, + "grad_norm": 0.43743126605669674, + "learning_rate": 4.2810475799314864e-06, + "loss": 0.6038, + "step": 5670 + }, + { + "epoch": 1.5055090933227135, + "grad_norm": 0.4222172900325046, + "learning_rate": 4.2808025605411775e-06, + "loss": 0.6315, + "step": 5671 + }, + { + "epoch": 1.5057745917960972, + "grad_norm": 0.41348021273746804, + "learning_rate": 4.280557506420857e-06, + "loss": 0.5809, + "step": 5672 + }, + { + "epoch": 1.5060400902694808, + "grad_norm": 0.41289531247665456, + "learning_rate": 4.2803124175753064e-06, + "loss": 0.602, + "step": 5673 + }, + { + "epoch": 1.5063055887428647, + "grad_norm": 0.4180819198976172, + "learning_rate": 4.280067294009305e-06, + "loss": 0.5733, + "step": 5674 + }, + { + "epoch": 1.5065710872162486, + "grad_norm": 0.40901871899706793, + "learning_rate": 4.279822135727631e-06, + "loss": 0.5808, + "step": 5675 + }, + { + "epoch": 1.5068365856896322, + "grad_norm": 0.4074075843669644, + "learning_rate": 4.27957694273507e-06, + "loss": 0.6026, + "step": 5676 + }, + { + "epoch": 1.507102084163016, + "grad_norm": 0.4089481149917887, + "learning_rate": 4.2793317150364e-06, + "loss": 0.5734, + "step": 5677 + }, + { + "epoch": 1.5073675826364, + "grad_norm": 0.4079017601675781, + "learning_rate": 4.279086452636406e-06, + "loss": 0.629, + "step": 5678 + }, + { + "epoch": 1.5076330811097836, + "grad_norm": 0.40210392018110563, + "learning_rate": 4.27884115553987e-06, + "loss": 0.6176, + "step": 5679 + }, + { + "epoch": 1.5078985795831674, + "grad_norm": 0.4169193803859788, + "learning_rate": 4.2785958237515755e-06, + "loss": 0.5864, + "step": 5680 + }, + { + "epoch": 1.5081640780565513, + "grad_norm": 0.414436725305469, + "learning_rate": 4.278350457276308e-06, + "loss": 0.58, + "step": 5681 + }, + { + "epoch": 1.508429576529935, + "grad_norm": 0.4249506065618703, + "learning_rate": 4.2781050561188526e-06, + "loss": 0.636, + "step": 5682 + }, + { + "epoch": 1.5086950750033188, + "grad_norm": 0.4331827324970614, + "learning_rate": 4.277859620283995e-06, + "loss": 0.5802, + "step": 5683 + }, + { + "epoch": 1.5089605734767026, + "grad_norm": 0.40661131321595523, + "learning_rate": 4.277614149776522e-06, + "loss": 0.5961, + "step": 5684 + }, + { + "epoch": 1.5092260719500863, + "grad_norm": 0.42209748960133614, + "learning_rate": 4.27736864460122e-06, + "loss": 0.5759, + "step": 5685 + }, + { + "epoch": 1.50949157042347, + "grad_norm": 0.40884949798421116, + "learning_rate": 4.277123104762878e-06, + "loss": 0.6121, + "step": 5686 + }, + { + "epoch": 1.5097570688968538, + "grad_norm": 0.4037067491065205, + "learning_rate": 4.276877530266284e-06, + "loss": 0.5943, + "step": 5687 + }, + { + "epoch": 1.5100225673702377, + "grad_norm": 0.4074580698543477, + "learning_rate": 4.276631921116229e-06, + "loss": 0.6042, + "step": 5688 + }, + { + "epoch": 1.5102880658436213, + "grad_norm": 0.4111341664884331, + "learning_rate": 4.2763862773175005e-06, + "loss": 0.5979, + "step": 5689 + }, + { + "epoch": 1.5105535643170052, + "grad_norm": 0.4249869181036253, + "learning_rate": 4.276140598874891e-06, + "loss": 0.5879, + "step": 5690 + }, + { + "epoch": 1.510819062790389, + "grad_norm": 0.4052432796809206, + "learning_rate": 4.2758948857931904e-06, + "loss": 0.5819, + "step": 5691 + }, + { + "epoch": 1.5110845612637727, + "grad_norm": 0.416293544723965, + "learning_rate": 4.275649138077191e-06, + "loss": 0.5805, + "step": 5692 + }, + { + "epoch": 1.5113500597371565, + "grad_norm": 0.41801282225434344, + "learning_rate": 4.275403355731687e-06, + "loss": 0.6356, + "step": 5693 + }, + { + "epoch": 1.5116155582105404, + "grad_norm": 0.4329096586032273, + "learning_rate": 4.27515753876147e-06, + "loss": 0.5745, + "step": 5694 + }, + { + "epoch": 1.511881056683924, + "grad_norm": 0.43026700494688946, + "learning_rate": 4.2749116871713345e-06, + "loss": 0.6215, + "step": 5695 + }, + { + "epoch": 1.512146555157308, + "grad_norm": 0.4087202495096165, + "learning_rate": 4.274665800966076e-06, + "loss": 0.5805, + "step": 5696 + }, + { + "epoch": 1.5124120536306918, + "grad_norm": 0.40682328917061755, + "learning_rate": 4.2744198801504875e-06, + "loss": 0.5683, + "step": 5697 + }, + { + "epoch": 1.5126775521040754, + "grad_norm": 0.39125673378603715, + "learning_rate": 4.274173924729369e-06, + "loss": 0.5659, + "step": 5698 + }, + { + "epoch": 1.512943050577459, + "grad_norm": 0.4119223366222529, + "learning_rate": 4.273927934707514e-06, + "loss": 0.6062, + "step": 5699 + }, + { + "epoch": 1.5132085490508431, + "grad_norm": 0.40881559721118427, + "learning_rate": 4.273681910089721e-06, + "loss": 0.5922, + "step": 5700 + }, + { + "epoch": 1.5134740475242268, + "grad_norm": 0.40717292570261626, + "learning_rate": 4.273435850880788e-06, + "loss": 0.6317, + "step": 5701 + }, + { + "epoch": 1.5137395459976104, + "grad_norm": 0.40586660086368254, + "learning_rate": 4.273189757085514e-06, + "loss": 0.5974, + "step": 5702 + }, + { + "epoch": 1.5140050444709943, + "grad_norm": 0.4144374750707454, + "learning_rate": 4.272943628708698e-06, + "loss": 0.5633, + "step": 5703 + }, + { + "epoch": 1.5142705429443781, + "grad_norm": 0.40129072911171604, + "learning_rate": 4.27269746575514e-06, + "loss": 0.5963, + "step": 5704 + }, + { + "epoch": 1.5145360414177618, + "grad_norm": 0.3923367085830904, + "learning_rate": 4.272451268229642e-06, + "loss": 0.5636, + "step": 5705 + }, + { + "epoch": 1.5148015398911456, + "grad_norm": 0.40976214723567633, + "learning_rate": 4.272205036137005e-06, + "loss": 0.5965, + "step": 5706 + }, + { + "epoch": 1.5150670383645295, + "grad_norm": 0.4182397749786371, + "learning_rate": 4.271958769482029e-06, + "loss": 0.6112, + "step": 5707 + }, + { + "epoch": 1.5153325368379131, + "grad_norm": 0.4114383139556441, + "learning_rate": 4.271712468269519e-06, + "loss": 0.6215, + "step": 5708 + }, + { + "epoch": 1.515598035311297, + "grad_norm": 0.41693883299641066, + "learning_rate": 4.271466132504279e-06, + "loss": 0.5964, + "step": 5709 + }, + { + "epoch": 1.5158635337846809, + "grad_norm": 0.40690146982240954, + "learning_rate": 4.271219762191111e-06, + "loss": 0.6151, + "step": 5710 + }, + { + "epoch": 1.5161290322580645, + "grad_norm": 0.42503877756829644, + "learning_rate": 4.270973357334821e-06, + "loss": 0.5879, + "step": 5711 + }, + { + "epoch": 1.5163945307314481, + "grad_norm": 0.40951056623422405, + "learning_rate": 4.270726917940216e-06, + "loss": 0.6076, + "step": 5712 + }, + { + "epoch": 1.5166600292048322, + "grad_norm": 0.42331275595948265, + "learning_rate": 4.270480444012098e-06, + "loss": 0.5758, + "step": 5713 + }, + { + "epoch": 1.5169255276782159, + "grad_norm": 0.40193668472741895, + "learning_rate": 4.270233935555278e-06, + "loss": 0.5894, + "step": 5714 + }, + { + "epoch": 1.5171910261515995, + "grad_norm": 0.39882600344474484, + "learning_rate": 4.2699873925745626e-06, + "loss": 0.5939, + "step": 5715 + }, + { + "epoch": 1.5174565246249834, + "grad_norm": 0.3954701330949738, + "learning_rate": 4.269740815074758e-06, + "loss": 0.5595, + "step": 5716 + }, + { + "epoch": 1.5177220230983672, + "grad_norm": 0.3953214868975649, + "learning_rate": 4.269494203060675e-06, + "loss": 0.5541, + "step": 5717 + }, + { + "epoch": 1.5179875215717509, + "grad_norm": 0.41101915563990105, + "learning_rate": 4.269247556537123e-06, + "loss": 0.6199, + "step": 5718 + }, + { + "epoch": 1.5182530200451347, + "grad_norm": 0.42744969013463696, + "learning_rate": 4.269000875508911e-06, + "loss": 0.5958, + "step": 5719 + }, + { + "epoch": 1.5185185185185186, + "grad_norm": 0.40595400291397254, + "learning_rate": 4.268754159980851e-06, + "loss": 0.5732, + "step": 5720 + }, + { + "epoch": 1.5187840169919022, + "grad_norm": 0.40696951479015514, + "learning_rate": 4.2685074099577545e-06, + "loss": 0.5584, + "step": 5721 + }, + { + "epoch": 1.519049515465286, + "grad_norm": 0.40036171005748317, + "learning_rate": 4.268260625444433e-06, + "loss": 0.5927, + "step": 5722 + }, + { + "epoch": 1.51931501393867, + "grad_norm": 0.4143565478246165, + "learning_rate": 4.2680138064457e-06, + "loss": 0.5797, + "step": 5723 + }, + { + "epoch": 1.5195805124120536, + "grad_norm": 0.404597709231146, + "learning_rate": 4.267766952966369e-06, + "loss": 0.5663, + "step": 5724 + }, + { + "epoch": 1.5198460108854372, + "grad_norm": 0.4157517574357017, + "learning_rate": 4.267520065011255e-06, + "loss": 0.6102, + "step": 5725 + }, + { + "epoch": 1.5201115093588213, + "grad_norm": 0.41114391194162436, + "learning_rate": 4.267273142585171e-06, + "loss": 0.5724, + "step": 5726 + }, + { + "epoch": 1.520377007832205, + "grad_norm": 0.4164424914595022, + "learning_rate": 4.2670261856929335e-06, + "loss": 0.5973, + "step": 5727 + }, + { + "epoch": 1.5206425063055886, + "grad_norm": 0.4249100794165623, + "learning_rate": 4.26677919433936e-06, + "loss": 0.5673, + "step": 5728 + }, + { + "epoch": 1.5209080047789725, + "grad_norm": 0.4101077677346282, + "learning_rate": 4.2665321685292655e-06, + "loss": 0.5814, + "step": 5729 + }, + { + "epoch": 1.5211735032523563, + "grad_norm": 0.4082484490267254, + "learning_rate": 4.266285108267468e-06, + "loss": 0.5258, + "step": 5730 + }, + { + "epoch": 1.52143900172574, + "grad_norm": 0.40682684243895173, + "learning_rate": 4.266038013558788e-06, + "loss": 0.6033, + "step": 5731 + }, + { + "epoch": 1.5217045001991238, + "grad_norm": 0.43040652090542003, + "learning_rate": 4.265790884408042e-06, + "loss": 0.5905, + "step": 5732 + }, + { + "epoch": 1.5219699986725077, + "grad_norm": 0.4087725821179951, + "learning_rate": 4.26554372082005e-06, + "loss": 0.5415, + "step": 5733 + }, + { + "epoch": 1.5222354971458913, + "grad_norm": 0.43616185181429407, + "learning_rate": 4.2652965227996325e-06, + "loss": 0.5873, + "step": 5734 + }, + { + "epoch": 1.5225009956192752, + "grad_norm": 0.40345535276470496, + "learning_rate": 4.265049290351611e-06, + "loss": 0.5676, + "step": 5735 + }, + { + "epoch": 1.522766494092659, + "grad_norm": 0.39985845694822797, + "learning_rate": 4.264802023480806e-06, + "loss": 0.5905, + "step": 5736 + }, + { + "epoch": 1.5230319925660427, + "grad_norm": 0.40959956238084144, + "learning_rate": 4.264554722192042e-06, + "loss": 0.5812, + "step": 5737 + }, + { + "epoch": 1.5232974910394266, + "grad_norm": 0.4003863360931367, + "learning_rate": 4.264307386490139e-06, + "loss": 0.6032, + "step": 5738 + }, + { + "epoch": 1.5235629895128104, + "grad_norm": 0.4246943408432233, + "learning_rate": 4.264060016379924e-06, + "loss": 0.563, + "step": 5739 + }, + { + "epoch": 1.523828487986194, + "grad_norm": 0.40813454209444033, + "learning_rate": 4.263812611866218e-06, + "loss": 0.5831, + "step": 5740 + }, + { + "epoch": 1.5240939864595777, + "grad_norm": 0.4188415604131818, + "learning_rate": 4.2635651729538485e-06, + "loss": 0.6085, + "step": 5741 + }, + { + "epoch": 1.5243594849329616, + "grad_norm": 0.4031768279681957, + "learning_rate": 4.263317699647639e-06, + "loss": 0.6094, + "step": 5742 + }, + { + "epoch": 1.5246249834063454, + "grad_norm": 0.40269882335524293, + "learning_rate": 4.263070191952418e-06, + "loss": 0.6232, + "step": 5743 + }, + { + "epoch": 1.524890481879729, + "grad_norm": 0.41501177730664696, + "learning_rate": 4.262822649873011e-06, + "loss": 0.6433, + "step": 5744 + }, + { + "epoch": 1.525155980353113, + "grad_norm": 0.4033869910141432, + "learning_rate": 4.262575073414247e-06, + "loss": 0.5687, + "step": 5745 + }, + { + "epoch": 1.5254214788264968, + "grad_norm": 0.41613654408175105, + "learning_rate": 4.262327462580953e-06, + "loss": 0.5986, + "step": 5746 + }, + { + "epoch": 1.5256869772998805, + "grad_norm": 0.40565225424344326, + "learning_rate": 4.262079817377958e-06, + "loss": 0.5849, + "step": 5747 + }, + { + "epoch": 1.5259524757732643, + "grad_norm": 0.43132185406208706, + "learning_rate": 4.261832137810093e-06, + "loss": 0.6044, + "step": 5748 + }, + { + "epoch": 1.5262179742466482, + "grad_norm": 0.4225896186641911, + "learning_rate": 4.2615844238821864e-06, + "loss": 0.603, + "step": 5749 + }, + { + "epoch": 1.5264834727200318, + "grad_norm": 0.40717071804314, + "learning_rate": 4.261336675599072e-06, + "loss": 0.6102, + "step": 5750 + }, + { + "epoch": 1.5267489711934157, + "grad_norm": 0.4164106350921646, + "learning_rate": 4.261088892965579e-06, + "loss": 0.5976, + "step": 5751 + }, + { + "epoch": 1.5270144696667995, + "grad_norm": 0.41520198117847645, + "learning_rate": 4.260841075986542e-06, + "loss": 0.5741, + "step": 5752 + }, + { + "epoch": 1.5272799681401832, + "grad_norm": 0.43780824406482194, + "learning_rate": 4.2605932246667914e-06, + "loss": 0.6134, + "step": 5753 + }, + { + "epoch": 1.5275454666135668, + "grad_norm": 0.4050832055131384, + "learning_rate": 4.260345339011163e-06, + "loss": 0.6281, + "step": 5754 + }, + { + "epoch": 1.527810965086951, + "grad_norm": 0.4012649197372533, + "learning_rate": 4.2600974190244896e-06, + "loss": 0.5465, + "step": 5755 + }, + { + "epoch": 1.5280764635603346, + "grad_norm": 0.424276300870772, + "learning_rate": 4.259849464711607e-06, + "loss": 0.6334, + "step": 5756 + }, + { + "epoch": 1.5283419620337182, + "grad_norm": 0.4309303162873798, + "learning_rate": 4.259601476077352e-06, + "loss": 0.6129, + "step": 5757 + }, + { + "epoch": 1.528607460507102, + "grad_norm": 0.40009392847866054, + "learning_rate": 4.259353453126559e-06, + "loss": 0.5734, + "step": 5758 + }, + { + "epoch": 1.528872958980486, + "grad_norm": 0.4005208881435323, + "learning_rate": 4.2591053958640666e-06, + "loss": 0.5864, + "step": 5759 + }, + { + "epoch": 1.5291384574538696, + "grad_norm": 0.4249982754784965, + "learning_rate": 4.258857304294712e-06, + "loss": 0.602, + "step": 5760 + }, + { + "epoch": 1.5294039559272534, + "grad_norm": 0.4141603237565349, + "learning_rate": 4.258609178423333e-06, + "loss": 0.5363, + "step": 5761 + }, + { + "epoch": 1.5296694544006373, + "grad_norm": 0.40413341864793056, + "learning_rate": 4.258361018254769e-06, + "loss": 0.5438, + "step": 5762 + }, + { + "epoch": 1.529934952874021, + "grad_norm": 0.4200763894297239, + "learning_rate": 4.258112823793861e-06, + "loss": 0.5974, + "step": 5763 + }, + { + "epoch": 1.5302004513474048, + "grad_norm": 0.4197995183917972, + "learning_rate": 4.2578645950454475e-06, + "loss": 0.6084, + "step": 5764 + }, + { + "epoch": 1.5304659498207887, + "grad_norm": 0.42058011282831137, + "learning_rate": 4.257616332014372e-06, + "loss": 0.5796, + "step": 5765 + }, + { + "epoch": 1.5307314482941723, + "grad_norm": 0.406127012476845, + "learning_rate": 4.257368034705473e-06, + "loss": 0.5966, + "step": 5766 + }, + { + "epoch": 1.530996946767556, + "grad_norm": 0.39841963386881635, + "learning_rate": 4.257119703123594e-06, + "loss": 0.5635, + "step": 5767 + }, + { + "epoch": 1.53126244524094, + "grad_norm": 0.4175207304720907, + "learning_rate": 4.25687133727358e-06, + "loss": 0.5921, + "step": 5768 + }, + { + "epoch": 1.5315279437143237, + "grad_norm": 0.4074352266903592, + "learning_rate": 4.256622937160273e-06, + "loss": 0.566, + "step": 5769 + }, + { + "epoch": 1.5317934421877073, + "grad_norm": 0.40506171798647617, + "learning_rate": 4.256374502788517e-06, + "loss": 0.5833, + "step": 5770 + }, + { + "epoch": 1.5320589406610912, + "grad_norm": 0.3832598521750574, + "learning_rate": 4.25612603416316e-06, + "loss": 0.5672, + "step": 5771 + }, + { + "epoch": 1.532324439134475, + "grad_norm": 0.4393559869051958, + "learning_rate": 4.255877531289043e-06, + "loss": 0.5876, + "step": 5772 + }, + { + "epoch": 1.5325899376078587, + "grad_norm": 0.4223043351269334, + "learning_rate": 4.255628994171016e-06, + "loss": 0.5959, + "step": 5773 + }, + { + "epoch": 1.5328554360812425, + "grad_norm": 0.42145960012246664, + "learning_rate": 4.255380422813925e-06, + "loss": 0.598, + "step": 5774 + }, + { + "epoch": 1.5331209345546264, + "grad_norm": 0.41480488591362813, + "learning_rate": 4.255131817222619e-06, + "loss": 0.58, + "step": 5775 + }, + { + "epoch": 1.53338643302801, + "grad_norm": 0.40980118405916865, + "learning_rate": 4.254883177401944e-06, + "loss": 0.6268, + "step": 5776 + }, + { + "epoch": 1.533651931501394, + "grad_norm": 0.42656038660278023, + "learning_rate": 4.25463450335675e-06, + "loss": 0.5875, + "step": 5777 + }, + { + "epoch": 1.5339174299747778, + "grad_norm": 0.39327612686813335, + "learning_rate": 4.254385795091888e-06, + "loss": 0.5823, + "step": 5778 + }, + { + "epoch": 1.5341829284481614, + "grad_norm": 0.4194634308998548, + "learning_rate": 4.254137052612207e-06, + "loss": 0.59, + "step": 5779 + }, + { + "epoch": 1.534448426921545, + "grad_norm": 0.4297277855058897, + "learning_rate": 4.2538882759225595e-06, + "loss": 0.5557, + "step": 5780 + }, + { + "epoch": 1.5347139253949291, + "grad_norm": 0.3957925631192835, + "learning_rate": 4.253639465027796e-06, + "loss": 0.5631, + "step": 5781 + }, + { + "epoch": 1.5349794238683128, + "grad_norm": 0.4107791274312506, + "learning_rate": 4.2533906199327685e-06, + "loss": 0.5838, + "step": 5782 + }, + { + "epoch": 1.5352449223416964, + "grad_norm": 0.3995563226634287, + "learning_rate": 4.2531417406423315e-06, + "loss": 0.5918, + "step": 5783 + }, + { + "epoch": 1.5355104208150803, + "grad_norm": 0.41867451064553973, + "learning_rate": 4.252892827161338e-06, + "loss": 0.569, + "step": 5784 + }, + { + "epoch": 1.5357759192884641, + "grad_norm": 0.42245589469063777, + "learning_rate": 4.252643879494642e-06, + "loss": 0.6175, + "step": 5785 + }, + { + "epoch": 1.5360414177618478, + "grad_norm": 0.4218231256229024, + "learning_rate": 4.252394897647101e-06, + "loss": 0.5801, + "step": 5786 + }, + { + "epoch": 1.5363069162352316, + "grad_norm": 0.42133253523658604, + "learning_rate": 4.252145881623567e-06, + "loss": 0.621, + "step": 5787 + }, + { + "epoch": 1.5365724147086155, + "grad_norm": 0.4257051952221697, + "learning_rate": 4.2518968314288985e-06, + "loss": 0.5368, + "step": 5788 + }, + { + "epoch": 1.5368379131819991, + "grad_norm": 0.40312463098060375, + "learning_rate": 4.251647747067953e-06, + "loss": 0.5941, + "step": 5789 + }, + { + "epoch": 1.537103411655383, + "grad_norm": 0.4033068783655608, + "learning_rate": 4.251398628545588e-06, + "loss": 0.6304, + "step": 5790 + }, + { + "epoch": 1.5373689101287669, + "grad_norm": 0.42136270751274707, + "learning_rate": 4.25114947586666e-06, + "loss": 0.5503, + "step": 5791 + }, + { + "epoch": 1.5376344086021505, + "grad_norm": 0.42328521480102993, + "learning_rate": 4.250900289036032e-06, + "loss": 0.6142, + "step": 5792 + }, + { + "epoch": 1.5378999070755344, + "grad_norm": 0.4348095111276332, + "learning_rate": 4.250651068058559e-06, + "loss": 0.6094, + "step": 5793 + }, + { + "epoch": 1.5381654055489182, + "grad_norm": 0.40941184330175634, + "learning_rate": 4.250401812939106e-06, + "loss": 0.5909, + "step": 5794 + }, + { + "epoch": 1.5384309040223019, + "grad_norm": 0.41099698682452285, + "learning_rate": 4.25015252368253e-06, + "loss": 0.5962, + "step": 5795 + }, + { + "epoch": 1.5386964024956855, + "grad_norm": 0.42019408909110584, + "learning_rate": 4.2499032002936955e-06, + "loss": 0.6372, + "step": 5796 + }, + { + "epoch": 1.5389619009690694, + "grad_norm": 0.431586458670598, + "learning_rate": 4.249653842777465e-06, + "loss": 0.5749, + "step": 5797 + }, + { + "epoch": 1.5392273994424532, + "grad_norm": 0.44193927000098066, + "learning_rate": 4.2494044511387e-06, + "loss": 0.5617, + "step": 5798 + }, + { + "epoch": 1.5394928979158369, + "grad_norm": 0.3975668084401015, + "learning_rate": 4.249155025382265e-06, + "loss": 0.6112, + "step": 5799 + }, + { + "epoch": 1.5397583963892207, + "grad_norm": 0.4143076607183988, + "learning_rate": 4.248905565513023e-06, + "loss": 0.5796, + "step": 5800 + }, + { + "epoch": 1.5400238948626046, + "grad_norm": 0.4864694341534911, + "learning_rate": 4.248656071535842e-06, + "loss": 0.5872, + "step": 5801 + }, + { + "epoch": 1.5402893933359882, + "grad_norm": 0.4351270774596731, + "learning_rate": 4.2484065434555855e-06, + "loss": 0.574, + "step": 5802 + }, + { + "epoch": 1.540554891809372, + "grad_norm": 0.4064014103644156, + "learning_rate": 4.24815698127712e-06, + "loss": 0.5622, + "step": 5803 + }, + { + "epoch": 1.540820390282756, + "grad_norm": 0.42124028804181896, + "learning_rate": 4.247907385005314e-06, + "loss": 0.6258, + "step": 5804 + }, + { + "epoch": 1.5410858887561396, + "grad_norm": 0.47537886182196665, + "learning_rate": 4.247657754645035e-06, + "loss": 0.6112, + "step": 5805 + }, + { + "epoch": 1.5413513872295235, + "grad_norm": 0.40091313115791594, + "learning_rate": 4.24740809020115e-06, + "loss": 0.5904, + "step": 5806 + }, + { + "epoch": 1.5416168857029073, + "grad_norm": 0.409271462210794, + "learning_rate": 4.2471583916785286e-06, + "loss": 0.5797, + "step": 5807 + }, + { + "epoch": 1.541882384176291, + "grad_norm": 0.4323446005740633, + "learning_rate": 4.246908659082041e-06, + "loss": 0.5897, + "step": 5808 + }, + { + "epoch": 1.5421478826496746, + "grad_norm": 0.41291909640193863, + "learning_rate": 4.246658892416557e-06, + "loss": 0.6087, + "step": 5809 + }, + { + "epoch": 1.5424133811230587, + "grad_norm": 0.4100678291779503, + "learning_rate": 4.246409091686949e-06, + "loss": 0.5939, + "step": 5810 + }, + { + "epoch": 1.5426788795964423, + "grad_norm": 0.41200681198697153, + "learning_rate": 4.246159256898087e-06, + "loss": 0.5434, + "step": 5811 + }, + { + "epoch": 1.542944378069826, + "grad_norm": 0.40083888715965993, + "learning_rate": 4.2459093880548454e-06, + "loss": 0.5984, + "step": 5812 + }, + { + "epoch": 1.5432098765432098, + "grad_norm": 0.41168998752501446, + "learning_rate": 4.245659485162094e-06, + "loss": 0.5641, + "step": 5813 + }, + { + "epoch": 1.5434753750165937, + "grad_norm": 0.4219655027983877, + "learning_rate": 4.24540954822471e-06, + "loss": 0.5843, + "step": 5814 + }, + { + "epoch": 1.5437408734899773, + "grad_norm": 0.40222172974883863, + "learning_rate": 4.245159577247566e-06, + "loss": 0.5937, + "step": 5815 + }, + { + "epoch": 1.5440063719633612, + "grad_norm": 0.41471412291120374, + "learning_rate": 4.244909572235537e-06, + "loss": 0.5673, + "step": 5816 + }, + { + "epoch": 1.544271870436745, + "grad_norm": 0.41427992842533257, + "learning_rate": 4.2446595331934994e-06, + "loss": 0.6154, + "step": 5817 + }, + { + "epoch": 1.5445373689101287, + "grad_norm": 0.41141856956047185, + "learning_rate": 4.2444094601263294e-06, + "loss": 0.5963, + "step": 5818 + }, + { + "epoch": 1.5448028673835126, + "grad_norm": 0.41672842956731926, + "learning_rate": 4.244159353038903e-06, + "loss": 0.5678, + "step": 5819 + }, + { + "epoch": 1.5450683658568964, + "grad_norm": 0.3978426799042112, + "learning_rate": 4.243909211936099e-06, + "loss": 0.6011, + "step": 5820 + }, + { + "epoch": 1.54533386433028, + "grad_norm": 0.42027792805681297, + "learning_rate": 4.243659036822797e-06, + "loss": 0.608, + "step": 5821 + }, + { + "epoch": 1.5455993628036637, + "grad_norm": 0.4126590542300162, + "learning_rate": 4.243408827703873e-06, + "loss": 0.5786, + "step": 5822 + }, + { + "epoch": 1.5458648612770478, + "grad_norm": 0.41610498361438586, + "learning_rate": 4.243158584584208e-06, + "loss": 0.6142, + "step": 5823 + }, + { + "epoch": 1.5461303597504314, + "grad_norm": 0.4129868549044019, + "learning_rate": 4.242908307468684e-06, + "loss": 0.6091, + "step": 5824 + }, + { + "epoch": 1.546395858223815, + "grad_norm": 0.4043845470339631, + "learning_rate": 4.24265799636218e-06, + "loss": 0.6179, + "step": 5825 + }, + { + "epoch": 1.546661356697199, + "grad_norm": 0.4157360122665706, + "learning_rate": 4.242407651269578e-06, + "loss": 0.57, + "step": 5826 + }, + { + "epoch": 1.5469268551705828, + "grad_norm": 0.4048007360187365, + "learning_rate": 4.242157272195762e-06, + "loss": 0.5828, + "step": 5827 + }, + { + "epoch": 1.5471923536439665, + "grad_norm": 0.40256862972778257, + "learning_rate": 4.241906859145611e-06, + "loss": 0.5625, + "step": 5828 + }, + { + "epoch": 1.5474578521173503, + "grad_norm": 0.4016118657995483, + "learning_rate": 4.2416564121240135e-06, + "loss": 0.5954, + "step": 5829 + }, + { + "epoch": 1.5477233505907342, + "grad_norm": 0.4037782828613498, + "learning_rate": 4.2414059311358516e-06, + "loss": 0.5909, + "step": 5830 + }, + { + "epoch": 1.5479888490641178, + "grad_norm": 0.41781049004740584, + "learning_rate": 4.2411554161860095e-06, + "loss": 0.5459, + "step": 5831 + }, + { + "epoch": 1.5482543475375017, + "grad_norm": 0.4019925451988796, + "learning_rate": 4.240904867279374e-06, + "loss": 0.6088, + "step": 5832 + }, + { + "epoch": 1.5485198460108855, + "grad_norm": 0.4055333673670718, + "learning_rate": 4.240654284420831e-06, + "loss": 0.6078, + "step": 5833 + }, + { + "epoch": 1.5487853444842692, + "grad_norm": 0.4186215945187696, + "learning_rate": 4.240403667615268e-06, + "loss": 0.6023, + "step": 5834 + }, + { + "epoch": 1.5490508429576528, + "grad_norm": 0.405876514870244, + "learning_rate": 4.240153016867572e-06, + "loss": 0.5933, + "step": 5835 + }, + { + "epoch": 1.549316341431037, + "grad_norm": 0.41630704898642606, + "learning_rate": 4.2399023321826316e-06, + "loss": 0.6102, + "step": 5836 + }, + { + "epoch": 1.5495818399044206, + "grad_norm": 0.3978300852038402, + "learning_rate": 4.239651613565335e-06, + "loss": 0.593, + "step": 5837 + }, + { + "epoch": 1.5498473383778042, + "grad_norm": 0.4072855438589772, + "learning_rate": 4.239400861020574e-06, + "loss": 0.6128, + "step": 5838 + }, + { + "epoch": 1.550112836851188, + "grad_norm": 0.4139393078405513, + "learning_rate": 4.239150074553236e-06, + "loss": 0.6129, + "step": 5839 + }, + { + "epoch": 1.550378335324572, + "grad_norm": 0.42332588918592934, + "learning_rate": 4.238899254168214e-06, + "loss": 0.5804, + "step": 5840 + }, + { + "epoch": 1.5506438337979556, + "grad_norm": 0.40530174807263925, + "learning_rate": 4.238648399870399e-06, + "loss": 0.5945, + "step": 5841 + }, + { + "epoch": 1.5509093322713394, + "grad_norm": 0.41599619653155445, + "learning_rate": 4.238397511664683e-06, + "loss": 0.5572, + "step": 5842 + }, + { + "epoch": 1.5511748307447233, + "grad_norm": 0.4035182662794045, + "learning_rate": 4.238146589555959e-06, + "loss": 0.5615, + "step": 5843 + }, + { + "epoch": 1.551440329218107, + "grad_norm": 0.42470619214545546, + "learning_rate": 4.2378956335491216e-06, + "loss": 0.6011, + "step": 5844 + }, + { + "epoch": 1.5517058276914908, + "grad_norm": 0.4190231135994607, + "learning_rate": 4.237644643649064e-06, + "loss": 0.5819, + "step": 5845 + }, + { + "epoch": 1.5519713261648747, + "grad_norm": 0.39722209428262373, + "learning_rate": 4.237393619860681e-06, + "loss": 0.603, + "step": 5846 + }, + { + "epoch": 1.5522368246382583, + "grad_norm": 0.43323014455335324, + "learning_rate": 4.237142562188869e-06, + "loss": 0.5904, + "step": 5847 + }, + { + "epoch": 1.5525023231116422, + "grad_norm": 0.415443619871322, + "learning_rate": 4.236891470638524e-06, + "loss": 0.6152, + "step": 5848 + }, + { + "epoch": 1.552767821585026, + "grad_norm": 0.4098809981824822, + "learning_rate": 4.236640345214541e-06, + "loss": 0.5871, + "step": 5849 + }, + { + "epoch": 1.5530333200584097, + "grad_norm": 0.405467005488627, + "learning_rate": 4.236389185921821e-06, + "loss": 0.5832, + "step": 5850 + }, + { + "epoch": 1.5532988185317933, + "grad_norm": 0.4034172674951067, + "learning_rate": 4.236137992765259e-06, + "loss": 0.6032, + "step": 5851 + }, + { + "epoch": 1.5535643170051774, + "grad_norm": 0.4320247273570949, + "learning_rate": 4.235886765749757e-06, + "loss": 0.5925, + "step": 5852 + }, + { + "epoch": 1.553829815478561, + "grad_norm": 0.41077635104360777, + "learning_rate": 4.235635504880212e-06, + "loss": 0.5762, + "step": 5853 + }, + { + "epoch": 1.5540953139519447, + "grad_norm": 0.3926395662340222, + "learning_rate": 4.235384210161525e-06, + "loss": 0.5661, + "step": 5854 + }, + { + "epoch": 1.5543608124253285, + "grad_norm": 0.4046592172890656, + "learning_rate": 4.235132881598597e-06, + "loss": 0.5823, + "step": 5855 + }, + { + "epoch": 1.5546263108987124, + "grad_norm": 0.39844549991809963, + "learning_rate": 4.23488151919633e-06, + "loss": 0.566, + "step": 5856 + }, + { + "epoch": 1.554891809372096, + "grad_norm": 0.43014457763183267, + "learning_rate": 4.2346301229596255e-06, + "loss": 0.5721, + "step": 5857 + }, + { + "epoch": 1.55515730784548, + "grad_norm": 0.39965980908173704, + "learning_rate": 4.234378692893386e-06, + "loss": 0.5692, + "step": 5858 + }, + { + "epoch": 1.5554228063188638, + "grad_norm": 0.4075590108772619, + "learning_rate": 4.234127229002516e-06, + "loss": 0.6058, + "step": 5859 + }, + { + "epoch": 1.5556883047922474, + "grad_norm": 0.4023674993181511, + "learning_rate": 4.2338757312919184e-06, + "loss": 0.6049, + "step": 5860 + }, + { + "epoch": 1.5559538032656313, + "grad_norm": 0.4014781129877831, + "learning_rate": 4.233624199766499e-06, + "loss": 0.5843, + "step": 5861 + }, + { + "epoch": 1.5562193017390151, + "grad_norm": 0.4235314768565759, + "learning_rate": 4.233372634431164e-06, + "loss": 0.5637, + "step": 5862 + }, + { + "epoch": 1.5564848002123988, + "grad_norm": 0.4087101517562995, + "learning_rate": 4.233121035290818e-06, + "loss": 0.568, + "step": 5863 + }, + { + "epoch": 1.5567502986857824, + "grad_norm": 0.40102443397848125, + "learning_rate": 4.232869402350368e-06, + "loss": 0.5987, + "step": 5864 + }, + { + "epoch": 1.5570157971591665, + "grad_norm": 0.4285702642848026, + "learning_rate": 4.232617735614723e-06, + "loss": 0.5869, + "step": 5865 + }, + { + "epoch": 1.5572812956325501, + "grad_norm": 0.41656109106347255, + "learning_rate": 4.232366035088789e-06, + "loss": 0.5616, + "step": 5866 + }, + { + "epoch": 1.5575467941059338, + "grad_norm": 0.41884577853795607, + "learning_rate": 4.232114300777476e-06, + "loss": 0.6016, + "step": 5867 + }, + { + "epoch": 1.5578122925793176, + "grad_norm": 0.4013414373630112, + "learning_rate": 4.231862532685693e-06, + "loss": 0.6185, + "step": 5868 + }, + { + "epoch": 1.5580777910527015, + "grad_norm": 0.4124732319704868, + "learning_rate": 4.23161073081835e-06, + "loss": 0.6101, + "step": 5869 + }, + { + "epoch": 1.5583432895260851, + "grad_norm": 0.41176002739281914, + "learning_rate": 4.231358895180358e-06, + "loss": 0.5694, + "step": 5870 + }, + { + "epoch": 1.558608787999469, + "grad_norm": 0.41132520728093724, + "learning_rate": 4.23110702577663e-06, + "loss": 0.5927, + "step": 5871 + }, + { + "epoch": 1.5588742864728529, + "grad_norm": 0.41557830903833526, + "learning_rate": 4.2308551226120745e-06, + "loss": 0.5622, + "step": 5872 + }, + { + "epoch": 1.5591397849462365, + "grad_norm": 0.40339539707210914, + "learning_rate": 4.230603185691607e-06, + "loss": 0.558, + "step": 5873 + }, + { + "epoch": 1.5594052834196204, + "grad_norm": 0.40176190724417543, + "learning_rate": 4.230351215020141e-06, + "loss": 0.6058, + "step": 5874 + }, + { + "epoch": 1.5596707818930042, + "grad_norm": 0.41817120041728995, + "learning_rate": 4.230099210602588e-06, + "loss": 0.6008, + "step": 5875 + }, + { + "epoch": 1.5599362803663879, + "grad_norm": 0.4010513109388064, + "learning_rate": 4.229847172443866e-06, + "loss": 0.6136, + "step": 5876 + }, + { + "epoch": 1.5602017788397715, + "grad_norm": 0.39602537916768554, + "learning_rate": 4.229595100548888e-06, + "loss": 0.5022, + "step": 5877 + }, + { + "epoch": 1.5604672773131556, + "grad_norm": 0.4044732716956632, + "learning_rate": 4.229342994922571e-06, + "loss": 0.5858, + "step": 5878 + }, + { + "epoch": 1.5607327757865392, + "grad_norm": 0.4217154112969024, + "learning_rate": 4.2290908555698315e-06, + "loss": 0.5923, + "step": 5879 + }, + { + "epoch": 1.5609982742599229, + "grad_norm": 0.4132991401352576, + "learning_rate": 4.228838682495587e-06, + "loss": 0.6226, + "step": 5880 + }, + { + "epoch": 1.5612637727333067, + "grad_norm": 0.4091460801184228, + "learning_rate": 4.228586475704753e-06, + "loss": 0.5588, + "step": 5881 + }, + { + "epoch": 1.5615292712066906, + "grad_norm": 0.4042625894178759, + "learning_rate": 4.228334235202253e-06, + "loss": 0.5985, + "step": 5882 + }, + { + "epoch": 1.5617947696800742, + "grad_norm": 0.41057629366254517, + "learning_rate": 4.228081960993003e-06, + "loss": 0.5645, + "step": 5883 + }, + { + "epoch": 1.562060268153458, + "grad_norm": 0.4027692636260605, + "learning_rate": 4.227829653081924e-06, + "loss": 0.604, + "step": 5884 + }, + { + "epoch": 1.562325766626842, + "grad_norm": 0.41716933357947744, + "learning_rate": 4.227577311473936e-06, + "loss": 0.5976, + "step": 5885 + }, + { + "epoch": 1.5625912651002256, + "grad_norm": 0.41674950240607905, + "learning_rate": 4.22732493617396e-06, + "loss": 0.6123, + "step": 5886 + }, + { + "epoch": 1.5628567635736095, + "grad_norm": 0.4081189975922936, + "learning_rate": 4.2270725271869195e-06, + "loss": 0.5792, + "step": 5887 + }, + { + "epoch": 1.5631222620469933, + "grad_norm": 0.41343739306195226, + "learning_rate": 4.226820084517736e-06, + "loss": 0.568, + "step": 5888 + }, + { + "epoch": 1.563387760520377, + "grad_norm": 0.4002282207509223, + "learning_rate": 4.226567608171332e-06, + "loss": 0.5809, + "step": 5889 + }, + { + "epoch": 1.5636532589937606, + "grad_norm": 0.41760442619703514, + "learning_rate": 4.226315098152633e-06, + "loss": 0.5817, + "step": 5890 + }, + { + "epoch": 1.5639187574671447, + "grad_norm": 0.4160844121834743, + "learning_rate": 4.226062554466562e-06, + "loss": 0.6201, + "step": 5891 + }, + { + "epoch": 1.5641842559405283, + "grad_norm": 0.398262143753868, + "learning_rate": 4.225809977118046e-06, + "loss": 0.5656, + "step": 5892 + }, + { + "epoch": 1.564449754413912, + "grad_norm": 0.41444698291950066, + "learning_rate": 4.2255573661120095e-06, + "loss": 0.5772, + "step": 5893 + }, + { + "epoch": 1.5647152528872958, + "grad_norm": 0.4063777185920357, + "learning_rate": 4.22530472145338e-06, + "loss": 0.6136, + "step": 5894 + }, + { + "epoch": 1.5649807513606797, + "grad_norm": 0.41199836514667787, + "learning_rate": 4.225052043147083e-06, + "loss": 0.564, + "step": 5895 + }, + { + "epoch": 1.5652462498340634, + "grad_norm": 0.4068644710297008, + "learning_rate": 4.224799331198049e-06, + "loss": 0.6002, + "step": 5896 + }, + { + "epoch": 1.5655117483074472, + "grad_norm": 0.4170324572753706, + "learning_rate": 4.2245465856112044e-06, + "loss": 0.5929, + "step": 5897 + }, + { + "epoch": 1.565777246780831, + "grad_norm": 0.4076004639658558, + "learning_rate": 4.2242938063914784e-06, + "loss": 0.5926, + "step": 5898 + }, + { + "epoch": 1.5660427452542147, + "grad_norm": 0.4136370170116393, + "learning_rate": 4.2240409935438024e-06, + "loss": 0.5934, + "step": 5899 + }, + { + "epoch": 1.5663082437275986, + "grad_norm": 0.4006854615382548, + "learning_rate": 4.2237881470731044e-06, + "loss": 0.5873, + "step": 5900 + }, + { + "epoch": 1.5665737422009824, + "grad_norm": 0.40757664191477816, + "learning_rate": 4.2235352669843185e-06, + "loss": 0.59, + "step": 5901 + }, + { + "epoch": 1.566839240674366, + "grad_norm": 0.4030121772108053, + "learning_rate": 4.223282353282374e-06, + "loss": 0.6202, + "step": 5902 + }, + { + "epoch": 1.56710473914775, + "grad_norm": 0.43258334978943097, + "learning_rate": 4.223029405972204e-06, + "loss": 0.6048, + "step": 5903 + }, + { + "epoch": 1.5673702376211338, + "grad_norm": 0.4218620751956037, + "learning_rate": 4.222776425058743e-06, + "loss": 0.5808, + "step": 5904 + }, + { + "epoch": 1.5676357360945175, + "grad_norm": 0.418534280963697, + "learning_rate": 4.222523410546924e-06, + "loss": 0.5855, + "step": 5905 + }, + { + "epoch": 1.567901234567901, + "grad_norm": 0.43832169659519216, + "learning_rate": 4.2222703624416795e-06, + "loss": 0.5931, + "step": 5906 + }, + { + "epoch": 1.5681667330412852, + "grad_norm": 0.4219943324386712, + "learning_rate": 4.222017280747947e-06, + "loss": 0.6069, + "step": 5907 + }, + { + "epoch": 1.5684322315146688, + "grad_norm": 0.4150603646712336, + "learning_rate": 4.221764165470661e-06, + "loss": 0.6103, + "step": 5908 + }, + { + "epoch": 1.5686977299880525, + "grad_norm": 0.4295903044425061, + "learning_rate": 4.221511016614757e-06, + "loss": 0.566, + "step": 5909 + }, + { + "epoch": 1.5689632284614363, + "grad_norm": 0.4356518803526646, + "learning_rate": 4.221257834185175e-06, + "loss": 0.5799, + "step": 5910 + }, + { + "epoch": 1.5692287269348202, + "grad_norm": 0.4157975325252088, + "learning_rate": 4.22100461818685e-06, + "loss": 0.6036, + "step": 5911 + }, + { + "epoch": 1.5694942254082038, + "grad_norm": 0.4052430668887788, + "learning_rate": 4.220751368624722e-06, + "loss": 0.5994, + "step": 5912 + }, + { + "epoch": 1.5697597238815877, + "grad_norm": 0.428700519188456, + "learning_rate": 4.220498085503728e-06, + "loss": 0.5947, + "step": 5913 + }, + { + "epoch": 1.5700252223549716, + "grad_norm": 0.41918695540082573, + "learning_rate": 4.2202447688288095e-06, + "loss": 0.5877, + "step": 5914 + }, + { + "epoch": 1.5702907208283552, + "grad_norm": 0.4153244902174952, + "learning_rate": 4.219991418604906e-06, + "loss": 0.6168, + "step": 5915 + }, + { + "epoch": 1.570556219301739, + "grad_norm": 0.42847546456605407, + "learning_rate": 4.2197380348369585e-06, + "loss": 0.5769, + "step": 5916 + }, + { + "epoch": 1.570821717775123, + "grad_norm": 0.4196710552880703, + "learning_rate": 4.219484617529909e-06, + "loss": 0.5765, + "step": 5917 + }, + { + "epoch": 1.5710872162485066, + "grad_norm": 0.41646039173416527, + "learning_rate": 4.2192311666887e-06, + "loss": 0.6288, + "step": 5918 + }, + { + "epoch": 1.5713527147218902, + "grad_norm": 0.42930802234671644, + "learning_rate": 4.2189776823182725e-06, + "loss": 0.5775, + "step": 5919 + }, + { + "epoch": 1.5716182131952743, + "grad_norm": 0.45519510858263085, + "learning_rate": 4.218724164423572e-06, + "loss": 0.5958, + "step": 5920 + }, + { + "epoch": 1.571883711668658, + "grad_norm": 0.43972190133326644, + "learning_rate": 4.218470613009542e-06, + "loss": 0.5628, + "step": 5921 + }, + { + "epoch": 1.5721492101420416, + "grad_norm": 0.389497709810723, + "learning_rate": 4.218217028081128e-06, + "loss": 0.584, + "step": 5922 + }, + { + "epoch": 1.5724147086154254, + "grad_norm": 0.452147906159794, + "learning_rate": 4.217963409643274e-06, + "loss": 0.6105, + "step": 5923 + }, + { + "epoch": 1.5726802070888093, + "grad_norm": 0.43340560000034933, + "learning_rate": 4.217709757700928e-06, + "loss": 0.5949, + "step": 5924 + }, + { + "epoch": 1.572945705562193, + "grad_norm": 0.43131757974404655, + "learning_rate": 4.217456072259036e-06, + "loss": 0.6204, + "step": 5925 + }, + { + "epoch": 1.5732112040355768, + "grad_norm": 0.4044872744621555, + "learning_rate": 4.2172023533225455e-06, + "loss": 0.6001, + "step": 5926 + }, + { + "epoch": 1.5734767025089607, + "grad_norm": 0.4109284103509459, + "learning_rate": 4.216948600896404e-06, + "loss": 0.5759, + "step": 5927 + }, + { + "epoch": 1.5737422009823443, + "grad_norm": 0.40287804903654656, + "learning_rate": 4.2166948149855625e-06, + "loss": 0.5871, + "step": 5928 + }, + { + "epoch": 1.5740076994557282, + "grad_norm": 0.4044696521998476, + "learning_rate": 4.216440995594968e-06, + "loss": 0.5743, + "step": 5929 + }, + { + "epoch": 1.574273197929112, + "grad_norm": 0.39533156802861547, + "learning_rate": 4.216187142729571e-06, + "loss": 0.5603, + "step": 5930 + }, + { + "epoch": 1.5745386964024957, + "grad_norm": 0.40899463646742273, + "learning_rate": 4.2159332563943236e-06, + "loss": 0.5852, + "step": 5931 + }, + { + "epoch": 1.5748041948758793, + "grad_norm": 0.41925956785571183, + "learning_rate": 4.215679336594176e-06, + "loss": 0.6013, + "step": 5932 + }, + { + "epoch": 1.5750696933492634, + "grad_norm": 0.4189328959409041, + "learning_rate": 4.2154253833340806e-06, + "loss": 0.5815, + "step": 5933 + }, + { + "epoch": 1.575335191822647, + "grad_norm": 0.40386936387151323, + "learning_rate": 4.21517139661899e-06, + "loss": 0.5757, + "step": 5934 + }, + { + "epoch": 1.5756006902960307, + "grad_norm": 0.4170304415279702, + "learning_rate": 4.2149173764538585e-06, + "loss": 0.5919, + "step": 5935 + }, + { + "epoch": 1.5758661887694145, + "grad_norm": 0.40576731645784875, + "learning_rate": 4.214663322843639e-06, + "loss": 0.6074, + "step": 5936 + }, + { + "epoch": 1.5761316872427984, + "grad_norm": 0.40562659783285193, + "learning_rate": 4.2144092357932866e-06, + "loss": 0.5477, + "step": 5937 + }, + { + "epoch": 1.576397185716182, + "grad_norm": 0.40788260610754606, + "learning_rate": 4.214155115307755e-06, + "loss": 0.6253, + "step": 5938 + }, + { + "epoch": 1.576662684189566, + "grad_norm": 0.40154662351773246, + "learning_rate": 4.2139009613920035e-06, + "loss": 0.5475, + "step": 5939 + }, + { + "epoch": 1.5769281826629498, + "grad_norm": 0.4115552774617222, + "learning_rate": 4.213646774050986e-06, + "loss": 0.5875, + "step": 5940 + }, + { + "epoch": 1.5771936811363334, + "grad_norm": 0.4144542959024575, + "learning_rate": 4.2133925532896604e-06, + "loss": 0.5713, + "step": 5941 + }, + { + "epoch": 1.5774591796097173, + "grad_norm": 0.41059310937891563, + "learning_rate": 4.213138299112986e-06, + "loss": 0.6193, + "step": 5942 + }, + { + "epoch": 1.5777246780831011, + "grad_norm": 0.4147022724289457, + "learning_rate": 4.21288401152592e-06, + "loss": 0.6137, + "step": 5943 + }, + { + "epoch": 1.5779901765564848, + "grad_norm": 0.40987596971745355, + "learning_rate": 4.212629690533421e-06, + "loss": 0.6239, + "step": 5944 + }, + { + "epoch": 1.5782556750298686, + "grad_norm": 0.41194164881596446, + "learning_rate": 4.212375336140451e-06, + "loss": 0.5555, + "step": 5945 + }, + { + "epoch": 1.5785211735032525, + "grad_norm": 0.40441671216400144, + "learning_rate": 4.2121209483519685e-06, + "loss": 0.5932, + "step": 5946 + }, + { + "epoch": 1.5787866719766361, + "grad_norm": 0.4129208472987318, + "learning_rate": 4.211866527172935e-06, + "loss": 0.591, + "step": 5947 + }, + { + "epoch": 1.5790521704500198, + "grad_norm": 0.4188609381846575, + "learning_rate": 4.211612072608314e-06, + "loss": 0.5388, + "step": 5948 + }, + { + "epoch": 1.5793176689234036, + "grad_norm": 0.40912721622715353, + "learning_rate": 4.211357584663066e-06, + "loss": 0.6259, + "step": 5949 + }, + { + "epoch": 1.5795831673967875, + "grad_norm": 0.4060790567069496, + "learning_rate": 4.2111030633421556e-06, + "loss": 0.5751, + "step": 5950 + }, + { + "epoch": 1.5798486658701711, + "grad_norm": 0.41568493885817687, + "learning_rate": 4.210848508650546e-06, + "loss": 0.6012, + "step": 5951 + }, + { + "epoch": 1.580114164343555, + "grad_norm": 0.4237085114846836, + "learning_rate": 4.210593920593201e-06, + "loss": 0.6048, + "step": 5952 + }, + { + "epoch": 1.5803796628169389, + "grad_norm": 0.4087352108341222, + "learning_rate": 4.2103392991750865e-06, + "loss": 0.5951, + "step": 5953 + }, + { + "epoch": 1.5806451612903225, + "grad_norm": 0.44598374346046615, + "learning_rate": 4.210084644401169e-06, + "loss": 0.5876, + "step": 5954 + }, + { + "epoch": 1.5809106597637064, + "grad_norm": 0.4029743827997855, + "learning_rate": 4.209829956276413e-06, + "loss": 0.5981, + "step": 5955 + }, + { + "epoch": 1.5811761582370902, + "grad_norm": 0.4121599418975682, + "learning_rate": 4.209575234805786e-06, + "loss": 0.5556, + "step": 5956 + }, + { + "epoch": 1.5814416567104739, + "grad_norm": 0.425859398239993, + "learning_rate": 4.2093204799942565e-06, + "loss": 0.5854, + "step": 5957 + }, + { + "epoch": 1.5817071551838577, + "grad_norm": 0.43541061108265505, + "learning_rate": 4.2090656918467935e-06, + "loss": 0.5773, + "step": 5958 + }, + { + "epoch": 1.5819726536572416, + "grad_norm": 0.40816877271914176, + "learning_rate": 4.208810870368364e-06, + "loss": 0.632, + "step": 5959 + }, + { + "epoch": 1.5822381521306252, + "grad_norm": 0.4155544602766432, + "learning_rate": 4.2085560155639385e-06, + "loss": 0.5858, + "step": 5960 + }, + { + "epoch": 1.5825036506040089, + "grad_norm": 0.3944216205024293, + "learning_rate": 4.208301127438487e-06, + "loss": 0.5781, + "step": 5961 + }, + { + "epoch": 1.582769149077393, + "grad_norm": 0.40462766703041153, + "learning_rate": 4.208046205996983e-06, + "loss": 0.5528, + "step": 5962 + }, + { + "epoch": 1.5830346475507766, + "grad_norm": 0.4379330632387086, + "learning_rate": 4.207791251244394e-06, + "loss": 0.5587, + "step": 5963 + }, + { + "epoch": 1.5833001460241602, + "grad_norm": 0.4188592762053884, + "learning_rate": 4.207536263185695e-06, + "loss": 0.5745, + "step": 5964 + }, + { + "epoch": 1.5835656444975441, + "grad_norm": 0.3981961743754073, + "learning_rate": 4.207281241825858e-06, + "loss": 0.5647, + "step": 5965 + }, + { + "epoch": 1.583831142970928, + "grad_norm": 0.4189791875112949, + "learning_rate": 4.207026187169856e-06, + "loss": 0.6078, + "step": 5966 + }, + { + "epoch": 1.5840966414443116, + "grad_norm": 0.4063913160380496, + "learning_rate": 4.206771099222665e-06, + "loss": 0.5822, + "step": 5967 + }, + { + "epoch": 1.5843621399176955, + "grad_norm": 0.4149144916651215, + "learning_rate": 4.206515977989258e-06, + "loss": 0.603, + "step": 5968 + }, + { + "epoch": 1.5846276383910793, + "grad_norm": 0.4016819064469472, + "learning_rate": 4.206260823474612e-06, + "loss": 0.5855, + "step": 5969 + }, + { + "epoch": 1.584893136864463, + "grad_norm": 0.4057378993228367, + "learning_rate": 4.206005635683701e-06, + "loss": 0.5834, + "step": 5970 + }, + { + "epoch": 1.5851586353378468, + "grad_norm": 0.38687882269812934, + "learning_rate": 4.205750414621503e-06, + "loss": 0.5702, + "step": 5971 + }, + { + "epoch": 1.5854241338112307, + "grad_norm": 0.4060709000613773, + "learning_rate": 4.205495160292996e-06, + "loss": 0.5782, + "step": 5972 + }, + { + "epoch": 1.5856896322846143, + "grad_norm": 0.4145604152503478, + "learning_rate": 4.205239872703158e-06, + "loss": 0.5931, + "step": 5973 + }, + { + "epoch": 1.585955130757998, + "grad_norm": 0.39135702850820037, + "learning_rate": 4.204984551856968e-06, + "loss": 0.563, + "step": 5974 + }, + { + "epoch": 1.586220629231382, + "grad_norm": 0.4001288730345016, + "learning_rate": 4.204729197759403e-06, + "loss": 0.5772, + "step": 5975 + }, + { + "epoch": 1.5864861277047657, + "grad_norm": 0.41946455918791337, + "learning_rate": 4.204473810415446e-06, + "loss": 0.6101, + "step": 5976 + }, + { + "epoch": 1.5867516261781494, + "grad_norm": 0.3999852785381553, + "learning_rate": 4.204218389830076e-06, + "loss": 0.5842, + "step": 5977 + }, + { + "epoch": 1.5870171246515332, + "grad_norm": 0.4215461550554838, + "learning_rate": 4.203962936008275e-06, + "loss": 0.6235, + "step": 5978 + }, + { + "epoch": 1.587282623124917, + "grad_norm": 0.42373764730836094, + "learning_rate": 4.203707448955024e-06, + "loss": 0.5966, + "step": 5979 + }, + { + "epoch": 1.5875481215983007, + "grad_norm": 0.429760373243519, + "learning_rate": 4.203451928675306e-06, + "loss": 0.6217, + "step": 5980 + }, + { + "epoch": 1.5878136200716846, + "grad_norm": 0.4129260216197234, + "learning_rate": 4.203196375174106e-06, + "loss": 0.5545, + "step": 5981 + }, + { + "epoch": 1.5880791185450684, + "grad_norm": 0.39771812223466274, + "learning_rate": 4.202940788456405e-06, + "loss": 0.6288, + "step": 5982 + }, + { + "epoch": 1.588344617018452, + "grad_norm": 0.3903532495377373, + "learning_rate": 4.202685168527191e-06, + "loss": 0.5778, + "step": 5983 + }, + { + "epoch": 1.588610115491836, + "grad_norm": 0.39689511354125345, + "learning_rate": 4.202429515391446e-06, + "loss": 0.5216, + "step": 5984 + }, + { + "epoch": 1.5888756139652198, + "grad_norm": 0.41033234328965706, + "learning_rate": 4.2021738290541585e-06, + "loss": 0.5454, + "step": 5985 + }, + { + "epoch": 1.5891411124386035, + "grad_norm": 0.4057344824731183, + "learning_rate": 4.201918109520312e-06, + "loss": 0.6122, + "step": 5986 + }, + { + "epoch": 1.589406610911987, + "grad_norm": 0.40231300412898074, + "learning_rate": 4.201662356794896e-06, + "loss": 0.5632, + "step": 5987 + }, + { + "epoch": 1.5896721093853712, + "grad_norm": 0.4265490622011692, + "learning_rate": 4.201406570882898e-06, + "loss": 0.6224, + "step": 5988 + }, + { + "epoch": 1.5899376078587548, + "grad_norm": 0.40712687510895884, + "learning_rate": 4.201150751789305e-06, + "loss": 0.6053, + "step": 5989 + }, + { + "epoch": 1.5902031063321385, + "grad_norm": 0.4091628103122109, + "learning_rate": 4.2008948995191085e-06, + "loss": 0.571, + "step": 5990 + }, + { + "epoch": 1.5904686048055223, + "grad_norm": 0.41336670515306734, + "learning_rate": 4.200639014077297e-06, + "loss": 0.5782, + "step": 5991 + }, + { + "epoch": 1.5907341032789062, + "grad_norm": 0.4046226993967342, + "learning_rate": 4.2003830954688605e-06, + "loss": 0.5894, + "step": 5992 + }, + { + "epoch": 1.5909996017522898, + "grad_norm": 0.4128853195306421, + "learning_rate": 4.20012714369879e-06, + "loss": 0.6186, + "step": 5993 + }, + { + "epoch": 1.5912651002256737, + "grad_norm": 0.4313063298193808, + "learning_rate": 4.199871158772077e-06, + "loss": 0.5918, + "step": 5994 + }, + { + "epoch": 1.5915305986990576, + "grad_norm": 0.3914005958264913, + "learning_rate": 4.199615140693716e-06, + "loss": 0.546, + "step": 5995 + }, + { + "epoch": 1.5917960971724412, + "grad_norm": 0.400408273087365, + "learning_rate": 4.199359089468697e-06, + "loss": 0.582, + "step": 5996 + }, + { + "epoch": 1.592061595645825, + "grad_norm": 0.41062738685374334, + "learning_rate": 4.199103005102016e-06, + "loss": 0.6125, + "step": 5997 + }, + { + "epoch": 1.592327094119209, + "grad_norm": 0.4246184671474157, + "learning_rate": 4.198846887598666e-06, + "loss": 0.5955, + "step": 5998 + }, + { + "epoch": 1.5925925925925926, + "grad_norm": 0.3967529096668717, + "learning_rate": 4.198590736963642e-06, + "loss": 0.5416, + "step": 5999 + }, + { + "epoch": 1.5928580910659764, + "grad_norm": 0.4133297856797672, + "learning_rate": 4.19833455320194e-06, + "loss": 0.639, + "step": 6000 + }, + { + "epoch": 1.5931235895393603, + "grad_norm": 0.4070327314079298, + "learning_rate": 4.198078336318555e-06, + "loss": 0.5758, + "step": 6001 + }, + { + "epoch": 1.593389088012744, + "grad_norm": 0.4202513466676356, + "learning_rate": 4.197822086318486e-06, + "loss": 0.5555, + "step": 6002 + }, + { + "epoch": 1.5936545864861276, + "grad_norm": 0.39350257469331734, + "learning_rate": 4.197565803206729e-06, + "loss": 0.5931, + "step": 6003 + }, + { + "epoch": 1.5939200849595114, + "grad_norm": 0.40212818904911024, + "learning_rate": 4.197309486988283e-06, + "loss": 0.5873, + "step": 6004 + }, + { + "epoch": 1.5941855834328953, + "grad_norm": 0.4026076412576511, + "learning_rate": 4.197053137668145e-06, + "loss": 0.6058, + "step": 6005 + }, + { + "epoch": 1.594451081906279, + "grad_norm": 0.4421551757263256, + "learning_rate": 4.196796755251317e-06, + "loss": 0.5852, + "step": 6006 + }, + { + "epoch": 1.5947165803796628, + "grad_norm": 0.4147659196130696, + "learning_rate": 4.1965403397427965e-06, + "loss": 0.5753, + "step": 6007 + }, + { + "epoch": 1.5949820788530467, + "grad_norm": 0.40457771278237803, + "learning_rate": 4.196283891147587e-06, + "loss": 0.574, + "step": 6008 + }, + { + "epoch": 1.5952475773264303, + "grad_norm": 0.39508156118232685, + "learning_rate": 4.196027409470688e-06, + "loss": 0.6126, + "step": 6009 + }, + { + "epoch": 1.5955130757998142, + "grad_norm": 0.40919573593044006, + "learning_rate": 4.195770894717101e-06, + "loss": 0.6301, + "step": 6010 + }, + { + "epoch": 1.595778574273198, + "grad_norm": 0.4107629359874985, + "learning_rate": 4.19551434689183e-06, + "loss": 0.5823, + "step": 6011 + }, + { + "epoch": 1.5960440727465817, + "grad_norm": 0.41731025839725505, + "learning_rate": 4.195257765999879e-06, + "loss": 0.6038, + "step": 6012 + }, + { + "epoch": 1.5963095712199655, + "grad_norm": 0.38997182093140825, + "learning_rate": 4.1950011520462484e-06, + "loss": 0.5811, + "step": 6013 + }, + { + "epoch": 1.5965750696933494, + "grad_norm": 0.40432310650264053, + "learning_rate": 4.194744505035948e-06, + "loss": 0.554, + "step": 6014 + }, + { + "epoch": 1.596840568166733, + "grad_norm": 0.4201659505765884, + "learning_rate": 4.194487824973979e-06, + "loss": 0.5991, + "step": 6015 + }, + { + "epoch": 1.5971060666401167, + "grad_norm": 0.3993082821757404, + "learning_rate": 4.194231111865348e-06, + "loss": 0.5555, + "step": 6016 + }, + { + "epoch": 1.5973715651135008, + "grad_norm": 0.41904984827321556, + "learning_rate": 4.193974365715064e-06, + "loss": 0.621, + "step": 6017 + }, + { + "epoch": 1.5976370635868844, + "grad_norm": 0.4090905886977012, + "learning_rate": 4.1937175865281295e-06, + "loss": 0.5936, + "step": 6018 + }, + { + "epoch": 1.597902562060268, + "grad_norm": 0.4029763981599839, + "learning_rate": 4.193460774309557e-06, + "loss": 0.6192, + "step": 6019 + }, + { + "epoch": 1.598168060533652, + "grad_norm": 0.3937157037597357, + "learning_rate": 4.1932039290643534e-06, + "loss": 0.5982, + "step": 6020 + }, + { + "epoch": 1.5984335590070358, + "grad_norm": 0.41644591247177665, + "learning_rate": 4.192947050797527e-06, + "loss": 0.6061, + "step": 6021 + }, + { + "epoch": 1.5986990574804194, + "grad_norm": 0.4342225597248977, + "learning_rate": 4.192690139514088e-06, + "loss": 0.5792, + "step": 6022 + }, + { + "epoch": 1.5989645559538033, + "grad_norm": 0.41188019422404626, + "learning_rate": 4.192433195219047e-06, + "loss": 0.5776, + "step": 6023 + }, + { + "epoch": 1.5992300544271871, + "grad_norm": 0.4246544101093638, + "learning_rate": 4.1921762179174145e-06, + "loss": 0.5981, + "step": 6024 + }, + { + "epoch": 1.5994955529005708, + "grad_norm": 0.42876932450653327, + "learning_rate": 4.191919207614203e-06, + "loss": 0.5863, + "step": 6025 + }, + { + "epoch": 1.5997610513739546, + "grad_norm": 0.40472826607595974, + "learning_rate": 4.191662164314424e-06, + "loss": 0.5915, + "step": 6026 + }, + { + "epoch": 1.6000265498473385, + "grad_norm": 0.41168054434141055, + "learning_rate": 4.19140508802309e-06, + "loss": 0.564, + "step": 6027 + }, + { + "epoch": 1.6002920483207221, + "grad_norm": 0.3986275230207836, + "learning_rate": 4.191147978745218e-06, + "loss": 0.5798, + "step": 6028 + }, + { + "epoch": 1.6005575467941058, + "grad_norm": 0.40809307681144, + "learning_rate": 4.1908908364858185e-06, + "loss": 0.5786, + "step": 6029 + }, + { + "epoch": 1.6008230452674899, + "grad_norm": 0.4026854828048606, + "learning_rate": 4.190633661249907e-06, + "loss": 0.6128, + "step": 6030 + }, + { + "epoch": 1.6010885437408735, + "grad_norm": 0.41094788097923096, + "learning_rate": 4.1903764530425005e-06, + "loss": 0.639, + "step": 6031 + }, + { + "epoch": 1.6013540422142571, + "grad_norm": 0.4049171393338981, + "learning_rate": 4.190119211868614e-06, + "loss": 0.5963, + "step": 6032 + }, + { + "epoch": 1.601619540687641, + "grad_norm": 0.41232436452152416, + "learning_rate": 4.189861937733265e-06, + "loss": 0.6036, + "step": 6033 + }, + { + "epoch": 1.6018850391610249, + "grad_norm": 0.42445584696695715, + "learning_rate": 4.1896046306414705e-06, + "loss": 0.5707, + "step": 6034 + }, + { + "epoch": 1.6021505376344085, + "grad_norm": 0.40834564499552445, + "learning_rate": 4.189347290598249e-06, + "loss": 0.5594, + "step": 6035 + }, + { + "epoch": 1.6024160361077924, + "grad_norm": 0.41454216488797413, + "learning_rate": 4.1890899176086194e-06, + "loss": 0.575, + "step": 6036 + }, + { + "epoch": 1.6026815345811762, + "grad_norm": 0.4105445653650947, + "learning_rate": 4.1888325116776004e-06, + "loss": 0.5912, + "step": 6037 + }, + { + "epoch": 1.6029470330545599, + "grad_norm": 0.41284401700146206, + "learning_rate": 4.1885750728102135e-06, + "loss": 0.5906, + "step": 6038 + }, + { + "epoch": 1.6032125315279437, + "grad_norm": 0.42300140448743107, + "learning_rate": 4.1883176010114764e-06, + "loss": 0.6081, + "step": 6039 + }, + { + "epoch": 1.6034780300013276, + "grad_norm": 0.3989886367238546, + "learning_rate": 4.188060096286414e-06, + "loss": 0.5505, + "step": 6040 + }, + { + "epoch": 1.6037435284747112, + "grad_norm": 0.4086036100972658, + "learning_rate": 4.187802558640046e-06, + "loss": 0.576, + "step": 6041 + }, + { + "epoch": 1.6040090269480949, + "grad_norm": 0.4034879193494364, + "learning_rate": 4.187544988077395e-06, + "loss": 0.5413, + "step": 6042 + }, + { + "epoch": 1.604274525421479, + "grad_norm": 0.3935537817437524, + "learning_rate": 4.187287384603486e-06, + "loss": 0.5744, + "step": 6043 + }, + { + "epoch": 1.6045400238948626, + "grad_norm": 0.4040200452183954, + "learning_rate": 4.187029748223341e-06, + "loss": 0.5938, + "step": 6044 + }, + { + "epoch": 1.6048055223682463, + "grad_norm": 0.41149813239860483, + "learning_rate": 4.186772078941986e-06, + "loss": 0.5564, + "step": 6045 + }, + { + "epoch": 1.6050710208416301, + "grad_norm": 0.40439805725675165, + "learning_rate": 4.186514376764446e-06, + "loss": 0.5449, + "step": 6046 + }, + { + "epoch": 1.605336519315014, + "grad_norm": 0.40787848283245315, + "learning_rate": 4.186256641695745e-06, + "loss": 0.5689, + "step": 6047 + }, + { + "epoch": 1.6056020177883976, + "grad_norm": 0.3926588912516516, + "learning_rate": 4.1859988737409114e-06, + "loss": 0.5778, + "step": 6048 + }, + { + "epoch": 1.6058675162617815, + "grad_norm": 0.41539247350104747, + "learning_rate": 4.185741072904972e-06, + "loss": 0.5916, + "step": 6049 + }, + { + "epoch": 1.6061330147351653, + "grad_norm": 0.4298756280854937, + "learning_rate": 4.185483239192954e-06, + "loss": 0.6093, + "step": 6050 + }, + { + "epoch": 1.606398513208549, + "grad_norm": 0.41554725769750595, + "learning_rate": 4.185225372609886e-06, + "loss": 0.585, + "step": 6051 + }, + { + "epoch": 1.6066640116819328, + "grad_norm": 0.4038829145919097, + "learning_rate": 4.184967473160797e-06, + "loss": 0.5752, + "step": 6052 + }, + { + "epoch": 1.6069295101553167, + "grad_norm": 0.4057289541482558, + "learning_rate": 4.184709540850717e-06, + "loss": 0.5947, + "step": 6053 + }, + { + "epoch": 1.6071950086287004, + "grad_norm": 0.4034778598210146, + "learning_rate": 4.184451575684677e-06, + "loss": 0.5968, + "step": 6054 + }, + { + "epoch": 1.6074605071020842, + "grad_norm": 0.4092400161315561, + "learning_rate": 4.1841935776677055e-06, + "loss": 0.531, + "step": 6055 + }, + { + "epoch": 1.607726005575468, + "grad_norm": 0.41883555978447745, + "learning_rate": 4.183935546804836e-06, + "loss": 0.5784, + "step": 6056 + }, + { + "epoch": 1.6079915040488517, + "grad_norm": 0.4083646428369843, + "learning_rate": 4.183677483101101e-06, + "loss": 0.5878, + "step": 6057 + }, + { + "epoch": 1.6082570025222354, + "grad_norm": 0.39761457937439987, + "learning_rate": 4.183419386561532e-06, + "loss": 0.6052, + "step": 6058 + }, + { + "epoch": 1.6085225009956192, + "grad_norm": 0.40667292836753893, + "learning_rate": 4.183161257191163e-06, + "loss": 0.5998, + "step": 6059 + }, + { + "epoch": 1.608787999469003, + "grad_norm": 0.4000448845102318, + "learning_rate": 4.18290309499503e-06, + "loss": 0.597, + "step": 6060 + }, + { + "epoch": 1.6090534979423867, + "grad_norm": 0.4153649457019075, + "learning_rate": 4.1826448999781646e-06, + "loss": 0.617, + "step": 6061 + }, + { + "epoch": 1.6093189964157706, + "grad_norm": 0.4097173663044732, + "learning_rate": 4.182386672145604e-06, + "loss": 0.5489, + "step": 6062 + }, + { + "epoch": 1.6095844948891544, + "grad_norm": 0.40404261362103244, + "learning_rate": 4.1821284115023845e-06, + "loss": 0.5666, + "step": 6063 + }, + { + "epoch": 1.609849993362538, + "grad_norm": 0.4312765154231968, + "learning_rate": 4.181870118053543e-06, + "loss": 0.5771, + "step": 6064 + }, + { + "epoch": 1.610115491835922, + "grad_norm": 0.427891179051653, + "learning_rate": 4.181611791804115e-06, + "loss": 0.6098, + "step": 6065 + }, + { + "epoch": 1.6103809903093058, + "grad_norm": 0.40751219237961833, + "learning_rate": 4.1813534327591405e-06, + "loss": 0.6018, + "step": 6066 + }, + { + "epoch": 1.6106464887826895, + "grad_norm": 0.4150417504875353, + "learning_rate": 4.1810950409236575e-06, + "loss": 0.5792, + "step": 6067 + }, + { + "epoch": 1.6109119872560733, + "grad_norm": 0.40469706168129826, + "learning_rate": 4.180836616302704e-06, + "loss": 0.5881, + "step": 6068 + }, + { + "epoch": 1.6111774857294572, + "grad_norm": 0.39557641769068447, + "learning_rate": 4.180578158901323e-06, + "loss": 0.5948, + "step": 6069 + }, + { + "epoch": 1.6114429842028408, + "grad_norm": 0.40207327005497584, + "learning_rate": 4.180319668724551e-06, + "loss": 0.5793, + "step": 6070 + }, + { + "epoch": 1.6117084826762245, + "grad_norm": 0.3932159742067176, + "learning_rate": 4.180061145777433e-06, + "loss": 0.5839, + "step": 6071 + }, + { + "epoch": 1.6119739811496085, + "grad_norm": 0.41540833788977327, + "learning_rate": 4.1798025900650086e-06, + "loss": 0.6116, + "step": 6072 + }, + { + "epoch": 1.6122394796229922, + "grad_norm": 0.4045738217370343, + "learning_rate": 4.179544001592321e-06, + "loss": 0.5687, + "step": 6073 + }, + { + "epoch": 1.6125049780963758, + "grad_norm": 0.4098193846988828, + "learning_rate": 4.179285380364413e-06, + "loss": 0.5994, + "step": 6074 + }, + { + "epoch": 1.6127704765697597, + "grad_norm": 0.4160047075260533, + "learning_rate": 4.179026726386329e-06, + "loss": 0.622, + "step": 6075 + }, + { + "epoch": 1.6130359750431436, + "grad_norm": 0.4121566456455558, + "learning_rate": 4.178768039663111e-06, + "loss": 0.6006, + "step": 6076 + }, + { + "epoch": 1.6133014735165272, + "grad_norm": 0.4213164058973677, + "learning_rate": 4.178509320199808e-06, + "loss": 0.5687, + "step": 6077 + }, + { + "epoch": 1.613566971989911, + "grad_norm": 0.41204682901178885, + "learning_rate": 4.1782505680014625e-06, + "loss": 0.5979, + "step": 6078 + }, + { + "epoch": 1.613832470463295, + "grad_norm": 0.41544248228873504, + "learning_rate": 4.177991783073123e-06, + "loss": 0.5945, + "step": 6079 + }, + { + "epoch": 1.6140979689366786, + "grad_norm": 0.4021412627030649, + "learning_rate": 4.177732965419834e-06, + "loss": 0.5315, + "step": 6080 + }, + { + "epoch": 1.6143634674100624, + "grad_norm": 0.4047176103043243, + "learning_rate": 4.177474115046645e-06, + "loss": 0.6031, + "step": 6081 + }, + { + "epoch": 1.6146289658834463, + "grad_norm": 0.42015720900913267, + "learning_rate": 4.177215231958604e-06, + "loss": 0.5623, + "step": 6082 + }, + { + "epoch": 1.61489446435683, + "grad_norm": 0.4019744291178643, + "learning_rate": 4.176956316160759e-06, + "loss": 0.5612, + "step": 6083 + }, + { + "epoch": 1.6151599628302136, + "grad_norm": 0.4203494210920179, + "learning_rate": 4.176697367658159e-06, + "loss": 0.6212, + "step": 6084 + }, + { + "epoch": 1.6154254613035977, + "grad_norm": 0.41243660741486005, + "learning_rate": 4.176438386455856e-06, + "loss": 0.6063, + "step": 6085 + }, + { + "epoch": 1.6156909597769813, + "grad_norm": 0.40795909220205034, + "learning_rate": 4.1761793725588995e-06, + "loss": 0.5536, + "step": 6086 + }, + { + "epoch": 1.615956458250365, + "grad_norm": 0.42411925222603436, + "learning_rate": 4.175920325972341e-06, + "loss": 0.5951, + "step": 6087 + }, + { + "epoch": 1.6162219567237488, + "grad_norm": 0.4036072881164683, + "learning_rate": 4.175661246701234e-06, + "loss": 0.5804, + "step": 6088 + }, + { + "epoch": 1.6164874551971327, + "grad_norm": 0.42126595905352393, + "learning_rate": 4.175402134750629e-06, + "loss": 0.5979, + "step": 6089 + }, + { + "epoch": 1.6167529536705163, + "grad_norm": 0.4089282972814364, + "learning_rate": 4.175142990125581e-06, + "loss": 0.5587, + "step": 6090 + }, + { + "epoch": 1.6170184521439002, + "grad_norm": 0.40842373125551007, + "learning_rate": 4.174883812831142e-06, + "loss": 0.6072, + "step": 6091 + }, + { + "epoch": 1.617283950617284, + "grad_norm": 0.41307635953059874, + "learning_rate": 4.174624602872369e-06, + "loss": 0.6028, + "step": 6092 + }, + { + "epoch": 1.6175494490906677, + "grad_norm": 0.4041236531775181, + "learning_rate": 4.174365360254316e-06, + "loss": 0.5764, + "step": 6093 + }, + { + "epoch": 1.6178149475640515, + "grad_norm": 0.4160419026301076, + "learning_rate": 4.174106084982038e-06, + "loss": 0.6037, + "step": 6094 + }, + { + "epoch": 1.6180804460374354, + "grad_norm": 0.4084376416196963, + "learning_rate": 4.173846777060593e-06, + "loss": 0.5799, + "step": 6095 + }, + { + "epoch": 1.618345944510819, + "grad_norm": 0.42601488847702534, + "learning_rate": 4.1735874364950364e-06, + "loss": 0.6252, + "step": 6096 + }, + { + "epoch": 1.6186114429842027, + "grad_norm": 0.40899829897887763, + "learning_rate": 4.173328063290429e-06, + "loss": 0.5371, + "step": 6097 + }, + { + "epoch": 1.6188769414575868, + "grad_norm": 0.41847364874322346, + "learning_rate": 4.173068657451827e-06, + "loss": 0.5855, + "step": 6098 + }, + { + "epoch": 1.6191424399309704, + "grad_norm": 0.43040027552388593, + "learning_rate": 4.172809218984289e-06, + "loss": 0.6033, + "step": 6099 + }, + { + "epoch": 1.619407938404354, + "grad_norm": 0.42051275205866556, + "learning_rate": 4.1725497478928765e-06, + "loss": 0.5483, + "step": 6100 + }, + { + "epoch": 1.619673436877738, + "grad_norm": 0.40658005765987104, + "learning_rate": 4.172290244182647e-06, + "loss": 0.5613, + "step": 6101 + }, + { + "epoch": 1.6199389353511218, + "grad_norm": 0.4144136657336713, + "learning_rate": 4.172030707858665e-06, + "loss": 0.5799, + "step": 6102 + }, + { + "epoch": 1.6202044338245054, + "grad_norm": 0.414170883594481, + "learning_rate": 4.1717711389259905e-06, + "loss": 0.5949, + "step": 6103 + }, + { + "epoch": 1.6204699322978893, + "grad_norm": 0.4137007605600363, + "learning_rate": 4.171511537389684e-06, + "loss": 0.6115, + "step": 6104 + }, + { + "epoch": 1.6207354307712731, + "grad_norm": 0.4074563715173919, + "learning_rate": 4.171251903254812e-06, + "loss": 0.5686, + "step": 6105 + }, + { + "epoch": 1.6210009292446568, + "grad_norm": 0.40335924884148944, + "learning_rate": 4.170992236526434e-06, + "loss": 0.5591, + "step": 6106 + }, + { + "epoch": 1.6212664277180406, + "grad_norm": 0.4070786816692419, + "learning_rate": 4.170732537209617e-06, + "loss": 0.562, + "step": 6107 + }, + { + "epoch": 1.6215319261914245, + "grad_norm": 0.40881955865293984, + "learning_rate": 4.170472805309424e-06, + "loss": 0.5731, + "step": 6108 + }, + { + "epoch": 1.6217974246648081, + "grad_norm": 0.4050546172988652, + "learning_rate": 4.1702130408309226e-06, + "loss": 0.6221, + "step": 6109 + }, + { + "epoch": 1.622062923138192, + "grad_norm": 0.39869275073453253, + "learning_rate": 4.169953243779177e-06, + "loss": 0.5734, + "step": 6110 + }, + { + "epoch": 1.6223284216115759, + "grad_norm": 0.4057329653986426, + "learning_rate": 4.169693414159255e-06, + "loss": 0.5969, + "step": 6111 + }, + { + "epoch": 1.6225939200849595, + "grad_norm": 0.4232718662098156, + "learning_rate": 4.169433551976222e-06, + "loss": 0.593, + "step": 6112 + }, + { + "epoch": 1.6228594185583431, + "grad_norm": 0.41817767050446764, + "learning_rate": 4.1691736572351485e-06, + "loss": 0.6183, + "step": 6113 + }, + { + "epoch": 1.6231249170317272, + "grad_norm": 0.4242438733109001, + "learning_rate": 4.168913729941101e-06, + "loss": 0.5964, + "step": 6114 + }, + { + "epoch": 1.6233904155051109, + "grad_norm": 0.41650494557179485, + "learning_rate": 4.1686537700991505e-06, + "loss": 0.6018, + "step": 6115 + }, + { + "epoch": 1.6236559139784945, + "grad_norm": 0.41738744719303306, + "learning_rate": 4.168393777714364e-06, + "loss": 0.5742, + "step": 6116 + }, + { + "epoch": 1.6239214124518784, + "grad_norm": 0.40680178549473783, + "learning_rate": 4.168133752791815e-06, + "loss": 0.5983, + "step": 6117 + }, + { + "epoch": 1.6241869109252622, + "grad_norm": 0.4031023316747236, + "learning_rate": 4.167873695336575e-06, + "loss": 0.5914, + "step": 6118 + }, + { + "epoch": 1.6244524093986459, + "grad_norm": 0.42387610979480694, + "learning_rate": 4.167613605353712e-06, + "loss": 0.6312, + "step": 6119 + }, + { + "epoch": 1.6247179078720297, + "grad_norm": 0.42011859217972486, + "learning_rate": 4.167353482848302e-06, + "loss": 0.5983, + "step": 6120 + }, + { + "epoch": 1.6249834063454136, + "grad_norm": 0.4129244650761646, + "learning_rate": 4.167093327825417e-06, + "loss": 0.6035, + "step": 6121 + }, + { + "epoch": 1.6252489048187972, + "grad_norm": 0.3974062132472111, + "learning_rate": 4.166833140290128e-06, + "loss": 0.5511, + "step": 6122 + }, + { + "epoch": 1.625514403292181, + "grad_norm": 0.41200410883365246, + "learning_rate": 4.166572920247514e-06, + "loss": 0.574, + "step": 6123 + }, + { + "epoch": 1.625779901765565, + "grad_norm": 0.4030772574830196, + "learning_rate": 4.166312667702647e-06, + "loss": 0.581, + "step": 6124 + }, + { + "epoch": 1.6260454002389486, + "grad_norm": 0.40076109867172627, + "learning_rate": 4.166052382660604e-06, + "loss": 0.5719, + "step": 6125 + }, + { + "epoch": 1.6263108987123323, + "grad_norm": 0.4301905162364889, + "learning_rate": 4.165792065126459e-06, + "loss": 0.6099, + "step": 6126 + }, + { + "epoch": 1.6265763971857163, + "grad_norm": 0.4014776446171743, + "learning_rate": 4.1655317151052906e-06, + "loss": 0.5874, + "step": 6127 + }, + { + "epoch": 1.6268418956591, + "grad_norm": 0.4083216500620904, + "learning_rate": 4.1652713326021756e-06, + "loss": 0.5822, + "step": 6128 + }, + { + "epoch": 1.6271073941324836, + "grad_norm": 0.4242217715503586, + "learning_rate": 4.165010917622193e-06, + "loss": 0.5885, + "step": 6129 + }, + { + "epoch": 1.6273728926058675, + "grad_norm": 0.43271239320364385, + "learning_rate": 4.16475047017042e-06, + "loss": 0.5656, + "step": 6130 + }, + { + "epoch": 1.6276383910792513, + "grad_norm": 0.41445016707339577, + "learning_rate": 4.164489990251937e-06, + "loss": 0.5795, + "step": 6131 + }, + { + "epoch": 1.627903889552635, + "grad_norm": 0.4263278103983279, + "learning_rate": 4.164229477871824e-06, + "loss": 0.598, + "step": 6132 + }, + { + "epoch": 1.6281693880260188, + "grad_norm": 0.4155773499362417, + "learning_rate": 4.163968933035162e-06, + "loss": 0.5871, + "step": 6133 + }, + { + "epoch": 1.6284348864994027, + "grad_norm": 0.4208296658953933, + "learning_rate": 4.16370835574703e-06, + "loss": 0.6264, + "step": 6134 + }, + { + "epoch": 1.6287003849727864, + "grad_norm": 0.41994182283039133, + "learning_rate": 4.163447746012513e-06, + "loss": 0.5781, + "step": 6135 + }, + { + "epoch": 1.6289658834461702, + "grad_norm": 0.4024937743711282, + "learning_rate": 4.163187103836692e-06, + "loss": 0.5724, + "step": 6136 + }, + { + "epoch": 1.629231381919554, + "grad_norm": 0.4092982827039093, + "learning_rate": 4.1629264292246505e-06, + "loss": 0.5783, + "step": 6137 + }, + { + "epoch": 1.6294968803929377, + "grad_norm": 0.4144124204534514, + "learning_rate": 4.162665722181472e-06, + "loss": 0.6145, + "step": 6138 + }, + { + "epoch": 1.6297623788663214, + "grad_norm": 0.4235307821490689, + "learning_rate": 4.162404982712241e-06, + "loss": 0.5564, + "step": 6139 + }, + { + "epoch": 1.6300278773397054, + "grad_norm": 0.421892964490945, + "learning_rate": 4.1621442108220425e-06, + "loss": 0.5626, + "step": 6140 + }, + { + "epoch": 1.630293375813089, + "grad_norm": 0.42406328523486675, + "learning_rate": 4.161883406515962e-06, + "loss": 0.6085, + "step": 6141 + }, + { + "epoch": 1.6305588742864727, + "grad_norm": 0.4105191092035432, + "learning_rate": 4.161622569799086e-06, + "loss": 0.605, + "step": 6142 + }, + { + "epoch": 1.6308243727598566, + "grad_norm": 0.4006804805908787, + "learning_rate": 4.1613617006765016e-06, + "loss": 0.5936, + "step": 6143 + }, + { + "epoch": 1.6310898712332405, + "grad_norm": 0.38996310795407324, + "learning_rate": 4.161100799153297e-06, + "loss": 0.5865, + "step": 6144 + }, + { + "epoch": 1.631355369706624, + "grad_norm": 0.41671128644001837, + "learning_rate": 4.160839865234558e-06, + "loss": 0.6017, + "step": 6145 + }, + { + "epoch": 1.631620868180008, + "grad_norm": 0.4001074347833242, + "learning_rate": 4.160578898925377e-06, + "loss": 0.5526, + "step": 6146 + }, + { + "epoch": 1.6318863666533918, + "grad_norm": 0.3977811634343154, + "learning_rate": 4.160317900230841e-06, + "loss": 0.5923, + "step": 6147 + }, + { + "epoch": 1.6321518651267755, + "grad_norm": 0.4067900207900259, + "learning_rate": 4.160056869156041e-06, + "loss": 0.5642, + "step": 6148 + }, + { + "epoch": 1.6324173636001593, + "grad_norm": 0.4027236458197424, + "learning_rate": 4.1597958057060675e-06, + "loss": 0.5976, + "step": 6149 + }, + { + "epoch": 1.6326828620735432, + "grad_norm": 0.40172676177857874, + "learning_rate": 4.159534709886012e-06, + "loss": 0.5961, + "step": 6150 + }, + { + "epoch": 1.6329483605469268, + "grad_norm": 0.40478471902325897, + "learning_rate": 4.159273581700965e-06, + "loss": 0.5945, + "step": 6151 + }, + { + "epoch": 1.6332138590203107, + "grad_norm": 0.40050833918555934, + "learning_rate": 4.159012421156023e-06, + "loss": 0.5799, + "step": 6152 + }, + { + "epoch": 1.6334793574936946, + "grad_norm": 0.40050030821461574, + "learning_rate": 4.158751228256275e-06, + "loss": 0.603, + "step": 6153 + }, + { + "epoch": 1.6337448559670782, + "grad_norm": 0.3939591489093314, + "learning_rate": 4.158490003006817e-06, + "loss": 0.5409, + "step": 6154 + }, + { + "epoch": 1.6340103544404618, + "grad_norm": 0.4025942853463516, + "learning_rate": 4.158228745412745e-06, + "loss": 0.5724, + "step": 6155 + }, + { + "epoch": 1.6342758529138457, + "grad_norm": 0.4312021363071985, + "learning_rate": 4.1579674554791505e-06, + "loss": 0.6074, + "step": 6156 + }, + { + "epoch": 1.6345413513872296, + "grad_norm": 0.41423615721707946, + "learning_rate": 4.1577061332111316e-06, + "loss": 0.5903, + "step": 6157 + }, + { + "epoch": 1.6348068498606132, + "grad_norm": 0.41035197940726176, + "learning_rate": 4.157444778613784e-06, + "loss": 0.5845, + "step": 6158 + }, + { + "epoch": 1.635072348333997, + "grad_norm": 0.41212990153842805, + "learning_rate": 4.157183391692206e-06, + "loss": 0.5487, + "step": 6159 + }, + { + "epoch": 1.635337846807381, + "grad_norm": 0.41695461209939216, + "learning_rate": 4.156921972451494e-06, + "loss": 0.5738, + "step": 6160 + }, + { + "epoch": 1.6356033452807646, + "grad_norm": 0.41434622391201165, + "learning_rate": 4.1566605208967465e-06, + "loss": 0.6107, + "step": 6161 + }, + { + "epoch": 1.6358688437541484, + "grad_norm": 0.40577748694263566, + "learning_rate": 4.156399037033063e-06, + "loss": 0.5594, + "step": 6162 + }, + { + "epoch": 1.6361343422275323, + "grad_norm": 0.4125891360352178, + "learning_rate": 4.156137520865542e-06, + "loss": 0.5804, + "step": 6163 + }, + { + "epoch": 1.636399840700916, + "grad_norm": 0.42704791950403886, + "learning_rate": 4.1558759723992845e-06, + "loss": 0.589, + "step": 6164 + }, + { + "epoch": 1.6366653391742998, + "grad_norm": 0.4106778160593796, + "learning_rate": 4.155614391639392e-06, + "loss": 0.5958, + "step": 6165 + }, + { + "epoch": 1.6369308376476837, + "grad_norm": 0.4121664073279707, + "learning_rate": 4.155352778590964e-06, + "loss": 0.5783, + "step": 6166 + }, + { + "epoch": 1.6371963361210673, + "grad_norm": 0.41224868821400934, + "learning_rate": 4.155091133259105e-06, + "loss": 0.6216, + "step": 6167 + }, + { + "epoch": 1.637461834594451, + "grad_norm": 0.41527339349457415, + "learning_rate": 4.154829455648916e-06, + "loss": 0.5909, + "step": 6168 + }, + { + "epoch": 1.637727333067835, + "grad_norm": 0.4078061501894263, + "learning_rate": 4.154567745765501e-06, + "loss": 0.5594, + "step": 6169 + }, + { + "epoch": 1.6379928315412187, + "grad_norm": 0.3956864337507178, + "learning_rate": 4.154306003613965e-06, + "loss": 0.5724, + "step": 6170 + }, + { + "epoch": 1.6382583300146023, + "grad_norm": 0.3981378344183963, + "learning_rate": 4.15404422919941e-06, + "loss": 0.5263, + "step": 6171 + }, + { + "epoch": 1.6385238284879862, + "grad_norm": 0.4179824345946913, + "learning_rate": 4.153782422526943e-06, + "loss": 0.5875, + "step": 6172 + }, + { + "epoch": 1.63878932696137, + "grad_norm": 0.39134723518773457, + "learning_rate": 4.15352058360167e-06, + "loss": 0.5515, + "step": 6173 + }, + { + "epoch": 1.6390548254347537, + "grad_norm": 0.39961521046893517, + "learning_rate": 4.153258712428697e-06, + "loss": 0.6104, + "step": 6174 + }, + { + "epoch": 1.6393203239081375, + "grad_norm": 0.41425705749840463, + "learning_rate": 4.152996809013131e-06, + "loss": 0.5731, + "step": 6175 + }, + { + "epoch": 1.6395858223815214, + "grad_norm": 0.45333256468870214, + "learning_rate": 4.15273487336008e-06, + "loss": 0.584, + "step": 6176 + }, + { + "epoch": 1.639851320854905, + "grad_norm": 0.3958538958844591, + "learning_rate": 4.152472905474653e-06, + "loss": 0.5849, + "step": 6177 + }, + { + "epoch": 1.640116819328289, + "grad_norm": 0.4015134633521911, + "learning_rate": 4.152210905361958e-06, + "loss": 0.5867, + "step": 6178 + }, + { + "epoch": 1.6403823178016728, + "grad_norm": 0.40141478694589433, + "learning_rate": 4.151948873027105e-06, + "loss": 0.6068, + "step": 6179 + }, + { + "epoch": 1.6406478162750564, + "grad_norm": 0.42398501794559523, + "learning_rate": 4.151686808475204e-06, + "loss": 0.5731, + "step": 6180 + }, + { + "epoch": 1.64091331474844, + "grad_norm": 0.39558618040866333, + "learning_rate": 4.151424711711366e-06, + "loss": 0.6024, + "step": 6181 + }, + { + "epoch": 1.6411788132218241, + "grad_norm": 0.4220607933628933, + "learning_rate": 4.151162582740703e-06, + "loss": 0.5831, + "step": 6182 + }, + { + "epoch": 1.6414443116952078, + "grad_norm": 0.3968453065135876, + "learning_rate": 4.150900421568328e-06, + "loss": 0.5617, + "step": 6183 + }, + { + "epoch": 1.6417098101685914, + "grad_norm": 0.41703052185711126, + "learning_rate": 4.150638228199352e-06, + "loss": 0.5792, + "step": 6184 + }, + { + "epoch": 1.6419753086419753, + "grad_norm": 0.4094710152008768, + "learning_rate": 4.150376002638888e-06, + "loss": 0.5735, + "step": 6185 + }, + { + "epoch": 1.6422408071153591, + "grad_norm": 0.4041939781155795, + "learning_rate": 4.150113744892053e-06, + "loss": 0.5687, + "step": 6186 + }, + { + "epoch": 1.6425063055887428, + "grad_norm": 0.4049060040768663, + "learning_rate": 4.1498514549639575e-06, + "loss": 0.5877, + "step": 6187 + }, + { + "epoch": 1.6427718040621266, + "grad_norm": 0.4113525493215451, + "learning_rate": 4.149589132859721e-06, + "loss": 0.6165, + "step": 6188 + }, + { + "epoch": 1.6430373025355105, + "grad_norm": 0.40495989894501877, + "learning_rate": 4.149326778584457e-06, + "loss": 0.6204, + "step": 6189 + }, + { + "epoch": 1.6433028010088941, + "grad_norm": 0.41350056409386493, + "learning_rate": 4.149064392143282e-06, + "loss": 0.5889, + "step": 6190 + }, + { + "epoch": 1.643568299482278, + "grad_norm": 0.40710807425726075, + "learning_rate": 4.1488019735413145e-06, + "loss": 0.5504, + "step": 6191 + }, + { + "epoch": 1.6438337979556619, + "grad_norm": 0.41068614620172866, + "learning_rate": 4.148539522783671e-06, + "loss": 0.5608, + "step": 6192 + }, + { + "epoch": 1.6440992964290455, + "grad_norm": 0.40957905468107303, + "learning_rate": 4.148277039875471e-06, + "loss": 0.5783, + "step": 6193 + }, + { + "epoch": 1.6443647949024291, + "grad_norm": 0.40382362750619155, + "learning_rate": 4.1480145248218325e-06, + "loss": 0.5468, + "step": 6194 + }, + { + "epoch": 1.6446302933758132, + "grad_norm": 0.4195215216697413, + "learning_rate": 4.147751977627876e-06, + "loss": 0.5671, + "step": 6195 + }, + { + "epoch": 1.6448957918491969, + "grad_norm": 0.4106618374739023, + "learning_rate": 4.147489398298722e-06, + "loss": 0.6221, + "step": 6196 + }, + { + "epoch": 1.6451612903225805, + "grad_norm": 0.40584045928985996, + "learning_rate": 4.14722678683949e-06, + "loss": 0.5791, + "step": 6197 + }, + { + "epoch": 1.6454267887959644, + "grad_norm": 0.40636200741855466, + "learning_rate": 4.146964143255304e-06, + "loss": 0.5786, + "step": 6198 + }, + { + "epoch": 1.6456922872693482, + "grad_norm": 0.40943823554126807, + "learning_rate": 4.146701467551284e-06, + "loss": 0.5888, + "step": 6199 + }, + { + "epoch": 1.6459577857427319, + "grad_norm": 0.40840696304599533, + "learning_rate": 4.1464387597325525e-06, + "loss": 0.5617, + "step": 6200 + }, + { + "epoch": 1.6462232842161157, + "grad_norm": 0.4158556573842827, + "learning_rate": 4.1461760198042355e-06, + "loss": 0.6274, + "step": 6201 + }, + { + "epoch": 1.6464887826894996, + "grad_norm": 0.414135298042347, + "learning_rate": 4.145913247771455e-06, + "loss": 0.6094, + "step": 6202 + }, + { + "epoch": 1.6467542811628832, + "grad_norm": 0.43872145995368317, + "learning_rate": 4.145650443639336e-06, + "loss": 0.5528, + "step": 6203 + }, + { + "epoch": 1.6470197796362671, + "grad_norm": 0.40543703759873323, + "learning_rate": 4.1453876074130046e-06, + "loss": 0.5588, + "step": 6204 + }, + { + "epoch": 1.647285278109651, + "grad_norm": 0.4043137008848618, + "learning_rate": 4.145124739097585e-06, + "loss": 0.5521, + "step": 6205 + }, + { + "epoch": 1.6475507765830346, + "grad_norm": 0.40171402098633396, + "learning_rate": 4.1448618386982065e-06, + "loss": 0.5914, + "step": 6206 + }, + { + "epoch": 1.6478162750564185, + "grad_norm": 0.42962939831299696, + "learning_rate": 4.144598906219995e-06, + "loss": 0.5761, + "step": 6207 + }, + { + "epoch": 1.6480817735298023, + "grad_norm": 0.4137795924604853, + "learning_rate": 4.144335941668077e-06, + "loss": 0.5954, + "step": 6208 + }, + { + "epoch": 1.648347272003186, + "grad_norm": 0.41575333958277927, + "learning_rate": 4.144072945047582e-06, + "loss": 0.593, + "step": 6209 + }, + { + "epoch": 1.6486127704765696, + "grad_norm": 0.395273843277485, + "learning_rate": 4.1438099163636406e-06, + "loss": 0.5566, + "step": 6210 + }, + { + "epoch": 1.6488782689499535, + "grad_norm": 0.41298271518847307, + "learning_rate": 4.1435468556213804e-06, + "loss": 0.6236, + "step": 6211 + }, + { + "epoch": 1.6491437674233373, + "grad_norm": 0.4125252242818947, + "learning_rate": 4.1432837628259325e-06, + "loss": 0.5707, + "step": 6212 + }, + { + "epoch": 1.649409265896721, + "grad_norm": 0.4253046340483282, + "learning_rate": 4.143020637982428e-06, + "loss": 0.5664, + "step": 6213 + }, + { + "epoch": 1.6496747643701049, + "grad_norm": 0.40589552261925216, + "learning_rate": 4.1427574810959966e-06, + "loss": 0.58, + "step": 6214 + }, + { + "epoch": 1.6499402628434887, + "grad_norm": 0.4038100950343004, + "learning_rate": 4.142494292171774e-06, + "loss": 0.5882, + "step": 6215 + }, + { + "epoch": 1.6502057613168724, + "grad_norm": 0.40530702140073943, + "learning_rate": 4.14223107121489e-06, + "loss": 0.5724, + "step": 6216 + }, + { + "epoch": 1.6504712597902562, + "grad_norm": 0.4203119053990801, + "learning_rate": 4.1419678182304805e-06, + "loss": 0.5779, + "step": 6217 + }, + { + "epoch": 1.65073675826364, + "grad_norm": 0.41514364892484984, + "learning_rate": 4.141704533223678e-06, + "loss": 0.5872, + "step": 6218 + }, + { + "epoch": 1.6510022567370237, + "grad_norm": 0.4045781292121557, + "learning_rate": 4.141441216199617e-06, + "loss": 0.5948, + "step": 6219 + }, + { + "epoch": 1.6512677552104076, + "grad_norm": 0.415084793543445, + "learning_rate": 4.141177867163433e-06, + "loss": 0.5741, + "step": 6220 + }, + { + "epoch": 1.6515332536837914, + "grad_norm": 0.42018925226627823, + "learning_rate": 4.140914486120263e-06, + "loss": 0.6233, + "step": 6221 + }, + { + "epoch": 1.651798752157175, + "grad_norm": 0.40602602960171846, + "learning_rate": 4.140651073075243e-06, + "loss": 0.5402, + "step": 6222 + }, + { + "epoch": 1.6520642506305587, + "grad_norm": 0.41734485862591525, + "learning_rate": 4.14038762803351e-06, + "loss": 0.559, + "step": 6223 + }, + { + "epoch": 1.6523297491039428, + "grad_norm": 0.39546839570875353, + "learning_rate": 4.140124151000201e-06, + "loss": 0.6167, + "step": 6224 + }, + { + "epoch": 1.6525952475773265, + "grad_norm": 0.4119933688878461, + "learning_rate": 4.1398606419804565e-06, + "loss": 0.6, + "step": 6225 + }, + { + "epoch": 1.65286074605071, + "grad_norm": 0.39970356730416085, + "learning_rate": 4.139597100979413e-06, + "loss": 0.5859, + "step": 6226 + }, + { + "epoch": 1.653126244524094, + "grad_norm": 0.40138469202406823, + "learning_rate": 4.139333528002213e-06, + "loss": 0.5551, + "step": 6227 + }, + { + "epoch": 1.6533917429974778, + "grad_norm": 0.4000983143834468, + "learning_rate": 4.139069923053995e-06, + "loss": 0.5891, + "step": 6228 + }, + { + "epoch": 1.6536572414708615, + "grad_norm": 0.4098431275921327, + "learning_rate": 4.1388062861399004e-06, + "loss": 0.5655, + "step": 6229 + }, + { + "epoch": 1.6539227399442453, + "grad_norm": 0.4178427771067677, + "learning_rate": 4.1385426172650715e-06, + "loss": 0.5642, + "step": 6230 + }, + { + "epoch": 1.6541882384176292, + "grad_norm": 0.3981866275904858, + "learning_rate": 4.138278916434648e-06, + "loss": 0.5802, + "step": 6231 + }, + { + "epoch": 1.6544537368910128, + "grad_norm": 0.40325194615294147, + "learning_rate": 4.138015183653775e-06, + "loss": 0.5712, + "step": 6232 + }, + { + "epoch": 1.6547192353643967, + "grad_norm": 0.39890934406055234, + "learning_rate": 4.137751418927596e-06, + "loss": 0.6222, + "step": 6233 + }, + { + "epoch": 1.6549847338377806, + "grad_norm": 0.42068599178016536, + "learning_rate": 4.137487622261254e-06, + "loss": 0.5974, + "step": 6234 + }, + { + "epoch": 1.6552502323111642, + "grad_norm": 0.4098890913665752, + "learning_rate": 4.137223793659894e-06, + "loss": 0.594, + "step": 6235 + }, + { + "epoch": 1.6555157307845478, + "grad_norm": 0.42397914087060007, + "learning_rate": 4.1369599331286615e-06, + "loss": 0.5374, + "step": 6236 + }, + { + "epoch": 1.655781229257932, + "grad_norm": 0.41304192083496677, + "learning_rate": 4.136696040672702e-06, + "loss": 0.6274, + "step": 6237 + }, + { + "epoch": 1.6560467277313156, + "grad_norm": 0.41278506832165685, + "learning_rate": 4.136432116297164e-06, + "loss": 0.578, + "step": 6238 + }, + { + "epoch": 1.6563122262046992, + "grad_norm": 0.4094875937980845, + "learning_rate": 4.136168160007191e-06, + "loss": 0.6053, + "step": 6239 + }, + { + "epoch": 1.656577724678083, + "grad_norm": 0.4035127617560246, + "learning_rate": 4.135904171807934e-06, + "loss": 0.5596, + "step": 6240 + }, + { + "epoch": 1.656843223151467, + "grad_norm": 0.4204748616645181, + "learning_rate": 4.1356401517045395e-06, + "loss": 0.5335, + "step": 6241 + }, + { + "epoch": 1.6571087216248506, + "grad_norm": 0.40997332975393685, + "learning_rate": 4.135376099702158e-06, + "loss": 0.544, + "step": 6242 + }, + { + "epoch": 1.6573742200982344, + "grad_norm": 0.3992096663527178, + "learning_rate": 4.135112015805938e-06, + "loss": 0.5609, + "step": 6243 + }, + { + "epoch": 1.6576397185716183, + "grad_norm": 0.407378546327697, + "learning_rate": 4.1348479000210305e-06, + "loss": 0.5875, + "step": 6244 + }, + { + "epoch": 1.657905217045002, + "grad_norm": 0.4154709615900247, + "learning_rate": 4.1345837523525865e-06, + "loss": 0.5688, + "step": 6245 + }, + { + "epoch": 1.6581707155183858, + "grad_norm": 0.40770356163791593, + "learning_rate": 4.134319572805756e-06, + "loss": 0.5302, + "step": 6246 + }, + { + "epoch": 1.6584362139917697, + "grad_norm": 0.3880617767005316, + "learning_rate": 4.134055361385694e-06, + "loss": 0.5859, + "step": 6247 + }, + { + "epoch": 1.6587017124651533, + "grad_norm": 0.4196745597244992, + "learning_rate": 4.1337911180975506e-06, + "loss": 0.6098, + "step": 6248 + }, + { + "epoch": 1.658967210938537, + "grad_norm": 0.4129199285056074, + "learning_rate": 4.133526842946481e-06, + "loss": 0.572, + "step": 6249 + }, + { + "epoch": 1.659232709411921, + "grad_norm": 0.41145264862768005, + "learning_rate": 4.133262535937637e-06, + "loss": 0.5706, + "step": 6250 + }, + { + "epoch": 1.6594982078853047, + "grad_norm": 0.4148045692625393, + "learning_rate": 4.132998197076175e-06, + "loss": 0.5707, + "step": 6251 + }, + { + "epoch": 1.6597637063586883, + "grad_norm": 0.3957170698282108, + "learning_rate": 4.13273382636725e-06, + "loss": 0.5821, + "step": 6252 + }, + { + "epoch": 1.6600292048320722, + "grad_norm": 0.4118281769956654, + "learning_rate": 4.132469423816019e-06, + "loss": 0.5864, + "step": 6253 + }, + { + "epoch": 1.660294703305456, + "grad_norm": 0.4226261841297954, + "learning_rate": 4.132204989427636e-06, + "loss": 0.607, + "step": 6254 + }, + { + "epoch": 1.6605602017788397, + "grad_norm": 0.425985370340082, + "learning_rate": 4.1319405232072596e-06, + "loss": 0.5963, + "step": 6255 + }, + { + "epoch": 1.6608257002522235, + "grad_norm": 0.3996984870942285, + "learning_rate": 4.131676025160047e-06, + "loss": 0.5826, + "step": 6256 + }, + { + "epoch": 1.6610911987256074, + "grad_norm": 0.4063701415508513, + "learning_rate": 4.131411495291158e-06, + "loss": 0.6088, + "step": 6257 + }, + { + "epoch": 1.661356697198991, + "grad_norm": 0.406081896704152, + "learning_rate": 4.131146933605749e-06, + "loss": 0.5785, + "step": 6258 + }, + { + "epoch": 1.661622195672375, + "grad_norm": 0.4230551885595814, + "learning_rate": 4.130882340108981e-06, + "loss": 0.556, + "step": 6259 + }, + { + "epoch": 1.6618876941457588, + "grad_norm": 0.4149994063430953, + "learning_rate": 4.130617714806016e-06, + "loss": 0.587, + "step": 6260 + }, + { + "epoch": 1.6621531926191424, + "grad_norm": 0.4257823328443519, + "learning_rate": 4.130353057702012e-06, + "loss": 0.6051, + "step": 6261 + }, + { + "epoch": 1.6624186910925263, + "grad_norm": 0.40865431395602214, + "learning_rate": 4.130088368802132e-06, + "loss": 0.5961, + "step": 6262 + }, + { + "epoch": 1.6626841895659101, + "grad_norm": 0.42533649853128425, + "learning_rate": 4.129823648111537e-06, + "loss": 0.6312, + "step": 6263 + }, + { + "epoch": 1.6629496880392938, + "grad_norm": 0.39085554844775955, + "learning_rate": 4.129558895635391e-06, + "loss": 0.5607, + "step": 6264 + }, + { + "epoch": 1.6632151865126774, + "grad_norm": 0.4207350142253288, + "learning_rate": 4.129294111378857e-06, + "loss": 0.6031, + "step": 6265 + }, + { + "epoch": 1.6634806849860613, + "grad_norm": 0.39123248553015144, + "learning_rate": 4.129029295347097e-06, + "loss": 0.5895, + "step": 6266 + }, + { + "epoch": 1.6637461834594451, + "grad_norm": 0.4148025621611677, + "learning_rate": 4.128764447545278e-06, + "loss": 0.5855, + "step": 6267 + }, + { + "epoch": 1.6640116819328288, + "grad_norm": 0.4021487051768757, + "learning_rate": 4.1284995679785655e-06, + "loss": 0.6107, + "step": 6268 + }, + { + "epoch": 1.6642771804062126, + "grad_norm": 0.41992299218051976, + "learning_rate": 4.128234656652122e-06, + "loss": 0.6095, + "step": 6269 + }, + { + "epoch": 1.6645426788795965, + "grad_norm": 0.4116384967035457, + "learning_rate": 4.127969713571119e-06, + "loss": 0.5836, + "step": 6270 + }, + { + "epoch": 1.6648081773529801, + "grad_norm": 0.4119548148414317, + "learning_rate": 4.127704738740719e-06, + "loss": 0.5876, + "step": 6271 + }, + { + "epoch": 1.665073675826364, + "grad_norm": 0.398825976868628, + "learning_rate": 4.127439732166091e-06, + "loss": 0.5589, + "step": 6272 + }, + { + "epoch": 1.6653391742997479, + "grad_norm": 0.4090683604248852, + "learning_rate": 4.127174693852404e-06, + "loss": 0.6122, + "step": 6273 + }, + { + "epoch": 1.6656046727731315, + "grad_norm": 0.405529061429767, + "learning_rate": 4.126909623804825e-06, + "loss": 0.6155, + "step": 6274 + }, + { + "epoch": 1.6658701712465154, + "grad_norm": 0.4126587093699257, + "learning_rate": 4.126644522028526e-06, + "loss": 0.5725, + "step": 6275 + }, + { + "epoch": 1.6661356697198992, + "grad_norm": 0.4124320374860765, + "learning_rate": 4.1263793885286775e-06, + "loss": 0.5382, + "step": 6276 + }, + { + "epoch": 1.6664011681932829, + "grad_norm": 0.3889636497590304, + "learning_rate": 4.1261142233104475e-06, + "loss": 0.564, + "step": 6277 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 0.42026093390239305, + "learning_rate": 4.125849026379009e-06, + "loss": 0.6312, + "step": 6278 + }, + { + "epoch": 1.6669321651400506, + "grad_norm": 0.4236148123260068, + "learning_rate": 4.125583797739534e-06, + "loss": 0.5489, + "step": 6279 + }, + { + "epoch": 1.6671976636134342, + "grad_norm": 0.4012570714568701, + "learning_rate": 4.125318537397195e-06, + "loss": 0.5674, + "step": 6280 + }, + { + "epoch": 1.6674631620868179, + "grad_norm": 0.4078922674826175, + "learning_rate": 4.125053245357164e-06, + "loss": 0.5979, + "step": 6281 + }, + { + "epoch": 1.6677286605602017, + "grad_norm": 0.41892003489855617, + "learning_rate": 4.124787921624617e-06, + "loss": 0.5884, + "step": 6282 + }, + { + "epoch": 1.6679941590335856, + "grad_norm": 0.40938248580799247, + "learning_rate": 4.124522566204727e-06, + "loss": 0.5889, + "step": 6283 + }, + { + "epoch": 1.6682596575069693, + "grad_norm": 0.42011994660105495, + "learning_rate": 4.124257179102668e-06, + "loss": 0.6041, + "step": 6284 + }, + { + "epoch": 1.6685251559803531, + "grad_norm": 0.4191644433253345, + "learning_rate": 4.1239917603236195e-06, + "loss": 0.5809, + "step": 6285 + }, + { + "epoch": 1.668790654453737, + "grad_norm": 0.40211573031454656, + "learning_rate": 4.123726309872754e-06, + "loss": 0.6034, + "step": 6286 + }, + { + "epoch": 1.6690561529271206, + "grad_norm": 0.41975737406106656, + "learning_rate": 4.12346082775525e-06, + "loss": 0.5695, + "step": 6287 + }, + { + "epoch": 1.6693216514005045, + "grad_norm": 0.39337615849979457, + "learning_rate": 4.123195313976286e-06, + "loss": 0.5824, + "step": 6288 + }, + { + "epoch": 1.6695871498738883, + "grad_norm": 0.4303546203791357, + "learning_rate": 4.122929768541038e-06, + "loss": 0.5812, + "step": 6289 + }, + { + "epoch": 1.669852648347272, + "grad_norm": 0.4164017797520493, + "learning_rate": 4.122664191454686e-06, + "loss": 0.5779, + "step": 6290 + }, + { + "epoch": 1.6701181468206556, + "grad_norm": 0.4097090631863583, + "learning_rate": 4.12239858272241e-06, + "loss": 0.5981, + "step": 6291 + }, + { + "epoch": 1.6703836452940397, + "grad_norm": 0.416787858211982, + "learning_rate": 4.1221329423493885e-06, + "loss": 0.6036, + "step": 6292 + }, + { + "epoch": 1.6706491437674234, + "grad_norm": 0.4292695720492115, + "learning_rate": 4.121867270340803e-06, + "loss": 0.5547, + "step": 6293 + }, + { + "epoch": 1.670914642240807, + "grad_norm": 0.41061784102903787, + "learning_rate": 4.121601566701836e-06, + "loss": 0.5737, + "step": 6294 + }, + { + "epoch": 1.6711801407141909, + "grad_norm": 0.41479910673626835, + "learning_rate": 4.121335831437667e-06, + "loss": 0.6225, + "step": 6295 + }, + { + "epoch": 1.6714456391875747, + "grad_norm": 0.39647962614135507, + "learning_rate": 4.121070064553479e-06, + "loss": 0.5911, + "step": 6296 + }, + { + "epoch": 1.6717111376609584, + "grad_norm": 0.4036427208188757, + "learning_rate": 4.120804266054457e-06, + "loss": 0.579, + "step": 6297 + }, + { + "epoch": 1.6719766361343422, + "grad_norm": 0.4056796986607875, + "learning_rate": 4.120538435945782e-06, + "loss": 0.5496, + "step": 6298 + }, + { + "epoch": 1.672242134607726, + "grad_norm": 0.4080395713944233, + "learning_rate": 4.120272574232641e-06, + "loss": 0.5717, + "step": 6299 + }, + { + "epoch": 1.6725076330811097, + "grad_norm": 0.4015556865683174, + "learning_rate": 4.1200066809202175e-06, + "loss": 0.5959, + "step": 6300 + }, + { + "epoch": 1.6727731315544936, + "grad_norm": 0.41772861638897374, + "learning_rate": 4.119740756013697e-06, + "loss": 0.6041, + "step": 6301 + }, + { + "epoch": 1.6730386300278774, + "grad_norm": 0.4121327627603656, + "learning_rate": 4.119474799518266e-06, + "loss": 0.5957, + "step": 6302 + }, + { + "epoch": 1.673304128501261, + "grad_norm": 0.41607194339937, + "learning_rate": 4.119208811439111e-06, + "loss": 0.5918, + "step": 6303 + }, + { + "epoch": 1.6735696269746447, + "grad_norm": 0.41607659660043184, + "learning_rate": 4.11894279178142e-06, + "loss": 0.5651, + "step": 6304 + }, + { + "epoch": 1.6738351254480288, + "grad_norm": 0.40703456582321434, + "learning_rate": 4.11867674055038e-06, + "loss": 0.5779, + "step": 6305 + }, + { + "epoch": 1.6741006239214125, + "grad_norm": 0.39641516024729906, + "learning_rate": 4.11841065775118e-06, + "loss": 0.6025, + "step": 6306 + }, + { + "epoch": 1.674366122394796, + "grad_norm": 0.41827020132395565, + "learning_rate": 4.118144543389011e-06, + "loss": 0.585, + "step": 6307 + }, + { + "epoch": 1.67463162086818, + "grad_norm": 0.4082423123129381, + "learning_rate": 4.117878397469062e-06, + "loss": 0.5751, + "step": 6308 + }, + { + "epoch": 1.6748971193415638, + "grad_norm": 0.3940808378832142, + "learning_rate": 4.117612219996522e-06, + "loss": 0.5856, + "step": 6309 + }, + { + "epoch": 1.6751626178149475, + "grad_norm": 0.408874741784608, + "learning_rate": 4.117346010976583e-06, + "loss": 0.6043, + "step": 6310 + }, + { + "epoch": 1.6754281162883313, + "grad_norm": 0.40966683365669165, + "learning_rate": 4.117079770414436e-06, + "loss": 0.6044, + "step": 6311 + }, + { + "epoch": 1.6756936147617152, + "grad_norm": 0.4305904864209752, + "learning_rate": 4.1168134983152756e-06, + "loss": 0.5756, + "step": 6312 + }, + { + "epoch": 1.6759591132350988, + "grad_norm": 0.40715643782085553, + "learning_rate": 4.116547194684293e-06, + "loss": 0.5931, + "step": 6313 + }, + { + "epoch": 1.6762246117084827, + "grad_norm": 0.4096397716811137, + "learning_rate": 4.1162808595266825e-06, + "loss": 0.574, + "step": 6314 + }, + { + "epoch": 1.6764901101818666, + "grad_norm": 0.4025989393357566, + "learning_rate": 4.1160144928476366e-06, + "loss": 0.5715, + "step": 6315 + }, + { + "epoch": 1.6767556086552502, + "grad_norm": 0.42442510676230444, + "learning_rate": 4.115748094652352e-06, + "loss": 0.6085, + "step": 6316 + }, + { + "epoch": 1.677021107128634, + "grad_norm": 0.4121507733462747, + "learning_rate": 4.115481664946024e-06, + "loss": 0.5848, + "step": 6317 + }, + { + "epoch": 1.677286605602018, + "grad_norm": 0.3941377941422974, + "learning_rate": 4.115215203733848e-06, + "loss": 0.5657, + "step": 6318 + }, + { + "epoch": 1.6775521040754016, + "grad_norm": 0.41219243346894474, + "learning_rate": 4.11494871102102e-06, + "loss": 0.5647, + "step": 6319 + }, + { + "epoch": 1.6778176025487852, + "grad_norm": 0.43268918353710634, + "learning_rate": 4.114682186812739e-06, + "loss": 0.5848, + "step": 6320 + }, + { + "epoch": 1.6780831010221693, + "grad_norm": 0.40701781828082384, + "learning_rate": 4.1144156311142025e-06, + "loss": 0.5764, + "step": 6321 + }, + { + "epoch": 1.678348599495553, + "grad_norm": 0.40048312256767066, + "learning_rate": 4.114149043930607e-06, + "loss": 0.6017, + "step": 6322 + }, + { + "epoch": 1.6786140979689366, + "grad_norm": 0.4255823562558479, + "learning_rate": 4.113882425267154e-06, + "loss": 0.5919, + "step": 6323 + }, + { + "epoch": 1.6788795964423204, + "grad_norm": 0.41201070477762003, + "learning_rate": 4.113615775129042e-06, + "loss": 0.5839, + "step": 6324 + }, + { + "epoch": 1.6791450949157043, + "grad_norm": 0.4111270706763679, + "learning_rate": 4.113349093521472e-06, + "loss": 0.5922, + "step": 6325 + }, + { + "epoch": 1.679410593389088, + "grad_norm": 0.41365891011181766, + "learning_rate": 4.113082380449644e-06, + "loss": 0.613, + "step": 6326 + }, + { + "epoch": 1.6796760918624718, + "grad_norm": 0.4116880308580051, + "learning_rate": 4.11281563591876e-06, + "loss": 0.5837, + "step": 6327 + }, + { + "epoch": 1.6799415903358557, + "grad_norm": 0.42748097326883955, + "learning_rate": 4.112548859934021e-06, + "loss": 0.5838, + "step": 6328 + }, + { + "epoch": 1.6802070888092393, + "grad_norm": 0.4114554528539254, + "learning_rate": 4.112282052500632e-06, + "loss": 0.5831, + "step": 6329 + }, + { + "epoch": 1.6804725872826232, + "grad_norm": 0.41978008815731305, + "learning_rate": 4.112015213623796e-06, + "loss": 0.5986, + "step": 6330 + }, + { + "epoch": 1.680738085756007, + "grad_norm": 0.4155852396845013, + "learning_rate": 4.111748343308717e-06, + "loss": 0.6073, + "step": 6331 + }, + { + "epoch": 1.6810035842293907, + "grad_norm": 0.40970980697927045, + "learning_rate": 4.111481441560598e-06, + "loss": 0.5686, + "step": 6332 + }, + { + "epoch": 1.6812690827027743, + "grad_norm": 0.41972736975756464, + "learning_rate": 4.1112145083846455e-06, + "loss": 0.5641, + "step": 6333 + }, + { + "epoch": 1.6815345811761584, + "grad_norm": 0.4028261859791443, + "learning_rate": 4.110947543786065e-06, + "loss": 0.5802, + "step": 6334 + }, + { + "epoch": 1.681800079649542, + "grad_norm": 0.41341272091908965, + "learning_rate": 4.110680547770063e-06, + "loss": 0.5456, + "step": 6335 + }, + { + "epoch": 1.6820655781229257, + "grad_norm": 0.3977026297832417, + "learning_rate": 4.110413520341847e-06, + "loss": 0.5823, + "step": 6336 + }, + { + "epoch": 1.6823310765963095, + "grad_norm": 0.39243486546212913, + "learning_rate": 4.1101464615066245e-06, + "loss": 0.5759, + "step": 6337 + }, + { + "epoch": 1.6825965750696934, + "grad_norm": 0.43224512890557176, + "learning_rate": 4.109879371269604e-06, + "loss": 0.5964, + "step": 6338 + }, + { + "epoch": 1.682862073543077, + "grad_norm": 0.41444387962089035, + "learning_rate": 4.109612249635993e-06, + "loss": 0.5783, + "step": 6339 + }, + { + "epoch": 1.683127572016461, + "grad_norm": 0.3935197047036064, + "learning_rate": 4.109345096611004e-06, + "loss": 0.6114, + "step": 6340 + }, + { + "epoch": 1.6833930704898448, + "grad_norm": 0.42624563039776986, + "learning_rate": 4.1090779121998435e-06, + "loss": 0.5435, + "step": 6341 + }, + { + "epoch": 1.6836585689632284, + "grad_norm": 0.4230824834732495, + "learning_rate": 4.108810696407725e-06, + "loss": 0.5779, + "step": 6342 + }, + { + "epoch": 1.6839240674366123, + "grad_norm": 0.4054209585045214, + "learning_rate": 4.108543449239858e-06, + "loss": 0.5661, + "step": 6343 + }, + { + "epoch": 1.6841895659099961, + "grad_norm": 0.44395284151553416, + "learning_rate": 4.108276170701456e-06, + "loss": 0.5984, + "step": 6344 + }, + { + "epoch": 1.6844550643833798, + "grad_norm": 0.41026685187487105, + "learning_rate": 4.108008860797731e-06, + "loss": 0.5893, + "step": 6345 + }, + { + "epoch": 1.6847205628567634, + "grad_norm": 0.43986690557286967, + "learning_rate": 4.107741519533895e-06, + "loss": 0.5737, + "step": 6346 + }, + { + "epoch": 1.6849860613301475, + "grad_norm": 0.3969660905177006, + "learning_rate": 4.107474146915164e-06, + "loss": 0.5712, + "step": 6347 + }, + { + "epoch": 1.6852515598035311, + "grad_norm": 0.41012061916371767, + "learning_rate": 4.107206742946751e-06, + "loss": 0.5807, + "step": 6348 + }, + { + "epoch": 1.6855170582769148, + "grad_norm": 0.41499185474366906, + "learning_rate": 4.106939307633871e-06, + "loss": 0.5554, + "step": 6349 + }, + { + "epoch": 1.6857825567502986, + "grad_norm": 0.4450379986225015, + "learning_rate": 4.106671840981741e-06, + "loss": 0.5689, + "step": 6350 + }, + { + "epoch": 1.6860480552236825, + "grad_norm": 0.426476334776983, + "learning_rate": 4.1064043429955755e-06, + "loss": 0.5829, + "step": 6351 + }, + { + "epoch": 1.6863135536970661, + "grad_norm": 0.42478931827500366, + "learning_rate": 4.106136813680593e-06, + "loss": 0.5912, + "step": 6352 + }, + { + "epoch": 1.68657905217045, + "grad_norm": 0.4096086951383353, + "learning_rate": 4.1058692530420096e-06, + "loss": 0.598, + "step": 6353 + }, + { + "epoch": 1.6868445506438339, + "grad_norm": 0.4373314025858963, + "learning_rate": 4.105601661085044e-06, + "loss": 0.5864, + "step": 6354 + }, + { + "epoch": 1.6871100491172175, + "grad_norm": 0.40282744081911964, + "learning_rate": 4.105334037814915e-06, + "loss": 0.5894, + "step": 6355 + }, + { + "epoch": 1.6873755475906014, + "grad_norm": 0.42586938814405817, + "learning_rate": 4.105066383236841e-06, + "loss": 0.5836, + "step": 6356 + }, + { + "epoch": 1.6876410460639852, + "grad_norm": 0.4302025906478626, + "learning_rate": 4.104798697356044e-06, + "loss": 0.6025, + "step": 6357 + }, + { + "epoch": 1.6879065445373689, + "grad_norm": 0.4074057891769, + "learning_rate": 4.104530980177742e-06, + "loss": 0.5973, + "step": 6358 + }, + { + "epoch": 1.6881720430107527, + "grad_norm": 0.4229883427210265, + "learning_rate": 4.104263231707157e-06, + "loss": 0.6113, + "step": 6359 + }, + { + "epoch": 1.6884375414841366, + "grad_norm": 0.4287571724094451, + "learning_rate": 4.103995451949512e-06, + "loss": 0.5975, + "step": 6360 + }, + { + "epoch": 1.6887030399575202, + "grad_norm": 0.42496527565666037, + "learning_rate": 4.103727640910028e-06, + "loss": 0.5863, + "step": 6361 + }, + { + "epoch": 1.6889685384309039, + "grad_norm": 0.4016826548906657, + "learning_rate": 4.103459798593928e-06, + "loss": 0.5978, + "step": 6362 + }, + { + "epoch": 1.6892340369042878, + "grad_norm": 0.40552571059681175, + "learning_rate": 4.103191925006438e-06, + "loss": 0.5661, + "step": 6363 + }, + { + "epoch": 1.6894995353776716, + "grad_norm": 0.4165922795596941, + "learning_rate": 4.102924020152777e-06, + "loss": 0.6139, + "step": 6364 + }, + { + "epoch": 1.6897650338510553, + "grad_norm": 0.42427768752913114, + "learning_rate": 4.102656084038176e-06, + "loss": 0.6129, + "step": 6365 + }, + { + "epoch": 1.6900305323244391, + "grad_norm": 0.40137273718291483, + "learning_rate": 4.102388116667856e-06, + "loss": 0.6225, + "step": 6366 + }, + { + "epoch": 1.690296030797823, + "grad_norm": 0.42317600116874127, + "learning_rate": 4.102120118047044e-06, + "loss": 0.5697, + "step": 6367 + }, + { + "epoch": 1.6905615292712066, + "grad_norm": 0.40658546027517756, + "learning_rate": 4.101852088180967e-06, + "loss": 0.6202, + "step": 6368 + }, + { + "epoch": 1.6908270277445905, + "grad_norm": 0.42287643576618905, + "learning_rate": 4.101584027074854e-06, + "loss": 0.6062, + "step": 6369 + }, + { + "epoch": 1.6910925262179743, + "grad_norm": 0.410552811964971, + "learning_rate": 4.101315934733929e-06, + "loss": 0.6238, + "step": 6370 + }, + { + "epoch": 1.691358024691358, + "grad_norm": 0.4278473776348954, + "learning_rate": 4.101047811163424e-06, + "loss": 0.5711, + "step": 6371 + }, + { + "epoch": 1.6916235231647418, + "grad_norm": 0.42162881407903335, + "learning_rate": 4.100779656368566e-06, + "loss": 0.6005, + "step": 6372 + }, + { + "epoch": 1.6918890216381257, + "grad_norm": 0.42086775835127227, + "learning_rate": 4.100511470354585e-06, + "loss": 0.5609, + "step": 6373 + }, + { + "epoch": 1.6921545201115094, + "grad_norm": 0.4168206115018423, + "learning_rate": 4.100243253126712e-06, + "loss": 0.5981, + "step": 6374 + }, + { + "epoch": 1.692420018584893, + "grad_norm": 0.43982691121191747, + "learning_rate": 4.099975004690178e-06, + "loss": 0.5925, + "step": 6375 + }, + { + "epoch": 1.692685517058277, + "grad_norm": 0.4070570648371935, + "learning_rate": 4.0997067250502134e-06, + "loss": 0.5768, + "step": 6376 + }, + { + "epoch": 1.6929510155316607, + "grad_norm": 0.41347871930142327, + "learning_rate": 4.099438414212051e-06, + "loss": 0.5498, + "step": 6377 + }, + { + "epoch": 1.6932165140050444, + "grad_norm": 0.41634321760258314, + "learning_rate": 4.099170072180923e-06, + "loss": 0.57, + "step": 6378 + }, + { + "epoch": 1.6934820124784282, + "grad_norm": 0.43542058241940884, + "learning_rate": 4.098901698962063e-06, + "loss": 0.5903, + "step": 6379 + }, + { + "epoch": 1.693747510951812, + "grad_norm": 0.4229693905665481, + "learning_rate": 4.098633294560707e-06, + "loss": 0.5431, + "step": 6380 + }, + { + "epoch": 1.6940130094251957, + "grad_norm": 0.4096668223370534, + "learning_rate": 4.098364858982085e-06, + "loss": 0.5871, + "step": 6381 + }, + { + "epoch": 1.6942785078985796, + "grad_norm": 0.41890765798353896, + "learning_rate": 4.098096392231437e-06, + "loss": 0.5661, + "step": 6382 + }, + { + "epoch": 1.6945440063719635, + "grad_norm": 0.41402569766025216, + "learning_rate": 4.097827894313995e-06, + "loss": 0.571, + "step": 6383 + }, + { + "epoch": 1.694809504845347, + "grad_norm": 0.42179137110450254, + "learning_rate": 4.097559365234998e-06, + "loss": 0.5534, + "step": 6384 + }, + { + "epoch": 1.695075003318731, + "grad_norm": 0.404547654102909, + "learning_rate": 4.097290804999681e-06, + "loss": 0.5778, + "step": 6385 + }, + { + "epoch": 1.6953405017921148, + "grad_norm": 0.4149106010204333, + "learning_rate": 4.097022213613283e-06, + "loss": 0.5862, + "step": 6386 + }, + { + "epoch": 1.6956060002654985, + "grad_norm": 0.415602586451017, + "learning_rate": 4.096753591081041e-06, + "loss": 0.5767, + "step": 6387 + }, + { + "epoch": 1.695871498738882, + "grad_norm": 0.40146208630611885, + "learning_rate": 4.096484937408195e-06, + "loss": 0.5506, + "step": 6388 + }, + { + "epoch": 1.6961369972122662, + "grad_norm": 0.43260958103687774, + "learning_rate": 4.096216252599984e-06, + "loss": 0.5553, + "step": 6389 + }, + { + "epoch": 1.6964024956856498, + "grad_norm": 0.40107115383249686, + "learning_rate": 4.095947536661648e-06, + "loss": 0.5973, + "step": 6390 + }, + { + "epoch": 1.6966679941590335, + "grad_norm": 0.40664575830641714, + "learning_rate": 4.0956787895984276e-06, + "loss": 0.5655, + "step": 6391 + }, + { + "epoch": 1.6969334926324173, + "grad_norm": 0.42616997752416796, + "learning_rate": 4.095410011415563e-06, + "loss": 0.5847, + "step": 6392 + }, + { + "epoch": 1.6971989911058012, + "grad_norm": 0.4325767776143576, + "learning_rate": 4.095141202118297e-06, + "loss": 0.5978, + "step": 6393 + }, + { + "epoch": 1.6974644895791848, + "grad_norm": 0.41061979151503814, + "learning_rate": 4.094872361711872e-06, + "loss": 0.5908, + "step": 6394 + }, + { + "epoch": 1.6977299880525687, + "grad_norm": 0.41320415833809454, + "learning_rate": 4.094603490201532e-06, + "loss": 0.6026, + "step": 6395 + }, + { + "epoch": 1.6979954865259526, + "grad_norm": 0.4252031107905295, + "learning_rate": 4.094334587592519e-06, + "loss": 0.5992, + "step": 6396 + }, + { + "epoch": 1.6982609849993362, + "grad_norm": 0.4171249749714777, + "learning_rate": 4.0940656538900785e-06, + "loss": 0.5963, + "step": 6397 + }, + { + "epoch": 1.69852648347272, + "grad_norm": 0.42979606864536424, + "learning_rate": 4.093796689099453e-06, + "loss": 0.5785, + "step": 6398 + }, + { + "epoch": 1.698791981946104, + "grad_norm": 0.3998948301091688, + "learning_rate": 4.093527693225891e-06, + "loss": 0.6166, + "step": 6399 + }, + { + "epoch": 1.6990574804194876, + "grad_norm": 0.3999341502225336, + "learning_rate": 4.093258666274639e-06, + "loss": 0.5772, + "step": 6400 + }, + { + "epoch": 1.6993229788928712, + "grad_norm": 0.42143661748813177, + "learning_rate": 4.0929896082509395e-06, + "loss": 0.6068, + "step": 6401 + }, + { + "epoch": 1.6995884773662553, + "grad_norm": 0.40074130024828597, + "learning_rate": 4.092720519160044e-06, + "loss": 0.5657, + "step": 6402 + }, + { + "epoch": 1.699853975839639, + "grad_norm": 0.43700781928529603, + "learning_rate": 4.092451399007198e-06, + "loss": 0.562, + "step": 6403 + }, + { + "epoch": 1.7001194743130226, + "grad_norm": 0.4050483358338015, + "learning_rate": 4.09218224779765e-06, + "loss": 0.5783, + "step": 6404 + }, + { + "epoch": 1.7003849727864064, + "grad_norm": 0.4125435779686136, + "learning_rate": 4.0919130655366504e-06, + "loss": 0.5425, + "step": 6405 + }, + { + "epoch": 1.7006504712597903, + "grad_norm": 0.40616842361924405, + "learning_rate": 4.091643852229449e-06, + "loss": 0.5794, + "step": 6406 + }, + { + "epoch": 1.700915969733174, + "grad_norm": 0.4062933433129822, + "learning_rate": 4.091374607881294e-06, + "loss": 0.5627, + "step": 6407 + }, + { + "epoch": 1.7011814682065578, + "grad_norm": 0.4192116361728902, + "learning_rate": 4.0911053324974395e-06, + "loss": 0.6051, + "step": 6408 + }, + { + "epoch": 1.7014469666799417, + "grad_norm": 0.4137945760551017, + "learning_rate": 4.0908360260831345e-06, + "loss": 0.6003, + "step": 6409 + }, + { + "epoch": 1.7017124651533253, + "grad_norm": 0.41050716920187064, + "learning_rate": 4.090566688643632e-06, + "loss": 0.6169, + "step": 6410 + }, + { + "epoch": 1.7019779636267092, + "grad_norm": 0.42012432549972434, + "learning_rate": 4.090297320184185e-06, + "loss": 0.5621, + "step": 6411 + }, + { + "epoch": 1.702243462100093, + "grad_norm": 0.40871221296596166, + "learning_rate": 4.090027920710046e-06, + "loss": 0.5968, + "step": 6412 + }, + { + "epoch": 1.7025089605734767, + "grad_norm": 0.4090334544123141, + "learning_rate": 4.08975849022647e-06, + "loss": 0.5975, + "step": 6413 + }, + { + "epoch": 1.7027744590468605, + "grad_norm": 0.41812633835746, + "learning_rate": 4.089489028738711e-06, + "loss": 0.5654, + "step": 6414 + }, + { + "epoch": 1.7030399575202444, + "grad_norm": 0.4116453932312294, + "learning_rate": 4.089219536252024e-06, + "loss": 0.5729, + "step": 6415 + }, + { + "epoch": 1.703305455993628, + "grad_norm": 0.41001971523236247, + "learning_rate": 4.0889500127716645e-06, + "loss": 0.5363, + "step": 6416 + }, + { + "epoch": 1.7035709544670117, + "grad_norm": 0.411511221648785, + "learning_rate": 4.088680458302891e-06, + "loss": 0.5983, + "step": 6417 + }, + { + "epoch": 1.7038364529403955, + "grad_norm": 0.39979271999330074, + "learning_rate": 4.0884108728509566e-06, + "loss": 0.5591, + "step": 6418 + }, + { + "epoch": 1.7041019514137794, + "grad_norm": 0.4105004758211771, + "learning_rate": 4.088141256421122e-06, + "loss": 0.5765, + "step": 6419 + }, + { + "epoch": 1.704367449887163, + "grad_norm": 0.41020123287357524, + "learning_rate": 4.087871609018645e-06, + "loss": 0.636, + "step": 6420 + }, + { + "epoch": 1.704632948360547, + "grad_norm": 0.40750140931572637, + "learning_rate": 4.087601930648783e-06, + "loss": 0.569, + "step": 6421 + }, + { + "epoch": 1.7048984468339308, + "grad_norm": 0.417438342071532, + "learning_rate": 4.087332221316797e-06, + "loss": 0.5523, + "step": 6422 + }, + { + "epoch": 1.7051639453073144, + "grad_norm": 0.41578059154919833, + "learning_rate": 4.087062481027946e-06, + "loss": 0.5877, + "step": 6423 + }, + { + "epoch": 1.7054294437806983, + "grad_norm": 0.41194798600497096, + "learning_rate": 4.08679270978749e-06, + "loss": 0.6104, + "step": 6424 + }, + { + "epoch": 1.7056949422540821, + "grad_norm": 0.4268203060982029, + "learning_rate": 4.086522907600692e-06, + "loss": 0.5771, + "step": 6425 + }, + { + "epoch": 1.7059604407274658, + "grad_norm": 0.4270837848572207, + "learning_rate": 4.086253074472812e-06, + "loss": 0.5695, + "step": 6426 + }, + { + "epoch": 1.7062259392008496, + "grad_norm": 0.40103513994611456, + "learning_rate": 4.085983210409114e-06, + "loss": 0.5901, + "step": 6427 + }, + { + "epoch": 1.7064914376742335, + "grad_norm": 0.4040823716747356, + "learning_rate": 4.08571331541486e-06, + "loss": 0.5956, + "step": 6428 + }, + { + "epoch": 1.7067569361476171, + "grad_norm": 0.425820221533074, + "learning_rate": 4.085443389495314e-06, + "loss": 0.5642, + "step": 6429 + }, + { + "epoch": 1.7070224346210008, + "grad_norm": 0.4227111587250706, + "learning_rate": 4.08517343265574e-06, + "loss": 0.59, + "step": 6430 + }, + { + "epoch": 1.7072879330943849, + "grad_norm": 0.4077812801578376, + "learning_rate": 4.084903444901402e-06, + "loss": 0.5754, + "step": 6431 + }, + { + "epoch": 1.7075534315677685, + "grad_norm": 0.4231708024750626, + "learning_rate": 4.084633426237568e-06, + "loss": 0.5746, + "step": 6432 + }, + { + "epoch": 1.7078189300411522, + "grad_norm": 0.4181944281392462, + "learning_rate": 4.084363376669501e-06, + "loss": 0.567, + "step": 6433 + }, + { + "epoch": 1.708084428514536, + "grad_norm": 0.4112539176711787, + "learning_rate": 4.0840932962024695e-06, + "loss": 0.5812, + "step": 6434 + }, + { + "epoch": 1.7083499269879199, + "grad_norm": 0.39504209909000465, + "learning_rate": 4.083823184841741e-06, + "loss": 0.5287, + "step": 6435 + }, + { + "epoch": 1.7086154254613035, + "grad_norm": 0.4077716554327273, + "learning_rate": 4.083553042592581e-06, + "loss": 0.528, + "step": 6436 + }, + { + "epoch": 1.7088809239346874, + "grad_norm": 0.41785135524504974, + "learning_rate": 4.08328286946026e-06, + "loss": 0.5905, + "step": 6437 + }, + { + "epoch": 1.7091464224080712, + "grad_norm": 0.393431350335522, + "learning_rate": 4.083012665450047e-06, + "loss": 0.597, + "step": 6438 + }, + { + "epoch": 1.7094119208814549, + "grad_norm": 0.3964504355407762, + "learning_rate": 4.08274243056721e-06, + "loss": 0.5872, + "step": 6439 + }, + { + "epoch": 1.7096774193548387, + "grad_norm": 0.40600021646242535, + "learning_rate": 4.082472164817021e-06, + "loss": 0.5958, + "step": 6440 + }, + { + "epoch": 1.7099429178282226, + "grad_norm": 0.38750031990339173, + "learning_rate": 4.082201868204751e-06, + "loss": 0.5506, + "step": 6441 + }, + { + "epoch": 1.7102084163016062, + "grad_norm": 0.4055901711825753, + "learning_rate": 4.08193154073567e-06, + "loss": 0.5962, + "step": 6442 + }, + { + "epoch": 1.71047391477499, + "grad_norm": 0.4067784155990374, + "learning_rate": 4.081661182415051e-06, + "loss": 0.576, + "step": 6443 + }, + { + "epoch": 1.710739413248374, + "grad_norm": 0.416429523114435, + "learning_rate": 4.081390793248166e-06, + "loss": 0.5791, + "step": 6444 + }, + { + "epoch": 1.7110049117217576, + "grad_norm": 0.4072772005433632, + "learning_rate": 4.081120373240288e-06, + "loss": 0.5927, + "step": 6445 + }, + { + "epoch": 1.7112704101951413, + "grad_norm": 0.39880803456168584, + "learning_rate": 4.080849922396692e-06, + "loss": 0.5874, + "step": 6446 + }, + { + "epoch": 1.7115359086685251, + "grad_norm": 0.4019850215024897, + "learning_rate": 4.080579440722653e-06, + "loss": 0.5968, + "step": 6447 + }, + { + "epoch": 1.711801407141909, + "grad_norm": 0.39670634230166185, + "learning_rate": 4.080308928223444e-06, + "loss": 0.559, + "step": 6448 + }, + { + "epoch": 1.7120669056152926, + "grad_norm": 0.410121864042993, + "learning_rate": 4.080038384904341e-06, + "loss": 0.6348, + "step": 6449 + }, + { + "epoch": 1.7123324040886765, + "grad_norm": 0.4080685061246559, + "learning_rate": 4.0797678107706215e-06, + "loss": 0.593, + "step": 6450 + }, + { + "epoch": 1.7125979025620603, + "grad_norm": 0.406464197577668, + "learning_rate": 4.079497205827563e-06, + "loss": 0.5889, + "step": 6451 + }, + { + "epoch": 1.712863401035444, + "grad_norm": 0.3972498954039883, + "learning_rate": 4.0792265700804405e-06, + "loss": 0.6046, + "step": 6452 + }, + { + "epoch": 1.7131288995088279, + "grad_norm": 0.4041997439336237, + "learning_rate": 4.078955903534533e-06, + "loss": 0.5807, + "step": 6453 + }, + { + "epoch": 1.7133943979822117, + "grad_norm": 0.4094804300767878, + "learning_rate": 4.0786852061951195e-06, + "loss": 0.5833, + "step": 6454 + }, + { + "epoch": 1.7136598964555954, + "grad_norm": 0.3990464385465636, + "learning_rate": 4.07841447806748e-06, + "loss": 0.6179, + "step": 6455 + }, + { + "epoch": 1.713925394928979, + "grad_norm": 0.4211035646587186, + "learning_rate": 4.078143719156893e-06, + "loss": 0.5843, + "step": 6456 + }, + { + "epoch": 1.714190893402363, + "grad_norm": 0.3862936986970333, + "learning_rate": 4.0778729294686395e-06, + "loss": 0.5577, + "step": 6457 + }, + { + "epoch": 1.7144563918757467, + "grad_norm": 0.4065149957277128, + "learning_rate": 4.077602109008001e-06, + "loss": 0.5903, + "step": 6458 + }, + { + "epoch": 1.7147218903491304, + "grad_norm": 0.4114984045862392, + "learning_rate": 4.077331257780257e-06, + "loss": 0.6215, + "step": 6459 + }, + { + "epoch": 1.7149873888225142, + "grad_norm": 0.41412438434444615, + "learning_rate": 4.077060375790694e-06, + "loss": 0.5886, + "step": 6460 + }, + { + "epoch": 1.715252887295898, + "grad_norm": 0.4145478491544096, + "learning_rate": 4.076789463044591e-06, + "loss": 0.6185, + "step": 6461 + }, + { + "epoch": 1.7155183857692817, + "grad_norm": 0.4079812481134643, + "learning_rate": 4.076518519547232e-06, + "loss": 0.5242, + "step": 6462 + }, + { + "epoch": 1.7157838842426656, + "grad_norm": 0.40181893687288417, + "learning_rate": 4.076247545303904e-06, + "loss": 0.5752, + "step": 6463 + }, + { + "epoch": 1.7160493827160495, + "grad_norm": 0.4280001907689151, + "learning_rate": 4.075976540319888e-06, + "loss": 0.5889, + "step": 6464 + }, + { + "epoch": 1.716314881189433, + "grad_norm": 0.40462215549039166, + "learning_rate": 4.075705504600471e-06, + "loss": 0.6138, + "step": 6465 + }, + { + "epoch": 1.716580379662817, + "grad_norm": 0.4202453038672166, + "learning_rate": 4.075434438150939e-06, + "loss": 0.6127, + "step": 6466 + }, + { + "epoch": 1.7168458781362008, + "grad_norm": 0.40610066860651095, + "learning_rate": 4.075163340976578e-06, + "loss": 0.5867, + "step": 6467 + }, + { + "epoch": 1.7171113766095845, + "grad_norm": 0.41606926502164887, + "learning_rate": 4.074892213082676e-06, + "loss": 0.6096, + "step": 6468 + }, + { + "epoch": 1.7173768750829683, + "grad_norm": 0.4070920623401771, + "learning_rate": 4.074621054474519e-06, + "loss": 0.5792, + "step": 6469 + }, + { + "epoch": 1.7176423735563522, + "grad_norm": 0.41991387058373475, + "learning_rate": 4.0743498651573966e-06, + "loss": 0.5878, + "step": 6470 + }, + { + "epoch": 1.7179078720297358, + "grad_norm": 0.4031444925611946, + "learning_rate": 4.074078645136597e-06, + "loss": 0.5925, + "step": 6471 + }, + { + "epoch": 1.7181733705031195, + "grad_norm": 0.40787428006592125, + "learning_rate": 4.073807394417409e-06, + "loss": 0.5918, + "step": 6472 + }, + { + "epoch": 1.7184388689765033, + "grad_norm": 0.4021743957758243, + "learning_rate": 4.073536113005125e-06, + "loss": 0.5677, + "step": 6473 + }, + { + "epoch": 1.7187043674498872, + "grad_norm": 0.40637385902873846, + "learning_rate": 4.073264800905032e-06, + "loss": 0.5862, + "step": 6474 + }, + { + "epoch": 1.7189698659232708, + "grad_norm": 0.4082521927974009, + "learning_rate": 4.072993458122425e-06, + "loss": 0.5639, + "step": 6475 + }, + { + "epoch": 1.7192353643966547, + "grad_norm": 0.4088434797337089, + "learning_rate": 4.072722084662593e-06, + "loss": 0.5662, + "step": 6476 + }, + { + "epoch": 1.7195008628700386, + "grad_norm": 0.4129241905146231, + "learning_rate": 4.072450680530831e-06, + "loss": 0.5906, + "step": 6477 + }, + { + "epoch": 1.7197663613434222, + "grad_norm": 0.4127538434151883, + "learning_rate": 4.07217924573243e-06, + "loss": 0.5959, + "step": 6478 + }, + { + "epoch": 1.720031859816806, + "grad_norm": 0.4299661159149051, + "learning_rate": 4.071907780272685e-06, + "loss": 0.5646, + "step": 6479 + }, + { + "epoch": 1.72029735829019, + "grad_norm": 0.42131397914061647, + "learning_rate": 4.071636284156888e-06, + "loss": 0.5905, + "step": 6480 + }, + { + "epoch": 1.7205628567635736, + "grad_norm": 0.4040503166510062, + "learning_rate": 4.071364757390337e-06, + "loss": 0.5559, + "step": 6481 + }, + { + "epoch": 1.7208283552369574, + "grad_norm": 0.4229742119781497, + "learning_rate": 4.071093199978325e-06, + "loss": 0.559, + "step": 6482 + }, + { + "epoch": 1.7210938537103413, + "grad_norm": 0.4091464314884923, + "learning_rate": 4.070821611926149e-06, + "loss": 0.6037, + "step": 6483 + }, + { + "epoch": 1.721359352183725, + "grad_norm": 0.41133828087437535, + "learning_rate": 4.070549993239106e-06, + "loss": 0.5789, + "step": 6484 + }, + { + "epoch": 1.7216248506571086, + "grad_norm": 0.4103944419430368, + "learning_rate": 4.070278343922492e-06, + "loss": 0.5738, + "step": 6485 + }, + { + "epoch": 1.7218903491304927, + "grad_norm": 0.4053549285718851, + "learning_rate": 4.070006663981606e-06, + "loss": 0.5963, + "step": 6486 + }, + { + "epoch": 1.7221558476038763, + "grad_norm": 0.4095061100300723, + "learning_rate": 4.069734953421746e-06, + "loss": 0.6, + "step": 6487 + }, + { + "epoch": 1.72242134607726, + "grad_norm": 0.41881698139462603, + "learning_rate": 4.0694632122482125e-06, + "loss": 0.58, + "step": 6488 + }, + { + "epoch": 1.7226868445506438, + "grad_norm": 0.40162987009694756, + "learning_rate": 4.069191440466302e-06, + "loss": 0.5906, + "step": 6489 + }, + { + "epoch": 1.7229523430240277, + "grad_norm": 0.40125551385564623, + "learning_rate": 4.0689196380813164e-06, + "loss": 0.5859, + "step": 6490 + }, + { + "epoch": 1.7232178414974113, + "grad_norm": 0.4096344823890862, + "learning_rate": 4.0686478050985565e-06, + "loss": 0.5923, + "step": 6491 + }, + { + "epoch": 1.7234833399707952, + "grad_norm": 0.4172744880573066, + "learning_rate": 4.068375941523325e-06, + "loss": 0.5975, + "step": 6492 + }, + { + "epoch": 1.723748838444179, + "grad_norm": 0.4183005841209401, + "learning_rate": 4.068104047360922e-06, + "loss": 0.5991, + "step": 6493 + }, + { + "epoch": 1.7240143369175627, + "grad_norm": 0.3965036355412523, + "learning_rate": 4.06783212261665e-06, + "loss": 0.5643, + "step": 6494 + }, + { + "epoch": 1.7242798353909465, + "grad_norm": 0.4265467477525225, + "learning_rate": 4.0675601672958126e-06, + "loss": 0.5802, + "step": 6495 + }, + { + "epoch": 1.7245453338643304, + "grad_norm": 0.42550491000281565, + "learning_rate": 4.067288181403715e-06, + "loss": 0.6092, + "step": 6496 + }, + { + "epoch": 1.724810832337714, + "grad_norm": 0.4056888465983238, + "learning_rate": 4.067016164945659e-06, + "loss": 0.5989, + "step": 6497 + }, + { + "epoch": 1.7250763308110977, + "grad_norm": 0.41399737876617976, + "learning_rate": 4.066744117926952e-06, + "loss": 0.5669, + "step": 6498 + }, + { + "epoch": 1.7253418292844818, + "grad_norm": 0.41084712423096337, + "learning_rate": 4.066472040352899e-06, + "loss": 0.5943, + "step": 6499 + }, + { + "epoch": 1.7256073277578654, + "grad_norm": 0.4254045083340015, + "learning_rate": 4.066199932228805e-06, + "loss": 0.5726, + "step": 6500 + }, + { + "epoch": 1.725872826231249, + "grad_norm": 0.41213205181867074, + "learning_rate": 4.065927793559978e-06, + "loss": 0.551, + "step": 6501 + }, + { + "epoch": 1.726138324704633, + "grad_norm": 0.40424425759441435, + "learning_rate": 4.065655624351725e-06, + "loss": 0.5483, + "step": 6502 + }, + { + "epoch": 1.7264038231780168, + "grad_norm": 0.4226921073138124, + "learning_rate": 4.065383424609354e-06, + "loss": 0.5804, + "step": 6503 + }, + { + "epoch": 1.7266693216514004, + "grad_norm": 0.3952102821170808, + "learning_rate": 4.065111194338174e-06, + "loss": 0.6039, + "step": 6504 + }, + { + "epoch": 1.7269348201247843, + "grad_norm": 0.4125175990813606, + "learning_rate": 4.064838933543494e-06, + "loss": 0.5747, + "step": 6505 + }, + { + "epoch": 1.7272003185981681, + "grad_norm": 0.40701259448877025, + "learning_rate": 4.064566642230622e-06, + "loss": 0.6064, + "step": 6506 + }, + { + "epoch": 1.7274658170715518, + "grad_norm": 0.4197157276715124, + "learning_rate": 4.06429432040487e-06, + "loss": 0.5983, + "step": 6507 + }, + { + "epoch": 1.7277313155449356, + "grad_norm": 0.4020441473168711, + "learning_rate": 4.0640219680715485e-06, + "loss": 0.5662, + "step": 6508 + }, + { + "epoch": 1.7279968140183195, + "grad_norm": 0.4079702267667392, + "learning_rate": 4.06374958523597e-06, + "loss": 0.6116, + "step": 6509 + }, + { + "epoch": 1.7282623124917031, + "grad_norm": 0.427835795562143, + "learning_rate": 4.063477171903446e-06, + "loss": 0.6076, + "step": 6510 + }, + { + "epoch": 1.7285278109650868, + "grad_norm": 0.41371501948257755, + "learning_rate": 4.063204728079289e-06, + "loss": 0.5818, + "step": 6511 + }, + { + "epoch": 1.7287933094384709, + "grad_norm": 0.41644671838677544, + "learning_rate": 4.062932253768812e-06, + "loss": 0.5681, + "step": 6512 + }, + { + "epoch": 1.7290588079118545, + "grad_norm": 0.40854132416092004, + "learning_rate": 4.062659748977329e-06, + "loss": 0.585, + "step": 6513 + }, + { + "epoch": 1.7293243063852382, + "grad_norm": 0.4370516538734631, + "learning_rate": 4.062387213710155e-06, + "loss": 0.5907, + "step": 6514 + }, + { + "epoch": 1.729589804858622, + "grad_norm": 0.42561001434745294, + "learning_rate": 4.0621146479726055e-06, + "loss": 0.5476, + "step": 6515 + }, + { + "epoch": 1.7298553033320059, + "grad_norm": 0.4169885273394613, + "learning_rate": 4.061842051769995e-06, + "loss": 0.5692, + "step": 6516 + }, + { + "epoch": 1.7301208018053895, + "grad_norm": 0.42042977366952405, + "learning_rate": 4.061569425107641e-06, + "loss": 0.5741, + "step": 6517 + }, + { + "epoch": 1.7303863002787734, + "grad_norm": 0.4201185458931557, + "learning_rate": 4.06129676799086e-06, + "loss": 0.5888, + "step": 6518 + }, + { + "epoch": 1.7306517987521572, + "grad_norm": 0.442603475118607, + "learning_rate": 4.061024080424969e-06, + "loss": 0.595, + "step": 6519 + }, + { + "epoch": 1.7309172972255409, + "grad_norm": 0.4096136870681202, + "learning_rate": 4.060751362415286e-06, + "loss": 0.5837, + "step": 6520 + }, + { + "epoch": 1.7311827956989247, + "grad_norm": 0.39845176144785643, + "learning_rate": 4.060478613967131e-06, + "loss": 0.5815, + "step": 6521 + }, + { + "epoch": 1.7314482941723086, + "grad_norm": 0.4172787874499624, + "learning_rate": 4.060205835085821e-06, + "loss": 0.5576, + "step": 6522 + }, + { + "epoch": 1.7317137926456923, + "grad_norm": 0.40926243043631466, + "learning_rate": 4.059933025776678e-06, + "loss": 0.591, + "step": 6523 + }, + { + "epoch": 1.7319792911190761, + "grad_norm": 0.40419760902700513, + "learning_rate": 4.059660186045023e-06, + "loss": 0.6001, + "step": 6524 + }, + { + "epoch": 1.73224478959246, + "grad_norm": 0.42930322248224045, + "learning_rate": 4.059387315896173e-06, + "loss": 0.5926, + "step": 6525 + }, + { + "epoch": 1.7325102880658436, + "grad_norm": 0.4186394730990391, + "learning_rate": 4.059114415335453e-06, + "loss": 0.5695, + "step": 6526 + }, + { + "epoch": 1.7327757865392273, + "grad_norm": 0.4286327221476898, + "learning_rate": 4.058841484368186e-06, + "loss": 0.5832, + "step": 6527 + }, + { + "epoch": 1.7330412850126113, + "grad_norm": 0.40727958260971603, + "learning_rate": 4.0585685229996915e-06, + "loss": 0.5557, + "step": 6528 + }, + { + "epoch": 1.733306783485995, + "grad_norm": 0.4232909270065776, + "learning_rate": 4.058295531235296e-06, + "loss": 0.5453, + "step": 6529 + }, + { + "epoch": 1.7335722819593786, + "grad_norm": 0.4088641700793539, + "learning_rate": 4.058022509080322e-06, + "loss": 0.5624, + "step": 6530 + }, + { + "epoch": 1.7338377804327625, + "grad_norm": 0.4241057977537865, + "learning_rate": 4.0577494565400944e-06, + "loss": 0.5991, + "step": 6531 + }, + { + "epoch": 1.7341032789061464, + "grad_norm": 0.4075900696694043, + "learning_rate": 4.057476373619938e-06, + "loss": 0.5869, + "step": 6532 + }, + { + "epoch": 1.73436877737953, + "grad_norm": 0.3968186468150876, + "learning_rate": 4.05720326032518e-06, + "loss": 0.5819, + "step": 6533 + }, + { + "epoch": 1.7346342758529139, + "grad_norm": 0.4289654645204103, + "learning_rate": 4.056930116661144e-06, + "loss": 0.5416, + "step": 6534 + }, + { + "epoch": 1.7348997743262977, + "grad_norm": 0.4277504534371669, + "learning_rate": 4.05665694263316e-06, + "loss": 0.5946, + "step": 6535 + }, + { + "epoch": 1.7351652727996814, + "grad_norm": 0.4229210958706269, + "learning_rate": 4.056383738246554e-06, + "loss": 0.5883, + "step": 6536 + }, + { + "epoch": 1.7354307712730652, + "grad_norm": 0.4053274095420921, + "learning_rate": 4.056110503506654e-06, + "loss": 0.5768, + "step": 6537 + }, + { + "epoch": 1.735696269746449, + "grad_norm": 0.3984035050388826, + "learning_rate": 4.055837238418791e-06, + "loss": 0.5626, + "step": 6538 + }, + { + "epoch": 1.7359617682198327, + "grad_norm": 0.43063569711169525, + "learning_rate": 4.05556394298829e-06, + "loss": 0.596, + "step": 6539 + }, + { + "epoch": 1.7362272666932164, + "grad_norm": 0.4163124255485391, + "learning_rate": 4.0552906172204845e-06, + "loss": 0.6004, + "step": 6540 + }, + { + "epoch": 1.7364927651666004, + "grad_norm": 0.40868346600138583, + "learning_rate": 4.055017261120704e-06, + "loss": 0.5904, + "step": 6541 + }, + { + "epoch": 1.736758263639984, + "grad_norm": 0.4076593578635097, + "learning_rate": 4.054743874694279e-06, + "loss": 0.5341, + "step": 6542 + }, + { + "epoch": 1.7370237621133677, + "grad_norm": 0.412052739537721, + "learning_rate": 4.054470457946542e-06, + "loss": 0.5846, + "step": 6543 + }, + { + "epoch": 1.7372892605867516, + "grad_norm": 0.4125055410366082, + "learning_rate": 4.054197010882826e-06, + "loss": 0.5922, + "step": 6544 + }, + { + "epoch": 1.7375547590601355, + "grad_norm": 0.4133294661631091, + "learning_rate": 4.053923533508462e-06, + "loss": 0.5897, + "step": 6545 + }, + { + "epoch": 1.737820257533519, + "grad_norm": 0.4145985645294253, + "learning_rate": 4.053650025828785e-06, + "loss": 0.5565, + "step": 6546 + }, + { + "epoch": 1.738085756006903, + "grad_norm": 0.4081788673106127, + "learning_rate": 4.0533764878491285e-06, + "loss": 0.5634, + "step": 6547 + }, + { + "epoch": 1.7383512544802868, + "grad_norm": 0.40319963446551593, + "learning_rate": 4.0531029195748265e-06, + "loss": 0.588, + "step": 6548 + }, + { + "epoch": 1.7386167529536705, + "grad_norm": 0.4061629001496505, + "learning_rate": 4.052829321011215e-06, + "loss": 0.5771, + "step": 6549 + }, + { + "epoch": 1.7388822514270543, + "grad_norm": 0.4084395782841003, + "learning_rate": 4.0525556921636296e-06, + "loss": 0.6114, + "step": 6550 + }, + { + "epoch": 1.7391477499004382, + "grad_norm": 0.4218094049712499, + "learning_rate": 4.052282033037408e-06, + "loss": 0.5761, + "step": 6551 + }, + { + "epoch": 1.7394132483738218, + "grad_norm": 0.43864736949046174, + "learning_rate": 4.052008343637885e-06, + "loss": 0.5658, + "step": 6552 + }, + { + "epoch": 1.7396787468472055, + "grad_norm": 0.4092254357176569, + "learning_rate": 4.0517346239704e-06, + "loss": 0.5668, + "step": 6553 + }, + { + "epoch": 1.7399442453205896, + "grad_norm": 0.4181490669998728, + "learning_rate": 4.05146087404029e-06, + "loss": 0.5832, + "step": 6554 + }, + { + "epoch": 1.7402097437939732, + "grad_norm": 0.43196287879216094, + "learning_rate": 4.051187093852895e-06, + "loss": 0.5782, + "step": 6555 + }, + { + "epoch": 1.7404752422673568, + "grad_norm": 0.4770861667131978, + "learning_rate": 4.050913283413553e-06, + "loss": 0.5812, + "step": 6556 + }, + { + "epoch": 1.7407407407407407, + "grad_norm": 0.4199658479211904, + "learning_rate": 4.050639442727606e-06, + "loss": 0.5751, + "step": 6557 + }, + { + "epoch": 1.7410062392141246, + "grad_norm": 0.3996931497967501, + "learning_rate": 4.050365571800392e-06, + "loss": 0.5668, + "step": 6558 + }, + { + "epoch": 1.7412717376875082, + "grad_norm": 0.4678040977474167, + "learning_rate": 4.050091670637254e-06, + "loss": 0.5583, + "step": 6559 + }, + { + "epoch": 1.741537236160892, + "grad_norm": 0.4243455377755792, + "learning_rate": 4.049817739243532e-06, + "loss": 0.5754, + "step": 6560 + }, + { + "epoch": 1.741802734634276, + "grad_norm": 0.43202984752398316, + "learning_rate": 4.049543777624571e-06, + "loss": 0.6082, + "step": 6561 + }, + { + "epoch": 1.7420682331076596, + "grad_norm": 0.40720103442299777, + "learning_rate": 4.0492697857857115e-06, + "loss": 0.5994, + "step": 6562 + }, + { + "epoch": 1.7423337315810434, + "grad_norm": 0.42362962610328386, + "learning_rate": 4.048995763732298e-06, + "loss": 0.6032, + "step": 6563 + }, + { + "epoch": 1.7425992300544273, + "grad_norm": 0.4488404055454519, + "learning_rate": 4.048721711469675e-06, + "loss": 0.5417, + "step": 6564 + }, + { + "epoch": 1.742864728527811, + "grad_norm": 0.41165074162444915, + "learning_rate": 4.048447629003186e-06, + "loss": 0.599, + "step": 6565 + }, + { + "epoch": 1.7431302270011948, + "grad_norm": 0.4077958636446163, + "learning_rate": 4.048173516338178e-06, + "loss": 0.6019, + "step": 6566 + }, + { + "epoch": 1.7433957254745787, + "grad_norm": 0.4092268470827822, + "learning_rate": 4.047899373479994e-06, + "loss": 0.5874, + "step": 6567 + }, + { + "epoch": 1.7436612239479623, + "grad_norm": 0.5179927698481733, + "learning_rate": 4.047625200433984e-06, + "loss": 0.5448, + "step": 6568 + }, + { + "epoch": 1.743926722421346, + "grad_norm": 0.42831109605807205, + "learning_rate": 4.047350997205492e-06, + "loss": 0.5797, + "step": 6569 + }, + { + "epoch": 1.7441922208947298, + "grad_norm": 0.42182548063428943, + "learning_rate": 4.047076763799869e-06, + "loss": 0.5863, + "step": 6570 + }, + { + "epoch": 1.7444577193681137, + "grad_norm": 0.4975277214813528, + "learning_rate": 4.04680250022246e-06, + "loss": 0.5956, + "step": 6571 + }, + { + "epoch": 1.7447232178414973, + "grad_norm": 0.42253041402695946, + "learning_rate": 4.046528206478615e-06, + "loss": 0.5696, + "step": 6572 + }, + { + "epoch": 1.7449887163148812, + "grad_norm": 0.41363595365217243, + "learning_rate": 4.046253882573683e-06, + "loss": 0.5485, + "step": 6573 + }, + { + "epoch": 1.745254214788265, + "grad_norm": 0.40893550641369186, + "learning_rate": 4.045979528513015e-06, + "loss": 0.5715, + "step": 6574 + }, + { + "epoch": 1.7455197132616487, + "grad_norm": 0.4154769475342251, + "learning_rate": 4.0457051443019606e-06, + "loss": 0.59, + "step": 6575 + }, + { + "epoch": 1.7457852117350325, + "grad_norm": 0.40984150144553694, + "learning_rate": 4.045430729945872e-06, + "loss": 0.6362, + "step": 6576 + }, + { + "epoch": 1.7460507102084164, + "grad_norm": 0.41910663541223325, + "learning_rate": 4.0451562854501e-06, + "loss": 0.5349, + "step": 6577 + }, + { + "epoch": 1.7463162086818, + "grad_norm": 0.4032734944892639, + "learning_rate": 4.044881810819996e-06, + "loss": 0.5586, + "step": 6578 + }, + { + "epoch": 1.746581707155184, + "grad_norm": 0.4346664314469867, + "learning_rate": 4.044607306060916e-06, + "loss": 0.5962, + "step": 6579 + }, + { + "epoch": 1.7468472056285678, + "grad_norm": 0.41575913957684874, + "learning_rate": 4.044332771178211e-06, + "loss": 0.6091, + "step": 6580 + }, + { + "epoch": 1.7471127041019514, + "grad_norm": 0.4102087393071203, + "learning_rate": 4.044058206177236e-06, + "loss": 0.5669, + "step": 6581 + }, + { + "epoch": 1.747378202575335, + "grad_norm": 0.42439716602294586, + "learning_rate": 4.0437836110633455e-06, + "loss": 0.6089, + "step": 6582 + }, + { + "epoch": 1.7476437010487191, + "grad_norm": 0.3952857331589499, + "learning_rate": 4.043508985841894e-06, + "loss": 0.6174, + "step": 6583 + }, + { + "epoch": 1.7479091995221028, + "grad_norm": 0.3998263039604155, + "learning_rate": 4.043234330518239e-06, + "loss": 0.5714, + "step": 6584 + }, + { + "epoch": 1.7481746979954864, + "grad_norm": 0.40169484274428324, + "learning_rate": 4.042959645097735e-06, + "loss": 0.5509, + "step": 6585 + }, + { + "epoch": 1.7484401964688703, + "grad_norm": 0.406623158965972, + "learning_rate": 4.04268492958574e-06, + "loss": 0.5843, + "step": 6586 + }, + { + "epoch": 1.7487056949422541, + "grad_norm": 0.4061533543998768, + "learning_rate": 4.042410183987614e-06, + "loss": 0.5658, + "step": 6587 + }, + { + "epoch": 1.7489711934156378, + "grad_norm": 0.4211592982661729, + "learning_rate": 4.042135408308711e-06, + "loss": 0.5963, + "step": 6588 + }, + { + "epoch": 1.7492366918890216, + "grad_norm": 0.40864661096244187, + "learning_rate": 4.041860602554392e-06, + "loss": 0.5803, + "step": 6589 + }, + { + "epoch": 1.7495021903624055, + "grad_norm": 0.4254625447919162, + "learning_rate": 4.041585766730016e-06, + "loss": 0.5877, + "step": 6590 + }, + { + "epoch": 1.7497676888357891, + "grad_norm": 0.41918740410262323, + "learning_rate": 4.041310900840944e-06, + "loss": 0.5882, + "step": 6591 + }, + { + "epoch": 1.750033187309173, + "grad_norm": 0.4184930025211725, + "learning_rate": 4.041036004892534e-06, + "loss": 0.5499, + "step": 6592 + }, + { + "epoch": 1.7502986857825569, + "grad_norm": 0.41521741050048006, + "learning_rate": 4.040761078890151e-06, + "loss": 0.5744, + "step": 6593 + }, + { + "epoch": 1.7505641842559405, + "grad_norm": 0.40365113020255894, + "learning_rate": 4.040486122839154e-06, + "loss": 0.5667, + "step": 6594 + }, + { + "epoch": 1.7508296827293242, + "grad_norm": 0.40818413024588884, + "learning_rate": 4.040211136744905e-06, + "loss": 0.5677, + "step": 6595 + }, + { + "epoch": 1.7510951812027082, + "grad_norm": 0.4076273584168611, + "learning_rate": 4.039936120612767e-06, + "loss": 0.5729, + "step": 6596 + }, + { + "epoch": 1.7513606796760919, + "grad_norm": 0.40528917919580143, + "learning_rate": 4.0396610744481055e-06, + "loss": 0.5872, + "step": 6597 + }, + { + "epoch": 1.7516261781494755, + "grad_norm": 0.3987071554952435, + "learning_rate": 4.0393859982562834e-06, + "loss": 0.5426, + "step": 6598 + }, + { + "epoch": 1.7518916766228594, + "grad_norm": 0.41900645372489553, + "learning_rate": 4.039110892042664e-06, + "loss": 0.5809, + "step": 6599 + }, + { + "epoch": 1.7521571750962432, + "grad_norm": 0.4069892406717045, + "learning_rate": 4.038835755812615e-06, + "loss": 0.5912, + "step": 6600 + }, + { + "epoch": 1.7524226735696269, + "grad_norm": 0.4176666863118596, + "learning_rate": 4.038560589571501e-06, + "loss": 0.5838, + "step": 6601 + }, + { + "epoch": 1.7526881720430108, + "grad_norm": 0.42499834184545204, + "learning_rate": 4.038285393324689e-06, + "loss": 0.5626, + "step": 6602 + }, + { + "epoch": 1.7529536705163946, + "grad_norm": 0.4184014937857046, + "learning_rate": 4.038010167077544e-06, + "loss": 0.5967, + "step": 6603 + }, + { + "epoch": 1.7532191689897783, + "grad_norm": 0.4229606510310555, + "learning_rate": 4.0377349108354355e-06, + "loss": 0.5593, + "step": 6604 + }, + { + "epoch": 1.7534846674631621, + "grad_norm": 0.4190484197388039, + "learning_rate": 4.037459624603731e-06, + "loss": 0.5682, + "step": 6605 + }, + { + "epoch": 1.753750165936546, + "grad_norm": 0.4040382952238776, + "learning_rate": 4.037184308387801e-06, + "loss": 0.5687, + "step": 6606 + }, + { + "epoch": 1.7540156644099296, + "grad_norm": 0.39656698413284036, + "learning_rate": 4.036908962193011e-06, + "loss": 0.5708, + "step": 6607 + }, + { + "epoch": 1.7542811628833133, + "grad_norm": 0.41894603382728196, + "learning_rate": 4.036633586024735e-06, + "loss": 0.5727, + "step": 6608 + }, + { + "epoch": 1.7545466613566973, + "grad_norm": 0.4368774187877, + "learning_rate": 4.036358179888341e-06, + "loss": 0.5592, + "step": 6609 + }, + { + "epoch": 1.754812159830081, + "grad_norm": 0.4053271824935084, + "learning_rate": 4.036082743789202e-06, + "loss": 0.5742, + "step": 6610 + }, + { + "epoch": 1.7550776583034646, + "grad_norm": 0.4025499347636373, + "learning_rate": 4.0358072777326865e-06, + "loss": 0.5453, + "step": 6611 + }, + { + "epoch": 1.7553431567768485, + "grad_norm": 0.41175185466705605, + "learning_rate": 4.0355317817241705e-06, + "loss": 0.5881, + "step": 6612 + }, + { + "epoch": 1.7556086552502324, + "grad_norm": 0.42963526717887235, + "learning_rate": 4.035256255769024e-06, + "loss": 0.6022, + "step": 6613 + }, + { + "epoch": 1.755874153723616, + "grad_norm": 0.4084833324235432, + "learning_rate": 4.034980699872622e-06, + "loss": 0.5866, + "step": 6614 + }, + { + "epoch": 1.7561396521969999, + "grad_norm": 0.40609272449252576, + "learning_rate": 4.0347051140403385e-06, + "loss": 0.5548, + "step": 6615 + }, + { + "epoch": 1.7564051506703837, + "grad_norm": 0.4247669150585388, + "learning_rate": 4.034429498277546e-06, + "loss": 0.5975, + "step": 6616 + }, + { + "epoch": 1.7566706491437674, + "grad_norm": 0.42602218826048216, + "learning_rate": 4.034153852589623e-06, + "loss": 0.5491, + "step": 6617 + }, + { + "epoch": 1.7569361476171512, + "grad_norm": 0.4029172410004458, + "learning_rate": 4.033878176981943e-06, + "loss": 0.5851, + "step": 6618 + }, + { + "epoch": 1.757201646090535, + "grad_norm": 0.41703193602472705, + "learning_rate": 4.033602471459884e-06, + "loss": 0.5651, + "step": 6619 + }, + { + "epoch": 1.7574671445639187, + "grad_norm": 0.42727637831045134, + "learning_rate": 4.033326736028821e-06, + "loss": 0.5676, + "step": 6620 + }, + { + "epoch": 1.7577326430373026, + "grad_norm": 0.41000351509458977, + "learning_rate": 4.033050970694133e-06, + "loss": 0.6082, + "step": 6621 + }, + { + "epoch": 1.7579981415106865, + "grad_norm": 0.39783516034211625, + "learning_rate": 4.032775175461196e-06, + "loss": 0.5912, + "step": 6622 + }, + { + "epoch": 1.75826363998407, + "grad_norm": 0.3967768396833056, + "learning_rate": 4.0324993503353915e-06, + "loss": 0.5979, + "step": 6623 + }, + { + "epoch": 1.7585291384574537, + "grad_norm": 0.41285797653939565, + "learning_rate": 4.0322234953220965e-06, + "loss": 0.5515, + "step": 6624 + }, + { + "epoch": 1.7587946369308376, + "grad_norm": 0.4150974076269747, + "learning_rate": 4.031947610426693e-06, + "loss": 0.5872, + "step": 6625 + }, + { + "epoch": 1.7590601354042215, + "grad_norm": 0.40716120170655684, + "learning_rate": 4.03167169565456e-06, + "loss": 0.5872, + "step": 6626 + }, + { + "epoch": 1.759325633877605, + "grad_norm": 0.407629409510305, + "learning_rate": 4.031395751011078e-06, + "loss": 0.5818, + "step": 6627 + }, + { + "epoch": 1.759591132350989, + "grad_norm": 0.4087098730370454, + "learning_rate": 4.03111977650163e-06, + "loss": 0.6263, + "step": 6628 + }, + { + "epoch": 1.7598566308243728, + "grad_norm": 0.42017328199788445, + "learning_rate": 4.030843772131597e-06, + "loss": 0.5782, + "step": 6629 + }, + { + "epoch": 1.7601221292977565, + "grad_norm": 0.39945350672825264, + "learning_rate": 4.030567737906363e-06, + "loss": 0.5714, + "step": 6630 + }, + { + "epoch": 1.7603876277711403, + "grad_norm": 0.4104803283690374, + "learning_rate": 4.03029167383131e-06, + "loss": 0.5762, + "step": 6631 + }, + { + "epoch": 1.7606531262445242, + "grad_norm": 0.4093183971814103, + "learning_rate": 4.030015579911823e-06, + "loss": 0.6214, + "step": 6632 + }, + { + "epoch": 1.7609186247179078, + "grad_norm": 0.40562923557932873, + "learning_rate": 4.0297394561532856e-06, + "loss": 0.5867, + "step": 6633 + }, + { + "epoch": 1.7611841231912917, + "grad_norm": 0.40816175435277885, + "learning_rate": 4.029463302561084e-06, + "loss": 0.6062, + "step": 6634 + }, + { + "epoch": 1.7614496216646756, + "grad_norm": 0.41015111420931805, + "learning_rate": 4.029187119140603e-06, + "loss": 0.6167, + "step": 6635 + }, + { + "epoch": 1.7617151201380592, + "grad_norm": 0.4235618497935881, + "learning_rate": 4.028910905897229e-06, + "loss": 0.5712, + "step": 6636 + }, + { + "epoch": 1.7619806186114428, + "grad_norm": 0.40568308901574224, + "learning_rate": 4.028634662836349e-06, + "loss": 0.5911, + "step": 6637 + }, + { + "epoch": 1.762246117084827, + "grad_norm": 0.4133862341077795, + "learning_rate": 4.02835838996335e-06, + "loss": 0.5633, + "step": 6638 + }, + { + "epoch": 1.7625116155582106, + "grad_norm": 0.41431337075236846, + "learning_rate": 4.028082087283621e-06, + "loss": 0.5846, + "step": 6639 + }, + { + "epoch": 1.7627771140315942, + "grad_norm": 0.4089988875488357, + "learning_rate": 4.027805754802549e-06, + "loss": 0.585, + "step": 6640 + }, + { + "epoch": 1.763042612504978, + "grad_norm": 0.40003121162604544, + "learning_rate": 4.0275293925255245e-06, + "loss": 0.6021, + "step": 6641 + }, + { + "epoch": 1.763308110978362, + "grad_norm": 0.41519379930250433, + "learning_rate": 4.027253000457937e-06, + "loss": 0.5901, + "step": 6642 + }, + { + "epoch": 1.7635736094517456, + "grad_norm": 0.41866266336762176, + "learning_rate": 4.026976578605176e-06, + "loss": 0.5641, + "step": 6643 + }, + { + "epoch": 1.7638391079251294, + "grad_norm": 0.4062009426567029, + "learning_rate": 4.026700126972633e-06, + "loss": 0.5777, + "step": 6644 + }, + { + "epoch": 1.7641046063985133, + "grad_norm": 0.40409256120154463, + "learning_rate": 4.0264236455657005e-06, + "loss": 0.6167, + "step": 6645 + }, + { + "epoch": 1.764370104871897, + "grad_norm": 0.40694387183136166, + "learning_rate": 4.026147134389769e-06, + "loss": 0.5743, + "step": 6646 + }, + { + "epoch": 1.7646356033452808, + "grad_norm": 0.42303410484491405, + "learning_rate": 4.025870593450232e-06, + "loss": 0.6023, + "step": 6647 + }, + { + "epoch": 1.7649011018186647, + "grad_norm": 0.43716108426476313, + "learning_rate": 4.025594022752483e-06, + "loss": 0.5843, + "step": 6648 + }, + { + "epoch": 1.7651666002920483, + "grad_norm": 0.45202150160351534, + "learning_rate": 4.025317422301914e-06, + "loss": 0.5743, + "step": 6649 + }, + { + "epoch": 1.765432098765432, + "grad_norm": 0.41025441912786664, + "learning_rate": 4.025040792103922e-06, + "loss": 0.6046, + "step": 6650 + }, + { + "epoch": 1.765697597238816, + "grad_norm": 0.4124058776081248, + "learning_rate": 4.024764132163899e-06, + "loss": 0.5696, + "step": 6651 + }, + { + "epoch": 1.7659630957121997, + "grad_norm": 0.4108617134818245, + "learning_rate": 4.024487442487242e-06, + "loss": 0.6156, + "step": 6652 + }, + { + "epoch": 1.7662285941855833, + "grad_norm": 0.4105477491847005, + "learning_rate": 4.02421072307935e-06, + "loss": 0.5805, + "step": 6653 + }, + { + "epoch": 1.7664940926589672, + "grad_norm": 0.42833907831174295, + "learning_rate": 4.023933973945615e-06, + "loss": 0.579, + "step": 6654 + }, + { + "epoch": 1.766759591132351, + "grad_norm": 0.4171271465893289, + "learning_rate": 4.023657195091436e-06, + "loss": 0.5553, + "step": 6655 + }, + { + "epoch": 1.7670250896057347, + "grad_norm": 0.40330478697567573, + "learning_rate": 4.023380386522211e-06, + "loss": 0.5996, + "step": 6656 + }, + { + "epoch": 1.7672905880791185, + "grad_norm": 0.41116862166087287, + "learning_rate": 4.02310354824334e-06, + "loss": 0.5758, + "step": 6657 + }, + { + "epoch": 1.7675560865525024, + "grad_norm": 0.39769075787674824, + "learning_rate": 4.02282668026022e-06, + "loss": 0.5986, + "step": 6658 + }, + { + "epoch": 1.767821585025886, + "grad_norm": 0.39000250716463525, + "learning_rate": 4.02254978257825e-06, + "loss": 0.5825, + "step": 6659 + }, + { + "epoch": 1.76808708349927, + "grad_norm": 0.40864099756177, + "learning_rate": 4.022272855202833e-06, + "loss": 0.5736, + "step": 6660 + }, + { + "epoch": 1.7683525819726538, + "grad_norm": 0.4053326510770391, + "learning_rate": 4.021995898139367e-06, + "loss": 0.5744, + "step": 6661 + }, + { + "epoch": 1.7686180804460374, + "grad_norm": 0.4136214850527255, + "learning_rate": 4.021718911393256e-06, + "loss": 0.5621, + "step": 6662 + }, + { + "epoch": 1.768883578919421, + "grad_norm": 0.4069967911742805, + "learning_rate": 4.021441894969899e-06, + "loss": 0.5657, + "step": 6663 + }, + { + "epoch": 1.7691490773928051, + "grad_norm": 0.40290058691802394, + "learning_rate": 4.0211648488747005e-06, + "loss": 0.6243, + "step": 6664 + }, + { + "epoch": 1.7694145758661888, + "grad_norm": 0.4142998862472786, + "learning_rate": 4.020887773113063e-06, + "loss": 0.5847, + "step": 6665 + }, + { + "epoch": 1.7696800743395724, + "grad_norm": 0.3995043314169401, + "learning_rate": 4.02061066769039e-06, + "loss": 0.6061, + "step": 6666 + }, + { + "epoch": 1.7699455728129563, + "grad_norm": 0.4053016745797228, + "learning_rate": 4.020333532612086e-06, + "loss": 0.5872, + "step": 6667 + }, + { + "epoch": 1.7702110712863401, + "grad_norm": 0.42178003999694946, + "learning_rate": 4.020056367883556e-06, + "loss": 0.5754, + "step": 6668 + }, + { + "epoch": 1.7704765697597238, + "grad_norm": 0.4285221713624082, + "learning_rate": 4.019779173510204e-06, + "loss": 0.598, + "step": 6669 + }, + { + "epoch": 1.7707420682331076, + "grad_norm": 0.42580296352122304, + "learning_rate": 4.019501949497438e-06, + "loss": 0.5709, + "step": 6670 + }, + { + "epoch": 1.7710075667064915, + "grad_norm": 0.404118880417849, + "learning_rate": 4.019224695850664e-06, + "loss": 0.5765, + "step": 6671 + }, + { + "epoch": 1.7712730651798752, + "grad_norm": 0.4124397369750885, + "learning_rate": 4.018947412575287e-06, + "loss": 0.5776, + "step": 6672 + }, + { + "epoch": 1.771538563653259, + "grad_norm": 0.4231230463740868, + "learning_rate": 4.018670099676718e-06, + "loss": 0.5689, + "step": 6673 + }, + { + "epoch": 1.7718040621266429, + "grad_norm": 0.39437917859929883, + "learning_rate": 4.018392757160363e-06, + "loss": 0.5943, + "step": 6674 + }, + { + "epoch": 1.7720695606000265, + "grad_norm": 0.41780077712783203, + "learning_rate": 4.018115385031632e-06, + "loss": 0.5705, + "step": 6675 + }, + { + "epoch": 1.7723350590734104, + "grad_norm": 0.40785215920560286, + "learning_rate": 4.017837983295934e-06, + "loss": 0.5825, + "step": 6676 + }, + { + "epoch": 1.7726005575467942, + "grad_norm": 0.43008972547146007, + "learning_rate": 4.017560551958679e-06, + "loss": 0.5783, + "step": 6677 + }, + { + "epoch": 1.7728660560201779, + "grad_norm": 0.40849145662135616, + "learning_rate": 4.0172830910252765e-06, + "loss": 0.6143, + "step": 6678 + }, + { + "epoch": 1.7731315544935615, + "grad_norm": 0.4278505762267433, + "learning_rate": 4.01700560050114e-06, + "loss": 0.5647, + "step": 6679 + }, + { + "epoch": 1.7733970529669454, + "grad_norm": 0.3951472079430473, + "learning_rate": 4.01672808039168e-06, + "loss": 0.587, + "step": 6680 + }, + { + "epoch": 1.7736625514403292, + "grad_norm": 0.4141906323868477, + "learning_rate": 4.016450530702309e-06, + "loss": 0.6067, + "step": 6681 + }, + { + "epoch": 1.773928049913713, + "grad_norm": 0.4153852526456724, + "learning_rate": 4.01617295143844e-06, + "loss": 0.5737, + "step": 6682 + }, + { + "epoch": 1.7741935483870968, + "grad_norm": 0.41014387610959574, + "learning_rate": 4.015895342605485e-06, + "loss": 0.5804, + "step": 6683 + }, + { + "epoch": 1.7744590468604806, + "grad_norm": 0.4009121828074509, + "learning_rate": 4.01561770420886e-06, + "loss": 0.5796, + "step": 6684 + }, + { + "epoch": 1.7747245453338643, + "grad_norm": 0.4196681349617996, + "learning_rate": 4.015340036253979e-06, + "loss": 0.5752, + "step": 6685 + }, + { + "epoch": 1.7749900438072481, + "grad_norm": 0.4170897476613166, + "learning_rate": 4.015062338746256e-06, + "loss": 0.5579, + "step": 6686 + }, + { + "epoch": 1.775255542280632, + "grad_norm": 0.3999841141295786, + "learning_rate": 4.014784611691109e-06, + "loss": 0.53, + "step": 6687 + }, + { + "epoch": 1.7755210407540156, + "grad_norm": 0.416260651993525, + "learning_rate": 4.014506855093952e-06, + "loss": 0.6065, + "step": 6688 + }, + { + "epoch": 1.7757865392273995, + "grad_norm": 0.4090105779456124, + "learning_rate": 4.014229068960205e-06, + "loss": 0.5788, + "step": 6689 + }, + { + "epoch": 1.7760520377007833, + "grad_norm": 0.40350553304500164, + "learning_rate": 4.013951253295283e-06, + "loss": 0.6055, + "step": 6690 + }, + { + "epoch": 1.776317536174167, + "grad_norm": 0.41113286065162047, + "learning_rate": 4.013673408104604e-06, + "loss": 0.6118, + "step": 6691 + }, + { + "epoch": 1.7765830346475506, + "grad_norm": 0.40203800212551655, + "learning_rate": 4.0133955333935884e-06, + "loss": 0.5919, + "step": 6692 + }, + { + "epoch": 1.7768485331209347, + "grad_norm": 0.40985005754769566, + "learning_rate": 4.013117629167653e-06, + "loss": 0.5653, + "step": 6693 + }, + { + "epoch": 1.7771140315943184, + "grad_norm": 0.3946564108519016, + "learning_rate": 4.012839695432221e-06, + "loss": 0.5996, + "step": 6694 + }, + { + "epoch": 1.777379530067702, + "grad_norm": 0.4115639074175485, + "learning_rate": 4.01256173219271e-06, + "loss": 0.5994, + "step": 6695 + }, + { + "epoch": 1.7776450285410859, + "grad_norm": 0.4175388887118231, + "learning_rate": 4.012283739454542e-06, + "loss": 0.6014, + "step": 6696 + }, + { + "epoch": 1.7779105270144697, + "grad_norm": 0.41057168848001685, + "learning_rate": 4.012005717223138e-06, + "loss": 0.5651, + "step": 6697 + }, + { + "epoch": 1.7781760254878534, + "grad_norm": 0.4199201558044919, + "learning_rate": 4.01172766550392e-06, + "loss": 0.5969, + "step": 6698 + }, + { + "epoch": 1.7784415239612372, + "grad_norm": 0.39833980333063823, + "learning_rate": 4.0114495843023125e-06, + "loss": 0.5796, + "step": 6699 + }, + { + "epoch": 1.778707022434621, + "grad_norm": 0.4185999555153697, + "learning_rate": 4.011171473623736e-06, + "loss": 0.5964, + "step": 6700 + }, + { + "epoch": 1.7789725209080047, + "grad_norm": 0.43427690545700415, + "learning_rate": 4.010893333473617e-06, + "loss": 0.5547, + "step": 6701 + }, + { + "epoch": 1.7792380193813886, + "grad_norm": 0.417941478982828, + "learning_rate": 4.010615163857379e-06, + "loss": 0.634, + "step": 6702 + }, + { + "epoch": 1.7795035178547725, + "grad_norm": 0.4144390171721843, + "learning_rate": 4.010336964780445e-06, + "loss": 0.5437, + "step": 6703 + }, + { + "epoch": 1.779769016328156, + "grad_norm": 0.43920459699861164, + "learning_rate": 4.0100587362482435e-06, + "loss": 0.5964, + "step": 6704 + }, + { + "epoch": 1.7800345148015397, + "grad_norm": 0.41801525737566153, + "learning_rate": 4.009780478266199e-06, + "loss": 0.5545, + "step": 6705 + }, + { + "epoch": 1.7803000132749238, + "grad_norm": 0.4114136717501727, + "learning_rate": 4.009502190839739e-06, + "loss": 0.5627, + "step": 6706 + }, + { + "epoch": 1.7805655117483075, + "grad_norm": 0.4067294335395838, + "learning_rate": 4.00922387397429e-06, + "loss": 0.5929, + "step": 6707 + }, + { + "epoch": 1.780831010221691, + "grad_norm": 0.43645275526314004, + "learning_rate": 4.008945527675281e-06, + "loss": 0.5867, + "step": 6708 + }, + { + "epoch": 1.781096508695075, + "grad_norm": 0.4289717337771052, + "learning_rate": 4.008667151948138e-06, + "loss": 0.578, + "step": 6709 + }, + { + "epoch": 1.7813620071684588, + "grad_norm": 0.41452371991451675, + "learning_rate": 4.008388746798293e-06, + "loss": 0.5771, + "step": 6710 + }, + { + "epoch": 1.7816275056418425, + "grad_norm": 0.4124151606917401, + "learning_rate": 4.008110312231175e-06, + "loss": 0.5485, + "step": 6711 + }, + { + "epoch": 1.7818930041152263, + "grad_norm": 0.43148046562706127, + "learning_rate": 4.007831848252212e-06, + "loss": 0.5779, + "step": 6712 + }, + { + "epoch": 1.7821585025886102, + "grad_norm": 0.41277836263795215, + "learning_rate": 4.007553354866835e-06, + "loss": 0.5757, + "step": 6713 + }, + { + "epoch": 1.7824240010619938, + "grad_norm": 0.39845628701136115, + "learning_rate": 4.007274832080479e-06, + "loss": 0.5624, + "step": 6714 + }, + { + "epoch": 1.7826894995353777, + "grad_norm": 0.4023912322385927, + "learning_rate": 4.006996279898572e-06, + "loss": 0.6067, + "step": 6715 + }, + { + "epoch": 1.7829549980087616, + "grad_norm": 0.4145733079155911, + "learning_rate": 4.006717698326548e-06, + "loss": 0.6021, + "step": 6716 + }, + { + "epoch": 1.7832204964821452, + "grad_norm": 0.4324421611238071, + "learning_rate": 4.00643908736984e-06, + "loss": 0.5844, + "step": 6717 + }, + { + "epoch": 1.7834859949555288, + "grad_norm": 0.4033355141476237, + "learning_rate": 4.006160447033881e-06, + "loss": 0.5926, + "step": 6718 + }, + { + "epoch": 1.783751493428913, + "grad_norm": 0.4090873830746304, + "learning_rate": 4.005881777324106e-06, + "loss": 0.5773, + "step": 6719 + }, + { + "epoch": 1.7840169919022966, + "grad_norm": 0.41451345115315874, + "learning_rate": 4.005603078245949e-06, + "loss": 0.6093, + "step": 6720 + }, + { + "epoch": 1.7842824903756802, + "grad_norm": 0.4176807508533954, + "learning_rate": 4.005324349804845e-06, + "loss": 0.5491, + "step": 6721 + }, + { + "epoch": 1.784547988849064, + "grad_norm": 0.41376046114487586, + "learning_rate": 4.005045592006232e-06, + "loss": 0.5884, + "step": 6722 + }, + { + "epoch": 1.784813487322448, + "grad_norm": 0.4144505435566118, + "learning_rate": 4.004766804855544e-06, + "loss": 0.5413, + "step": 6723 + }, + { + "epoch": 1.7850789857958316, + "grad_norm": 0.4212963988314677, + "learning_rate": 4.00448798835822e-06, + "loss": 0.5839, + "step": 6724 + }, + { + "epoch": 1.7853444842692154, + "grad_norm": 0.4156929550650738, + "learning_rate": 4.0042091425196946e-06, + "loss": 0.594, + "step": 6725 + }, + { + "epoch": 1.7856099827425993, + "grad_norm": 0.4231670247745961, + "learning_rate": 4.00393026734541e-06, + "loss": 0.5716, + "step": 6726 + }, + { + "epoch": 1.785875481215983, + "grad_norm": 0.4032690912634656, + "learning_rate": 4.003651362840803e-06, + "loss": 0.5574, + "step": 6727 + }, + { + "epoch": 1.7861409796893668, + "grad_norm": 0.41497692317316043, + "learning_rate": 4.003372429011312e-06, + "loss": 0.584, + "step": 6728 + }, + { + "epoch": 1.7864064781627507, + "grad_norm": 0.41700836564965743, + "learning_rate": 4.003093465862377e-06, + "loss": 0.5593, + "step": 6729 + }, + { + "epoch": 1.7866719766361343, + "grad_norm": 0.44354906312780357, + "learning_rate": 4.00281447339944e-06, + "loss": 0.5661, + "step": 6730 + }, + { + "epoch": 1.7869374751095182, + "grad_norm": 0.41971133136509464, + "learning_rate": 4.002535451627941e-06, + "loss": 0.5912, + "step": 6731 + }, + { + "epoch": 1.787202973582902, + "grad_norm": 0.4046616678096043, + "learning_rate": 4.002256400553323e-06, + "loss": 0.5963, + "step": 6732 + }, + { + "epoch": 1.7874684720562857, + "grad_norm": 0.4068544354994896, + "learning_rate": 4.001977320181025e-06, + "loss": 0.5511, + "step": 6733 + }, + { + "epoch": 1.7877339705296693, + "grad_norm": 0.4124776198168891, + "learning_rate": 4.001698210516494e-06, + "loss": 0.5772, + "step": 6734 + }, + { + "epoch": 1.7879994690030534, + "grad_norm": 0.4231214718807104, + "learning_rate": 4.001419071565169e-06, + "loss": 0.598, + "step": 6735 + }, + { + "epoch": 1.788264967476437, + "grad_norm": 0.40523243305685236, + "learning_rate": 4.001139903332497e-06, + "loss": 0.5751, + "step": 6736 + }, + { + "epoch": 1.7885304659498207, + "grad_norm": 0.4227323818227764, + "learning_rate": 4.000860705823921e-06, + "loss": 0.61, + "step": 6737 + }, + { + "epoch": 1.7887959644232045, + "grad_norm": 0.4103721905254452, + "learning_rate": 4.000581479044887e-06, + "loss": 0.5788, + "step": 6738 + }, + { + "epoch": 1.7890614628965884, + "grad_norm": 0.407688642094616, + "learning_rate": 4.00030222300084e-06, + "loss": 0.6018, + "step": 6739 + }, + { + "epoch": 1.789326961369972, + "grad_norm": 0.40338591151942643, + "learning_rate": 4.000022937697226e-06, + "loss": 0.5533, + "step": 6740 + }, + { + "epoch": 1.789592459843356, + "grad_norm": 0.4125465844528528, + "learning_rate": 3.999743623139493e-06, + "loss": 0.5632, + "step": 6741 + }, + { + "epoch": 1.7898579583167398, + "grad_norm": 0.4202690967650945, + "learning_rate": 3.9994642793330865e-06, + "loss": 0.5664, + "step": 6742 + }, + { + "epoch": 1.7901234567901234, + "grad_norm": 0.41986284840292776, + "learning_rate": 3.999184906283455e-06, + "loss": 0.5673, + "step": 6743 + }, + { + "epoch": 1.7903889552635073, + "grad_norm": 0.4233748241497013, + "learning_rate": 3.998905503996048e-06, + "loss": 0.6221, + "step": 6744 + }, + { + "epoch": 1.7906544537368911, + "grad_norm": 0.41751933140947617, + "learning_rate": 3.998626072476314e-06, + "loss": 0.6071, + "step": 6745 + }, + { + "epoch": 1.7909199522102748, + "grad_norm": 0.401131030236606, + "learning_rate": 3.998346611729701e-06, + "loss": 0.5586, + "step": 6746 + }, + { + "epoch": 1.7911854506836584, + "grad_norm": 0.408037665025668, + "learning_rate": 3.998067121761661e-06, + "loss": 0.5597, + "step": 6747 + }, + { + "epoch": 1.7914509491570425, + "grad_norm": 0.4109719261712508, + "learning_rate": 3.997787602577644e-06, + "loss": 0.5689, + "step": 6748 + }, + { + "epoch": 1.7917164476304261, + "grad_norm": 0.4191907985438587, + "learning_rate": 3.997508054183102e-06, + "loss": 0.5607, + "step": 6749 + }, + { + "epoch": 1.7919819461038098, + "grad_norm": 0.4025224186409604, + "learning_rate": 3.997228476583487e-06, + "loss": 0.6024, + "step": 6750 + }, + { + "epoch": 1.7922474445771936, + "grad_norm": 0.42400758948127865, + "learning_rate": 3.99694886978425e-06, + "loss": 0.605, + "step": 6751 + }, + { + "epoch": 1.7925129430505775, + "grad_norm": 0.4088776238221803, + "learning_rate": 3.996669233790845e-06, + "loss": 0.5648, + "step": 6752 + }, + { + "epoch": 1.7927784415239612, + "grad_norm": 0.4005017286578978, + "learning_rate": 3.996389568608725e-06, + "loss": 0.5738, + "step": 6753 + }, + { + "epoch": 1.793043939997345, + "grad_norm": 0.409950117296982, + "learning_rate": 3.9961098742433456e-06, + "loss": 0.5992, + "step": 6754 + }, + { + "epoch": 1.7933094384707289, + "grad_norm": 0.4094387153339892, + "learning_rate": 3.99583015070016e-06, + "loss": 0.5652, + "step": 6755 + }, + { + "epoch": 1.7935749369441125, + "grad_norm": 0.4160710863725598, + "learning_rate": 3.995550397984624e-06, + "loss": 0.5789, + "step": 6756 + }, + { + "epoch": 1.7938404354174964, + "grad_norm": 0.41428489369513855, + "learning_rate": 3.9952706161021935e-06, + "loss": 0.5942, + "step": 6757 + }, + { + "epoch": 1.7941059338908802, + "grad_norm": 0.4239792117802188, + "learning_rate": 3.994990805058325e-06, + "loss": 0.5833, + "step": 6758 + }, + { + "epoch": 1.7943714323642639, + "grad_norm": 0.42909582328902346, + "learning_rate": 3.994710964858475e-06, + "loss": 0.602, + "step": 6759 + }, + { + "epoch": 1.7946369308376475, + "grad_norm": 0.40918627492853654, + "learning_rate": 3.994431095508102e-06, + "loss": 0.5934, + "step": 6760 + }, + { + "epoch": 1.7949024293110316, + "grad_norm": 0.408092007214423, + "learning_rate": 3.994151197012664e-06, + "loss": 0.5941, + "step": 6761 + }, + { + "epoch": 1.7951679277844153, + "grad_norm": 0.4061522536338927, + "learning_rate": 3.993871269377619e-06, + "loss": 0.5774, + "step": 6762 + }, + { + "epoch": 1.795433426257799, + "grad_norm": 0.410127219541861, + "learning_rate": 3.993591312608427e-06, + "loss": 0.5948, + "step": 6763 + }, + { + "epoch": 1.7956989247311828, + "grad_norm": 0.39740529421018345, + "learning_rate": 3.9933113267105465e-06, + "loss": 0.5614, + "step": 6764 + }, + { + "epoch": 1.7959644232045666, + "grad_norm": 0.41163835686945005, + "learning_rate": 3.993031311689439e-06, + "loss": 0.5817, + "step": 6765 + }, + { + "epoch": 1.7962299216779503, + "grad_norm": 0.4048328335509438, + "learning_rate": 3.992751267550567e-06, + "loss": 0.549, + "step": 6766 + }, + { + "epoch": 1.7964954201513341, + "grad_norm": 0.4157182148542579, + "learning_rate": 3.992471194299388e-06, + "loss": 0.5716, + "step": 6767 + }, + { + "epoch": 1.796760918624718, + "grad_norm": 0.43080367008851167, + "learning_rate": 3.9921910919413675e-06, + "loss": 0.5826, + "step": 6768 + }, + { + "epoch": 1.7970264170981016, + "grad_norm": 0.40936816547465427, + "learning_rate": 3.9919109604819675e-06, + "loss": 0.6329, + "step": 6769 + }, + { + "epoch": 1.7972919155714855, + "grad_norm": 0.41647967557843796, + "learning_rate": 3.991630799926651e-06, + "loss": 0.5814, + "step": 6770 + }, + { + "epoch": 1.7975574140448694, + "grad_norm": 0.415466295232353, + "learning_rate": 3.991350610280881e-06, + "loss": 0.5805, + "step": 6771 + }, + { + "epoch": 1.797822912518253, + "grad_norm": 0.4173964290795593, + "learning_rate": 3.991070391550124e-06, + "loss": 0.5932, + "step": 6772 + }, + { + "epoch": 1.7980884109916366, + "grad_norm": 0.3989217392871118, + "learning_rate": 3.990790143739842e-06, + "loss": 0.5892, + "step": 6773 + }, + { + "epoch": 1.7983539094650207, + "grad_norm": 0.42301929733052934, + "learning_rate": 3.9905098668555034e-06, + "loss": 0.5827, + "step": 6774 + }, + { + "epoch": 1.7986194079384044, + "grad_norm": 0.4186676923609016, + "learning_rate": 3.990229560902573e-06, + "loss": 0.5876, + "step": 6775 + }, + { + "epoch": 1.798884906411788, + "grad_norm": 0.40775126557563107, + "learning_rate": 3.989949225886516e-06, + "loss": 0.5896, + "step": 6776 + }, + { + "epoch": 1.7991504048851719, + "grad_norm": 0.4082324733188587, + "learning_rate": 3.989668861812802e-06, + "loss": 0.6246, + "step": 6777 + }, + { + "epoch": 1.7994159033585557, + "grad_norm": 0.40622031691397353, + "learning_rate": 3.989388468686898e-06, + "loss": 0.5561, + "step": 6778 + }, + { + "epoch": 1.7996814018319394, + "grad_norm": 0.4134371356650234, + "learning_rate": 3.989108046514272e-06, + "loss": 0.5888, + "step": 6779 + }, + { + "epoch": 1.7999469003053232, + "grad_norm": 0.41897793178465764, + "learning_rate": 3.988827595300394e-06, + "loss": 0.6024, + "step": 6780 + }, + { + "epoch": 1.800212398778707, + "grad_norm": 0.40808028101470173, + "learning_rate": 3.988547115050731e-06, + "loss": 0.6041, + "step": 6781 + }, + { + "epoch": 1.8004778972520907, + "grad_norm": 0.398651550143708, + "learning_rate": 3.988266605770756e-06, + "loss": 0.6135, + "step": 6782 + }, + { + "epoch": 1.8007433957254746, + "grad_norm": 0.4260135063692521, + "learning_rate": 3.987986067465938e-06, + "loss": 0.6182, + "step": 6783 + }, + { + "epoch": 1.8010088941988585, + "grad_norm": 0.4243534815522036, + "learning_rate": 3.9877055001417476e-06, + "loss": 0.5587, + "step": 6784 + }, + { + "epoch": 1.801274392672242, + "grad_norm": 0.4097221488500463, + "learning_rate": 3.9874249038036585e-06, + "loss": 0.5204, + "step": 6785 + }, + { + "epoch": 1.801539891145626, + "grad_norm": 0.39020504548859486, + "learning_rate": 3.987144278457141e-06, + "loss": 0.581, + "step": 6786 + }, + { + "epoch": 1.8018053896190098, + "grad_norm": 0.4252682499755787, + "learning_rate": 3.98686362410767e-06, + "loss": 0.6026, + "step": 6787 + }, + { + "epoch": 1.8020708880923935, + "grad_norm": 0.4149469807642028, + "learning_rate": 3.986582940760717e-06, + "loss": 0.5668, + "step": 6788 + }, + { + "epoch": 1.802336386565777, + "grad_norm": 0.4623670980457604, + "learning_rate": 3.9863022284217575e-06, + "loss": 0.5974, + "step": 6789 + }, + { + "epoch": 1.8026018850391612, + "grad_norm": 0.41451994504363865, + "learning_rate": 3.986021487096265e-06, + "loss": 0.571, + "step": 6790 + }, + { + "epoch": 1.8028673835125448, + "grad_norm": 0.422431533507994, + "learning_rate": 3.985740716789715e-06, + "loss": 0.5804, + "step": 6791 + }, + { + "epoch": 1.8031328819859285, + "grad_norm": 0.4829477816671888, + "learning_rate": 3.985459917507584e-06, + "loss": 0.571, + "step": 6792 + }, + { + "epoch": 1.8033983804593123, + "grad_norm": 0.4825420029996147, + "learning_rate": 3.985179089255347e-06, + "loss": 0.5519, + "step": 6793 + }, + { + "epoch": 1.8036638789326962, + "grad_norm": 0.4076926917949993, + "learning_rate": 3.984898232038481e-06, + "loss": 0.5867, + "step": 6794 + }, + { + "epoch": 1.8039293774060798, + "grad_norm": 0.43630892186683706, + "learning_rate": 3.984617345862465e-06, + "loss": 0.6022, + "step": 6795 + }, + { + "epoch": 1.8041948758794637, + "grad_norm": 0.42421521316490074, + "learning_rate": 3.9843364307327745e-06, + "loss": 0.567, + "step": 6796 + }, + { + "epoch": 1.8044603743528476, + "grad_norm": 0.41776301918404113, + "learning_rate": 3.98405548665489e-06, + "loss": 0.6336, + "step": 6797 + }, + { + "epoch": 1.8047258728262312, + "grad_norm": 0.413140844851692, + "learning_rate": 3.983774513634289e-06, + "loss": 0.5809, + "step": 6798 + }, + { + "epoch": 1.804991371299615, + "grad_norm": 0.41053898584161114, + "learning_rate": 3.983493511676453e-06, + "loss": 0.596, + "step": 6799 + }, + { + "epoch": 1.805256869772999, + "grad_norm": 0.40652542068363795, + "learning_rate": 3.983212480786861e-06, + "loss": 0.5409, + "step": 6800 + }, + { + "epoch": 1.8055223682463826, + "grad_norm": 0.4080604089336657, + "learning_rate": 3.982931420970995e-06, + "loss": 0.5672, + "step": 6801 + }, + { + "epoch": 1.8057878667197662, + "grad_norm": 0.4071026067531037, + "learning_rate": 3.982650332234335e-06, + "loss": 0.599, + "step": 6802 + }, + { + "epoch": 1.8060533651931503, + "grad_norm": 0.41925125987130984, + "learning_rate": 3.982369214582362e-06, + "loss": 0.5809, + "step": 6803 + }, + { + "epoch": 1.806318863666534, + "grad_norm": 0.40424297393236863, + "learning_rate": 3.982088068020562e-06, + "loss": 0.5716, + "step": 6804 + }, + { + "epoch": 1.8065843621399176, + "grad_norm": 0.41457434040207436, + "learning_rate": 3.981806892554414e-06, + "loss": 0.6056, + "step": 6805 + }, + { + "epoch": 1.8068498606133014, + "grad_norm": 0.42759855334938374, + "learning_rate": 3.981525688189405e-06, + "loss": 0.5938, + "step": 6806 + }, + { + "epoch": 1.8071153590866853, + "grad_norm": 0.41430977852499373, + "learning_rate": 3.981244454931017e-06, + "loss": 0.5638, + "step": 6807 + }, + { + "epoch": 1.807380857560069, + "grad_norm": 0.4059039155523686, + "learning_rate": 3.9809631927847345e-06, + "loss": 0.583, + "step": 6808 + }, + { + "epoch": 1.8076463560334528, + "grad_norm": 0.4081727818607107, + "learning_rate": 3.980681901756046e-06, + "loss": 0.6182, + "step": 6809 + }, + { + "epoch": 1.8079118545068367, + "grad_norm": 0.4160574014519053, + "learning_rate": 3.980400581850433e-06, + "loss": 0.5759, + "step": 6810 + }, + { + "epoch": 1.8081773529802203, + "grad_norm": 0.40529282691449536, + "learning_rate": 3.980119233073384e-06, + "loss": 0.5634, + "step": 6811 + }, + { + "epoch": 1.8084428514536042, + "grad_norm": 0.42052593207965144, + "learning_rate": 3.979837855430387e-06, + "loss": 0.5583, + "step": 6812 + }, + { + "epoch": 1.808708349926988, + "grad_norm": 0.40796553657244394, + "learning_rate": 3.979556448926929e-06, + "loss": 0.5914, + "step": 6813 + }, + { + "epoch": 1.8089738484003717, + "grad_norm": 0.4024568507645281, + "learning_rate": 3.979275013568496e-06, + "loss": 0.5751, + "step": 6814 + }, + { + "epoch": 1.8092393468737553, + "grad_norm": 0.41620334266469783, + "learning_rate": 3.97899354936058e-06, + "loss": 0.5919, + "step": 6815 + }, + { + "epoch": 1.8095048453471394, + "grad_norm": 0.41714351909591146, + "learning_rate": 3.9787120563086675e-06, + "loss": 0.5854, + "step": 6816 + }, + { + "epoch": 1.809770343820523, + "grad_norm": 0.4140405639599087, + "learning_rate": 3.97843053441825e-06, + "loss": 0.5868, + "step": 6817 + }, + { + "epoch": 1.8100358422939067, + "grad_norm": 0.4044607528187589, + "learning_rate": 3.978148983694817e-06, + "loss": 0.5555, + "step": 6818 + }, + { + "epoch": 1.8103013407672905, + "grad_norm": 0.4138167916681698, + "learning_rate": 3.9778674041438605e-06, + "loss": 0.5706, + "step": 6819 + }, + { + "epoch": 1.8105668392406744, + "grad_norm": 0.4139509457082379, + "learning_rate": 3.9775857957708695e-06, + "loss": 0.572, + "step": 6820 + }, + { + "epoch": 1.810832337714058, + "grad_norm": 0.3952554492369363, + "learning_rate": 3.977304158581339e-06, + "loss": 0.5733, + "step": 6821 + }, + { + "epoch": 1.811097836187442, + "grad_norm": 0.4216797146380888, + "learning_rate": 3.97702249258076e-06, + "loss": 0.6087, + "step": 6822 + }, + { + "epoch": 1.8113633346608258, + "grad_norm": 0.40768296117858194, + "learning_rate": 3.976740797774626e-06, + "loss": 0.5728, + "step": 6823 + }, + { + "epoch": 1.8116288331342094, + "grad_norm": 0.4063751869081823, + "learning_rate": 3.97645907416843e-06, + "loss": 0.554, + "step": 6824 + }, + { + "epoch": 1.8118943316075933, + "grad_norm": 0.4096184952112304, + "learning_rate": 3.976177321767668e-06, + "loss": 0.5935, + "step": 6825 + }, + { + "epoch": 1.8121598300809771, + "grad_norm": 0.42072672693211105, + "learning_rate": 3.975895540577835e-06, + "loss": 0.5523, + "step": 6826 + }, + { + "epoch": 1.8124253285543608, + "grad_norm": 0.4165263028415445, + "learning_rate": 3.975613730604423e-06, + "loss": 0.5696, + "step": 6827 + }, + { + "epoch": 1.8126908270277446, + "grad_norm": 0.4202931615903253, + "learning_rate": 3.975331891852931e-06, + "loss": 0.591, + "step": 6828 + }, + { + "epoch": 1.8129563255011285, + "grad_norm": 0.406871159146028, + "learning_rate": 3.975050024328856e-06, + "loss": 0.5602, + "step": 6829 + }, + { + "epoch": 1.8132218239745121, + "grad_norm": 0.4085316530523902, + "learning_rate": 3.974768128037693e-06, + "loss": 0.5709, + "step": 6830 + }, + { + "epoch": 1.8134873224478958, + "grad_norm": 0.42312446102361884, + "learning_rate": 3.9744862029849404e-06, + "loss": 0.5622, + "step": 6831 + }, + { + "epoch": 1.8137528209212797, + "grad_norm": 0.42682122154259966, + "learning_rate": 3.974204249176098e-06, + "loss": 0.6021, + "step": 6832 + }, + { + "epoch": 1.8140183193946635, + "grad_norm": 0.42775275627796105, + "learning_rate": 3.973922266616662e-06, + "loss": 0.6152, + "step": 6833 + }, + { + "epoch": 1.8142838178680472, + "grad_norm": 0.4107629138001381, + "learning_rate": 3.973640255312134e-06, + "loss": 0.5763, + "step": 6834 + }, + { + "epoch": 1.814549316341431, + "grad_norm": 0.4122845505727306, + "learning_rate": 3.973358215268013e-06, + "loss": 0.6115, + "step": 6835 + }, + { + "epoch": 1.8148148148148149, + "grad_norm": 0.41054725684345034, + "learning_rate": 3.973076146489798e-06, + "loss": 0.6355, + "step": 6836 + }, + { + "epoch": 1.8150803132881985, + "grad_norm": 0.4168830083094273, + "learning_rate": 3.972794048982992e-06, + "loss": 0.5602, + "step": 6837 + }, + { + "epoch": 1.8153458117615824, + "grad_norm": 0.40859993742626444, + "learning_rate": 3.9725119227530965e-06, + "loss": 0.5994, + "step": 6838 + }, + { + "epoch": 1.8156113102349662, + "grad_norm": 0.40494888425964376, + "learning_rate": 3.972229767805613e-06, + "loss": 0.5842, + "step": 6839 + }, + { + "epoch": 1.8158768087083499, + "grad_norm": 0.4060388916853308, + "learning_rate": 3.9719475841460444e-06, + "loss": 0.5736, + "step": 6840 + }, + { + "epoch": 1.8161423071817338, + "grad_norm": 0.39772669619632506, + "learning_rate": 3.971665371779894e-06, + "loss": 0.6278, + "step": 6841 + }, + { + "epoch": 1.8164078056551176, + "grad_norm": 0.40206305843109685, + "learning_rate": 3.971383130712665e-06, + "loss": 0.5825, + "step": 6842 + }, + { + "epoch": 1.8166733041285013, + "grad_norm": 0.41548644846716803, + "learning_rate": 3.971100860949863e-06, + "loss": 0.5862, + "step": 6843 + }, + { + "epoch": 1.816938802601885, + "grad_norm": 0.42362314828560854, + "learning_rate": 3.970818562496993e-06, + "loss": 0.5783, + "step": 6844 + }, + { + "epoch": 1.817204301075269, + "grad_norm": 0.4076203223985314, + "learning_rate": 3.9705362353595585e-06, + "loss": 0.5686, + "step": 6845 + }, + { + "epoch": 1.8174697995486526, + "grad_norm": 0.42422761470994286, + "learning_rate": 3.970253879543067e-06, + "loss": 0.59, + "step": 6846 + }, + { + "epoch": 1.8177352980220363, + "grad_norm": 0.41515227694722934, + "learning_rate": 3.969971495053026e-06, + "loss": 0.534, + "step": 6847 + }, + { + "epoch": 1.8180007964954201, + "grad_norm": 0.40005750625970493, + "learning_rate": 3.969689081894941e-06, + "loss": 0.5336, + "step": 6848 + }, + { + "epoch": 1.818266294968804, + "grad_norm": 0.42563449381958635, + "learning_rate": 3.96940664007432e-06, + "loss": 0.5893, + "step": 6849 + }, + { + "epoch": 1.8185317934421876, + "grad_norm": 0.40709190852716304, + "learning_rate": 3.9691241695966735e-06, + "loss": 0.5901, + "step": 6850 + }, + { + "epoch": 1.8187972919155715, + "grad_norm": 0.4208219527175216, + "learning_rate": 3.968841670467507e-06, + "loss": 0.603, + "step": 6851 + }, + { + "epoch": 1.8190627903889554, + "grad_norm": 0.42438763403004487, + "learning_rate": 3.968559142692333e-06, + "loss": 0.6115, + "step": 6852 + }, + { + "epoch": 1.819328288862339, + "grad_norm": 0.4444673240053476, + "learning_rate": 3.968276586276659e-06, + "loss": 0.5403, + "step": 6853 + }, + { + "epoch": 1.8195937873357229, + "grad_norm": 0.40843547394315566, + "learning_rate": 3.9679940012259965e-06, + "loss": 0.5718, + "step": 6854 + }, + { + "epoch": 1.8198592858091067, + "grad_norm": 0.40557208046900034, + "learning_rate": 3.967711387545858e-06, + "loss": 0.5759, + "step": 6855 + }, + { + "epoch": 1.8201247842824904, + "grad_norm": 0.43608630065980597, + "learning_rate": 3.967428745241752e-06, + "loss": 0.545, + "step": 6856 + }, + { + "epoch": 1.820390282755874, + "grad_norm": 0.4456599856711164, + "learning_rate": 3.967146074319194e-06, + "loss": 0.6156, + "step": 6857 + }, + { + "epoch": 1.820655781229258, + "grad_norm": 0.41807563761382394, + "learning_rate": 3.966863374783695e-06, + "loss": 0.5844, + "step": 6858 + }, + { + "epoch": 1.8209212797026417, + "grad_norm": 0.42270989477356485, + "learning_rate": 3.966580646640768e-06, + "loss": 0.5792, + "step": 6859 + }, + { + "epoch": 1.8211867781760254, + "grad_norm": 0.4205151943187361, + "learning_rate": 3.966297889895929e-06, + "loss": 0.5996, + "step": 6860 + }, + { + "epoch": 1.8214522766494092, + "grad_norm": 0.4435206697567981, + "learning_rate": 3.96601510455469e-06, + "loss": 0.6189, + "step": 6861 + }, + { + "epoch": 1.821717775122793, + "grad_norm": 0.42922459160521326, + "learning_rate": 3.965732290622567e-06, + "loss": 0.5643, + "step": 6862 + }, + { + "epoch": 1.8219832735961767, + "grad_norm": 0.4166631525914852, + "learning_rate": 3.965449448105076e-06, + "loss": 0.619, + "step": 6863 + }, + { + "epoch": 1.8222487720695606, + "grad_norm": 0.41910531231153036, + "learning_rate": 3.965166577007733e-06, + "loss": 0.618, + "step": 6864 + }, + { + "epoch": 1.8225142705429445, + "grad_norm": 0.416901957180022, + "learning_rate": 3.9648836773360535e-06, + "loss": 0.6001, + "step": 6865 + }, + { + "epoch": 1.822779769016328, + "grad_norm": 0.41027350467885454, + "learning_rate": 3.9646007490955565e-06, + "loss": 0.5964, + "step": 6866 + }, + { + "epoch": 1.823045267489712, + "grad_norm": 0.42278800924118876, + "learning_rate": 3.9643177922917584e-06, + "loss": 0.5577, + "step": 6867 + }, + { + "epoch": 1.8233107659630958, + "grad_norm": 0.4649399626440613, + "learning_rate": 3.9640348069301785e-06, + "loss": 0.5729, + "step": 6868 + }, + { + "epoch": 1.8235762644364795, + "grad_norm": 0.4193467019799003, + "learning_rate": 3.963751793016335e-06, + "loss": 0.5742, + "step": 6869 + }, + { + "epoch": 1.823841762909863, + "grad_norm": 0.4285518145735066, + "learning_rate": 3.963468750555748e-06, + "loss": 0.5665, + "step": 6870 + }, + { + "epoch": 1.8241072613832472, + "grad_norm": 0.4425197104083943, + "learning_rate": 3.963185679553936e-06, + "loss": 0.6098, + "step": 6871 + }, + { + "epoch": 1.8243727598566308, + "grad_norm": 0.44767889063167665, + "learning_rate": 3.962902580016422e-06, + "loss": 0.5552, + "step": 6872 + }, + { + "epoch": 1.8246382583300145, + "grad_norm": 0.4129172959137986, + "learning_rate": 3.962619451948726e-06, + "loss": 0.5912, + "step": 6873 + }, + { + "epoch": 1.8249037568033983, + "grad_norm": 0.4123136336702218, + "learning_rate": 3.962336295356368e-06, + "loss": 0.618, + "step": 6874 + }, + { + "epoch": 1.8251692552767822, + "grad_norm": 0.4021082096493839, + "learning_rate": 3.962053110244873e-06, + "loss": 0.574, + "step": 6875 + }, + { + "epoch": 1.8254347537501658, + "grad_norm": 0.4737960529429846, + "learning_rate": 3.961769896619763e-06, + "loss": 0.607, + "step": 6876 + }, + { + "epoch": 1.8257002522235497, + "grad_norm": 0.41459691231585427, + "learning_rate": 3.9614866544865605e-06, + "loss": 0.5959, + "step": 6877 + }, + { + "epoch": 1.8259657506969336, + "grad_norm": 0.41804812351626613, + "learning_rate": 3.96120338385079e-06, + "loss": 0.6043, + "step": 6878 + }, + { + "epoch": 1.8262312491703172, + "grad_norm": 0.411606232216746, + "learning_rate": 3.9609200847179765e-06, + "loss": 0.5671, + "step": 6879 + }, + { + "epoch": 1.826496747643701, + "grad_norm": 0.4741741936047053, + "learning_rate": 3.960636757093643e-06, + "loss": 0.6091, + "step": 6880 + }, + { + "epoch": 1.826762246117085, + "grad_norm": 0.4204405750069081, + "learning_rate": 3.960353400983317e-06, + "loss": 0.5885, + "step": 6881 + }, + { + "epoch": 1.8270277445904686, + "grad_norm": 0.4248732639820394, + "learning_rate": 3.9600700163925245e-06, + "loss": 0.5854, + "step": 6882 + }, + { + "epoch": 1.8272932430638524, + "grad_norm": 0.39879096356030924, + "learning_rate": 3.959786603326792e-06, + "loss": 0.583, + "step": 6883 + }, + { + "epoch": 1.8275587415372363, + "grad_norm": 0.4032168028135756, + "learning_rate": 3.959503161791645e-06, + "loss": 0.6018, + "step": 6884 + }, + { + "epoch": 1.82782424001062, + "grad_norm": 0.4428247410526083, + "learning_rate": 3.959219691792614e-06, + "loss": 0.6365, + "step": 6885 + }, + { + "epoch": 1.8280897384840036, + "grad_norm": 0.4302132972119312, + "learning_rate": 3.958936193335227e-06, + "loss": 0.565, + "step": 6886 + }, + { + "epoch": 1.8283552369573874, + "grad_norm": 0.42115244794776907, + "learning_rate": 3.958652666425011e-06, + "loss": 0.5825, + "step": 6887 + }, + { + "epoch": 1.8286207354307713, + "grad_norm": 0.3989890541183796, + "learning_rate": 3.958369111067497e-06, + "loss": 0.5671, + "step": 6888 + }, + { + "epoch": 1.828886233904155, + "grad_norm": 0.3989471951017054, + "learning_rate": 3.958085527268215e-06, + "loss": 0.566, + "step": 6889 + }, + { + "epoch": 1.8291517323775388, + "grad_norm": 0.431697655434134, + "learning_rate": 3.957801915032695e-06, + "loss": 0.5799, + "step": 6890 + }, + { + "epoch": 1.8294172308509227, + "grad_norm": 0.42087917328398955, + "learning_rate": 3.957518274366468e-06, + "loss": 0.5698, + "step": 6891 + }, + { + "epoch": 1.8296827293243063, + "grad_norm": 0.4078086518688895, + "learning_rate": 3.957234605275066e-06, + "loss": 0.5999, + "step": 6892 + }, + { + "epoch": 1.8299482277976902, + "grad_norm": 0.4143377134359836, + "learning_rate": 3.9569509077640215e-06, + "loss": 0.5829, + "step": 6893 + }, + { + "epoch": 1.830213726271074, + "grad_norm": 0.40664548226008695, + "learning_rate": 3.956667181838867e-06, + "loss": 0.602, + "step": 6894 + }, + { + "epoch": 1.8304792247444577, + "grad_norm": 0.4072543374160619, + "learning_rate": 3.956383427505136e-06, + "loss": 0.5962, + "step": 6895 + }, + { + "epoch": 1.8307447232178415, + "grad_norm": 0.4132660895429606, + "learning_rate": 3.956099644768363e-06, + "loss": 0.5885, + "step": 6896 + }, + { + "epoch": 1.8310102216912254, + "grad_norm": 0.4123343765840562, + "learning_rate": 3.955815833634081e-06, + "loss": 0.5859, + "step": 6897 + }, + { + "epoch": 1.831275720164609, + "grad_norm": 0.409588982080439, + "learning_rate": 3.955531994107825e-06, + "loss": 0.548, + "step": 6898 + }, + { + "epoch": 1.8315412186379927, + "grad_norm": 0.4056604959574261, + "learning_rate": 3.955248126195133e-06, + "loss": 0.5578, + "step": 6899 + }, + { + "epoch": 1.8318067171113768, + "grad_norm": 0.4120319183354142, + "learning_rate": 3.954964229901539e-06, + "loss": 0.5953, + "step": 6900 + }, + { + "epoch": 1.8320722155847604, + "grad_norm": 0.40359523613022386, + "learning_rate": 3.95468030523258e-06, + "loss": 0.575, + "step": 6901 + }, + { + "epoch": 1.832337714058144, + "grad_norm": 0.4114259200271428, + "learning_rate": 3.954396352193792e-06, + "loss": 0.6483, + "step": 6902 + }, + { + "epoch": 1.832603212531528, + "grad_norm": 0.41812609480393964, + "learning_rate": 3.954112370790716e-06, + "loss": 0.5516, + "step": 6903 + }, + { + "epoch": 1.8328687110049118, + "grad_norm": 0.42278747761979335, + "learning_rate": 3.953828361028888e-06, + "loss": 0.5683, + "step": 6904 + }, + { + "epoch": 1.8331342094782954, + "grad_norm": 0.4145092316195848, + "learning_rate": 3.953544322913847e-06, + "loss": 0.5647, + "step": 6905 + }, + { + "epoch": 1.8333997079516793, + "grad_norm": 0.4269310386880556, + "learning_rate": 3.953260256451133e-06, + "loss": 0.61, + "step": 6906 + }, + { + "epoch": 1.8336652064250631, + "grad_norm": 0.409618894668987, + "learning_rate": 3.952976161646286e-06, + "loss": 0.5804, + "step": 6907 + }, + { + "epoch": 1.8339307048984468, + "grad_norm": 0.41819619456935553, + "learning_rate": 3.9526920385048465e-06, + "loss": 0.6074, + "step": 6908 + }, + { + "epoch": 1.8341962033718306, + "grad_norm": 0.43048840819700895, + "learning_rate": 3.952407887032355e-06, + "loss": 0.6119, + "step": 6909 + }, + { + "epoch": 1.8344617018452145, + "grad_norm": 0.3930225028037828, + "learning_rate": 3.952123707234354e-06, + "loss": 0.5589, + "step": 6910 + }, + { + "epoch": 1.8347272003185982, + "grad_norm": 0.4069389233453311, + "learning_rate": 3.951839499116384e-06, + "loss": 0.5952, + "step": 6911 + }, + { + "epoch": 1.8349926987919818, + "grad_norm": 0.4134311064961768, + "learning_rate": 3.95155526268399e-06, + "loss": 0.6046, + "step": 6912 + }, + { + "epoch": 1.8352581972653659, + "grad_norm": 0.41619515966378, + "learning_rate": 3.951270997942716e-06, + "loss": 0.6382, + "step": 6913 + }, + { + "epoch": 1.8355236957387495, + "grad_norm": 0.40929604319969626, + "learning_rate": 3.950986704898102e-06, + "loss": 0.5571, + "step": 6914 + }, + { + "epoch": 1.8357891942121332, + "grad_norm": 0.3946239114268539, + "learning_rate": 3.950702383555694e-06, + "loss": 0.5509, + "step": 6915 + }, + { + "epoch": 1.836054692685517, + "grad_norm": 0.4005028180358469, + "learning_rate": 3.95041803392104e-06, + "loss": 0.567, + "step": 6916 + }, + { + "epoch": 1.8363201911589009, + "grad_norm": 0.414259523949154, + "learning_rate": 3.950133655999681e-06, + "loss": 0.6307, + "step": 6917 + }, + { + "epoch": 1.8365856896322845, + "grad_norm": 0.4139833930567431, + "learning_rate": 3.9498492497971665e-06, + "loss": 0.5957, + "step": 6918 + }, + { + "epoch": 1.8368511881056684, + "grad_norm": 0.3996265316737662, + "learning_rate": 3.949564815319043e-06, + "loss": 0.5743, + "step": 6919 + }, + { + "epoch": 1.8371166865790522, + "grad_norm": 0.4173874486594135, + "learning_rate": 3.949280352570854e-06, + "loss": 0.599, + "step": 6920 + }, + { + "epoch": 1.837382185052436, + "grad_norm": 0.4189102319089356, + "learning_rate": 3.948995861558151e-06, + "loss": 0.6028, + "step": 6921 + }, + { + "epoch": 1.8376476835258198, + "grad_norm": 0.40938282620613214, + "learning_rate": 3.94871134228648e-06, + "loss": 0.5929, + "step": 6922 + }, + { + "epoch": 1.8379131819992036, + "grad_norm": 0.40507414414913434, + "learning_rate": 3.948426794761392e-06, + "loss": 0.5866, + "step": 6923 + }, + { + "epoch": 1.8381786804725873, + "grad_norm": 0.4117766308302032, + "learning_rate": 3.948142218988434e-06, + "loss": 0.6118, + "step": 6924 + }, + { + "epoch": 1.838444178945971, + "grad_norm": 0.41251748518821607, + "learning_rate": 3.9478576149731575e-06, + "loss": 0.583, + "step": 6925 + }, + { + "epoch": 1.838709677419355, + "grad_norm": 0.4509170725551388, + "learning_rate": 3.947572982721112e-06, + "loss": 0.5789, + "step": 6926 + }, + { + "epoch": 1.8389751758927386, + "grad_norm": 0.4169512986291828, + "learning_rate": 3.94728832223785e-06, + "loss": 0.5827, + "step": 6927 + }, + { + "epoch": 1.8392406743661223, + "grad_norm": 0.41734339719976643, + "learning_rate": 3.947003633528922e-06, + "loss": 0.5764, + "step": 6928 + }, + { + "epoch": 1.8395061728395061, + "grad_norm": 0.42631442640618766, + "learning_rate": 3.94671891659988e-06, + "loss": 0.6186, + "step": 6929 + }, + { + "epoch": 1.83977167131289, + "grad_norm": 0.41159493904399747, + "learning_rate": 3.946434171456277e-06, + "loss": 0.585, + "step": 6930 + }, + { + "epoch": 1.8400371697862736, + "grad_norm": 0.4099996179284077, + "learning_rate": 3.946149398103666e-06, + "loss": 0.6312, + "step": 6931 + }, + { + "epoch": 1.8403026682596575, + "grad_norm": 0.41920342230074054, + "learning_rate": 3.9458645965476005e-06, + "loss": 0.5982, + "step": 6932 + }, + { + "epoch": 1.8405681667330414, + "grad_norm": 0.410247917285299, + "learning_rate": 3.9455797667936365e-06, + "loss": 0.5704, + "step": 6933 + }, + { + "epoch": 1.840833665206425, + "grad_norm": 0.4026126901284573, + "learning_rate": 3.945294908847327e-06, + "loss": 0.6081, + "step": 6934 + }, + { + "epoch": 1.8410991636798089, + "grad_norm": 0.4020790591491528, + "learning_rate": 3.945010022714228e-06, + "loss": 0.589, + "step": 6935 + }, + { + "epoch": 1.8413646621531927, + "grad_norm": 0.4174677658329594, + "learning_rate": 3.944725108399896e-06, + "loss": 0.5861, + "step": 6936 + }, + { + "epoch": 1.8416301606265764, + "grad_norm": 0.41613573831651435, + "learning_rate": 3.944440165909886e-06, + "loss": 0.6048, + "step": 6937 + }, + { + "epoch": 1.8418956590999602, + "grad_norm": 0.405590446337733, + "learning_rate": 3.944155195249757e-06, + "loss": 0.5784, + "step": 6938 + }, + { + "epoch": 1.842161157573344, + "grad_norm": 0.4173842754338806, + "learning_rate": 3.943870196425066e-06, + "loss": 0.5831, + "step": 6939 + }, + { + "epoch": 1.8424266560467277, + "grad_norm": 0.41833973656606505, + "learning_rate": 3.943585169441371e-06, + "loss": 0.5818, + "step": 6940 + }, + { + "epoch": 1.8426921545201114, + "grad_norm": 0.4166588182995895, + "learning_rate": 3.94330011430423e-06, + "loss": 0.5665, + "step": 6941 + }, + { + "epoch": 1.8429576529934952, + "grad_norm": 0.4066253100791406, + "learning_rate": 3.943015031019204e-06, + "loss": 0.6069, + "step": 6942 + }, + { + "epoch": 1.843223151466879, + "grad_norm": 0.4158627441513552, + "learning_rate": 3.942729919591851e-06, + "loss": 0.5742, + "step": 6943 + }, + { + "epoch": 1.8434886499402627, + "grad_norm": 0.41128856398162067, + "learning_rate": 3.942444780027732e-06, + "loss": 0.5581, + "step": 6944 + }, + { + "epoch": 1.8437541484136466, + "grad_norm": 0.4069034956182217, + "learning_rate": 3.942159612332408e-06, + "loss": 0.5637, + "step": 6945 + }, + { + "epoch": 1.8440196468870305, + "grad_norm": 0.415807308141061, + "learning_rate": 3.941874416511441e-06, + "loss": 0.6002, + "step": 6946 + }, + { + "epoch": 1.844285145360414, + "grad_norm": 0.4144794052635019, + "learning_rate": 3.941589192570392e-06, + "loss": 0.573, + "step": 6947 + }, + { + "epoch": 1.844550643833798, + "grad_norm": 0.4149018012203615, + "learning_rate": 3.941303940514826e-06, + "loss": 0.567, + "step": 6948 + }, + { + "epoch": 1.8448161423071818, + "grad_norm": 0.4095783353325972, + "learning_rate": 3.941018660350302e-06, + "loss": 0.5538, + "step": 6949 + }, + { + "epoch": 1.8450816407805655, + "grad_norm": 0.39799016740997856, + "learning_rate": 3.940733352082387e-06, + "loss": 0.5661, + "step": 6950 + }, + { + "epoch": 1.8453471392539493, + "grad_norm": 0.42955056674073594, + "learning_rate": 3.940448015716644e-06, + "loss": 0.5729, + "step": 6951 + }, + { + "epoch": 1.8456126377273332, + "grad_norm": 0.4163976796612784, + "learning_rate": 3.940162651258637e-06, + "loss": 0.5954, + "step": 6952 + }, + { + "epoch": 1.8458781362007168, + "grad_norm": 0.41032931712941006, + "learning_rate": 3.939877258713933e-06, + "loss": 0.5901, + "step": 6953 + }, + { + "epoch": 1.8461436346741005, + "grad_norm": 0.4090675884744846, + "learning_rate": 3.939591838088097e-06, + "loss": 0.5996, + "step": 6954 + }, + { + "epoch": 1.8464091331474846, + "grad_norm": 0.4142928798379635, + "learning_rate": 3.939306389386696e-06, + "loss": 0.5669, + "step": 6955 + }, + { + "epoch": 1.8466746316208682, + "grad_norm": 0.4213378709982677, + "learning_rate": 3.939020912615295e-06, + "loss": 0.5718, + "step": 6956 + }, + { + "epoch": 1.8469401300942518, + "grad_norm": 0.39484293134250065, + "learning_rate": 3.938735407779464e-06, + "loss": 0.5098, + "step": 6957 + }, + { + "epoch": 1.8472056285676357, + "grad_norm": 0.42292218393672387, + "learning_rate": 3.93844987488477e-06, + "loss": 0.6282, + "step": 6958 + }, + { + "epoch": 1.8474711270410196, + "grad_norm": 0.39724106047794144, + "learning_rate": 3.938164313936781e-06, + "loss": 0.5649, + "step": 6959 + }, + { + "epoch": 1.8477366255144032, + "grad_norm": 0.4189608291655136, + "learning_rate": 3.937878724941066e-06, + "loss": 0.5761, + "step": 6960 + }, + { + "epoch": 1.848002123987787, + "grad_norm": 0.41197466655443343, + "learning_rate": 3.937593107903196e-06, + "loss": 0.59, + "step": 6961 + }, + { + "epoch": 1.848267622461171, + "grad_norm": 0.41261599546037114, + "learning_rate": 3.93730746282874e-06, + "loss": 0.5965, + "step": 6962 + }, + { + "epoch": 1.8485331209345546, + "grad_norm": 0.4103523178610754, + "learning_rate": 3.93702178972327e-06, + "loss": 0.5908, + "step": 6963 + }, + { + "epoch": 1.8487986194079384, + "grad_norm": 0.4059384165149419, + "learning_rate": 3.936736088592357e-06, + "loss": 0.5788, + "step": 6964 + }, + { + "epoch": 1.8490641178813223, + "grad_norm": 0.4159782179573111, + "learning_rate": 3.936450359441572e-06, + "loss": 0.5652, + "step": 6965 + }, + { + "epoch": 1.849329616354706, + "grad_norm": 0.4133402380836053, + "learning_rate": 3.936164602276488e-06, + "loss": 0.5645, + "step": 6966 + }, + { + "epoch": 1.8495951148280896, + "grad_norm": 0.3940295484067086, + "learning_rate": 3.935878817102679e-06, + "loss": 0.5685, + "step": 6967 + }, + { + "epoch": 1.8498606133014737, + "grad_norm": 0.4082797448803069, + "learning_rate": 3.935593003925716e-06, + "loss": 0.5554, + "step": 6968 + }, + { + "epoch": 1.8501261117748573, + "grad_norm": 0.4012126929322681, + "learning_rate": 3.935307162751174e-06, + "loss": 0.5789, + "step": 6969 + }, + { + "epoch": 1.850391610248241, + "grad_norm": 0.40915072726009466, + "learning_rate": 3.935021293584629e-06, + "loss": 0.5589, + "step": 6970 + }, + { + "epoch": 1.8506571087216248, + "grad_norm": 0.409323474463736, + "learning_rate": 3.934735396431656e-06, + "loss": 0.612, + "step": 6971 + }, + { + "epoch": 1.8509226071950087, + "grad_norm": 0.40803397077971276, + "learning_rate": 3.934449471297829e-06, + "loss": 0.6003, + "step": 6972 + }, + { + "epoch": 1.8511881056683923, + "grad_norm": 0.4044914220403835, + "learning_rate": 3.934163518188725e-06, + "loss": 0.5878, + "step": 6973 + }, + { + "epoch": 1.8514536041417762, + "grad_norm": 0.41849556901391216, + "learning_rate": 3.933877537109921e-06, + "loss": 0.5893, + "step": 6974 + }, + { + "epoch": 1.85171910261516, + "grad_norm": 0.4121817583274806, + "learning_rate": 3.933591528066995e-06, + "loss": 0.5838, + "step": 6975 + }, + { + "epoch": 1.8519846010885437, + "grad_norm": 0.4245968786430536, + "learning_rate": 3.933305491065524e-06, + "loss": 0.6225, + "step": 6976 + }, + { + "epoch": 1.8522500995619275, + "grad_norm": 0.41767815062103897, + "learning_rate": 3.933019426111086e-06, + "loss": 0.5724, + "step": 6977 + }, + { + "epoch": 1.8525155980353114, + "grad_norm": 0.40692991333474743, + "learning_rate": 3.932733333209261e-06, + "loss": 0.5894, + "step": 6978 + }, + { + "epoch": 1.852781096508695, + "grad_norm": 0.4297095729670086, + "learning_rate": 3.932447212365627e-06, + "loss": 0.5968, + "step": 6979 + }, + { + "epoch": 1.8530465949820787, + "grad_norm": 0.4038910888247878, + "learning_rate": 3.9321610635857675e-06, + "loss": 0.5839, + "step": 6980 + }, + { + "epoch": 1.8533120934554628, + "grad_norm": 0.39807978095272545, + "learning_rate": 3.931874886875259e-06, + "loss": 0.551, + "step": 6981 + }, + { + "epoch": 1.8535775919288464, + "grad_norm": 0.4181460716959522, + "learning_rate": 3.931588682239684e-06, + "loss": 0.5938, + "step": 6982 + }, + { + "epoch": 1.85384309040223, + "grad_norm": 0.4072785093978776, + "learning_rate": 3.931302449684625e-06, + "loss": 0.5822, + "step": 6983 + }, + { + "epoch": 1.854108588875614, + "grad_norm": 0.41679217851508005, + "learning_rate": 3.931016189215664e-06, + "loss": 0.5978, + "step": 6984 + }, + { + "epoch": 1.8543740873489978, + "grad_norm": 0.4198293604984194, + "learning_rate": 3.930729900838384e-06, + "loss": 0.6019, + "step": 6985 + }, + { + "epoch": 1.8546395858223814, + "grad_norm": 0.39854310088267186, + "learning_rate": 3.930443584558368e-06, + "loss": 0.5692, + "step": 6986 + }, + { + "epoch": 1.8549050842957653, + "grad_norm": 0.41363365602146657, + "learning_rate": 3.930157240381199e-06, + "loss": 0.5915, + "step": 6987 + }, + { + "epoch": 1.8551705827691491, + "grad_norm": 0.40609414164861235, + "learning_rate": 3.929870868312463e-06, + "loss": 0.6193, + "step": 6988 + }, + { + "epoch": 1.8554360812425328, + "grad_norm": 0.4102358930830493, + "learning_rate": 3.929584468357743e-06, + "loss": 0.5886, + "step": 6989 + }, + { + "epoch": 1.8557015797159166, + "grad_norm": 0.4120011596357434, + "learning_rate": 3.929298040522626e-06, + "loss": 0.5691, + "step": 6990 + }, + { + "epoch": 1.8559670781893005, + "grad_norm": 0.39642712371795763, + "learning_rate": 3.929011584812698e-06, + "loss": 0.5878, + "step": 6991 + }, + { + "epoch": 1.8562325766626842, + "grad_norm": 0.4229512224512087, + "learning_rate": 3.928725101233544e-06, + "loss": 0.6005, + "step": 6992 + }, + { + "epoch": 1.856498075136068, + "grad_norm": 0.419682157187313, + "learning_rate": 3.928438589790755e-06, + "loss": 0.6292, + "step": 6993 + }, + { + "epoch": 1.8567635736094519, + "grad_norm": 0.4311370340813145, + "learning_rate": 3.928152050489915e-06, + "loss": 0.5986, + "step": 6994 + }, + { + "epoch": 1.8570290720828355, + "grad_norm": 0.4188002309867436, + "learning_rate": 3.927865483336613e-06, + "loss": 0.5549, + "step": 6995 + }, + { + "epoch": 1.8572945705562192, + "grad_norm": 0.40987962762826796, + "learning_rate": 3.927578888336437e-06, + "loss": 0.5774, + "step": 6996 + }, + { + "epoch": 1.8575600690296032, + "grad_norm": 0.4123701598687084, + "learning_rate": 3.927292265494979e-06, + "loss": 0.5967, + "step": 6997 + }, + { + "epoch": 1.8578255675029869, + "grad_norm": 0.43184928741569023, + "learning_rate": 3.9270056148178265e-06, + "loss": 0.5483, + "step": 6998 + }, + { + "epoch": 1.8580910659763705, + "grad_norm": 0.4177628607055168, + "learning_rate": 3.926718936310571e-06, + "loss": 0.5819, + "step": 6999 + }, + { + "epoch": 1.8583565644497544, + "grad_norm": 0.4137772822737616, + "learning_rate": 3.926432229978803e-06, + "loss": 0.5778, + "step": 7000 + }, + { + "epoch": 1.8586220629231383, + "grad_norm": 0.40507119077028475, + "learning_rate": 3.926145495828113e-06, + "loss": 0.5789, + "step": 7001 + }, + { + "epoch": 1.858887561396522, + "grad_norm": 0.4366827296791418, + "learning_rate": 3.925858733864095e-06, + "loss": 0.5826, + "step": 7002 + }, + { + "epoch": 1.8591530598699058, + "grad_norm": 0.4225870782102806, + "learning_rate": 3.92557194409234e-06, + "loss": 0.5446, + "step": 7003 + }, + { + "epoch": 1.8594185583432896, + "grad_norm": 0.409338830054313, + "learning_rate": 3.9252851265184426e-06, + "loss": 0.5718, + "step": 7004 + }, + { + "epoch": 1.8596840568166733, + "grad_norm": 0.4147793387598263, + "learning_rate": 3.924998281147995e-06, + "loss": 0.6116, + "step": 7005 + }, + { + "epoch": 1.8599495552900571, + "grad_norm": 0.4242447271527274, + "learning_rate": 3.924711407986591e-06, + "loss": 0.6009, + "step": 7006 + }, + { + "epoch": 1.860215053763441, + "grad_norm": 0.4243040590806271, + "learning_rate": 3.924424507039828e-06, + "loss": 0.5839, + "step": 7007 + }, + { + "epoch": 1.8604805522368246, + "grad_norm": 0.4083765144251573, + "learning_rate": 3.924137578313299e-06, + "loss": 0.5757, + "step": 7008 + }, + { + "epoch": 1.8607460507102083, + "grad_norm": 0.4344873741278483, + "learning_rate": 3.9238506218125995e-06, + "loss": 0.6167, + "step": 7009 + }, + { + "epoch": 1.8610115491835924, + "grad_norm": 0.42440515609996415, + "learning_rate": 3.923563637543327e-06, + "loss": 0.5723, + "step": 7010 + }, + { + "epoch": 1.861277047656976, + "grad_norm": 0.4117864765771132, + "learning_rate": 3.923276625511078e-06, + "loss": 0.5164, + "step": 7011 + }, + { + "epoch": 1.8615425461303596, + "grad_norm": 0.398080969531183, + "learning_rate": 3.92298958572145e-06, + "loss": 0.5698, + "step": 7012 + }, + { + "epoch": 1.8618080446037435, + "grad_norm": 0.4152261717406292, + "learning_rate": 3.922702518180042e-06, + "loss": 0.6084, + "step": 7013 + }, + { + "epoch": 1.8620735430771274, + "grad_norm": 0.41227070117414727, + "learning_rate": 3.92241542289245e-06, + "loss": 0.5941, + "step": 7014 + }, + { + "epoch": 1.862339041550511, + "grad_norm": 0.399566862205925, + "learning_rate": 3.922128299864276e-06, + "loss": 0.554, + "step": 7015 + }, + { + "epoch": 1.8626045400238949, + "grad_norm": 0.4023805576115783, + "learning_rate": 3.9218411491011175e-06, + "loss": 0.5715, + "step": 7016 + }, + { + "epoch": 1.8628700384972787, + "grad_norm": 0.427954927302662, + "learning_rate": 3.921553970608575e-06, + "loss": 0.5953, + "step": 7017 + }, + { + "epoch": 1.8631355369706624, + "grad_norm": 0.4072849853449334, + "learning_rate": 3.92126676439225e-06, + "loss": 0.5737, + "step": 7018 + }, + { + "epoch": 1.8634010354440462, + "grad_norm": 0.4117371358774136, + "learning_rate": 3.920979530457743e-06, + "loss": 0.5186, + "step": 7019 + }, + { + "epoch": 1.86366653391743, + "grad_norm": 0.41760085787178824, + "learning_rate": 3.920692268810656e-06, + "loss": 0.5994, + "step": 7020 + }, + { + "epoch": 1.8639320323908137, + "grad_norm": 0.43754121540634955, + "learning_rate": 3.9204049794565916e-06, + "loss": 0.5334, + "step": 7021 + }, + { + "epoch": 1.8641975308641974, + "grad_norm": 0.4066923846218359, + "learning_rate": 3.920117662401153e-06, + "loss": 0.6017, + "step": 7022 + }, + { + "epoch": 1.8644630293375815, + "grad_norm": 0.425835622217567, + "learning_rate": 3.919830317649942e-06, + "loss": 0.603, + "step": 7023 + }, + { + "epoch": 1.864728527810965, + "grad_norm": 0.41243683298308464, + "learning_rate": 3.919542945208563e-06, + "loss": 0.5594, + "step": 7024 + }, + { + "epoch": 1.8649940262843487, + "grad_norm": 0.4323968409259387, + "learning_rate": 3.919255545082622e-06, + "loss": 0.5988, + "step": 7025 + }, + { + "epoch": 1.8652595247577326, + "grad_norm": 0.41124068422260446, + "learning_rate": 3.918968117277722e-06, + "loss": 0.5965, + "step": 7026 + }, + { + "epoch": 1.8655250232311165, + "grad_norm": 0.41688159374831313, + "learning_rate": 3.91868066179947e-06, + "loss": 0.5406, + "step": 7027 + }, + { + "epoch": 1.8657905217045, + "grad_norm": 0.4205942850696052, + "learning_rate": 3.918393178653472e-06, + "loss": 0.6348, + "step": 7028 + }, + { + "epoch": 1.866056020177884, + "grad_norm": 0.42770247507352255, + "learning_rate": 3.918105667845333e-06, + "loss": 0.6007, + "step": 7029 + }, + { + "epoch": 1.8663215186512678, + "grad_norm": 0.40042531361537104, + "learning_rate": 3.9178181293806625e-06, + "loss": 0.566, + "step": 7030 + }, + { + "epoch": 1.8665870171246515, + "grad_norm": 0.42314818105008406, + "learning_rate": 3.9175305632650665e-06, + "loss": 0.5958, + "step": 7031 + }, + { + "epoch": 1.8668525155980353, + "grad_norm": 0.4151923153351884, + "learning_rate": 3.917242969504154e-06, + "loss": 0.5921, + "step": 7032 + }, + { + "epoch": 1.8671180140714192, + "grad_norm": 0.40717137058083086, + "learning_rate": 3.916955348103533e-06, + "loss": 0.5954, + "step": 7033 + }, + { + "epoch": 1.8673835125448028, + "grad_norm": 0.4010803302552354, + "learning_rate": 3.916667699068815e-06, + "loss": 0.6084, + "step": 7034 + }, + { + "epoch": 1.8676490110181867, + "grad_norm": 0.4177206524164597, + "learning_rate": 3.916380022405606e-06, + "loss": 0.6482, + "step": 7035 + }, + { + "epoch": 1.8679145094915706, + "grad_norm": 0.4064869796423884, + "learning_rate": 3.91609231811952e-06, + "loss": 0.5894, + "step": 7036 + }, + { + "epoch": 1.8681800079649542, + "grad_norm": 0.4117399834446077, + "learning_rate": 3.915804586216166e-06, + "loss": 0.5939, + "step": 7037 + }, + { + "epoch": 1.8684455064383378, + "grad_norm": 0.4038886545348688, + "learning_rate": 3.915516826701157e-06, + "loss": 0.5296, + "step": 7038 + }, + { + "epoch": 1.8687110049117217, + "grad_norm": 0.4128136495153341, + "learning_rate": 3.9152290395801034e-06, + "loss": 0.5759, + "step": 7039 + }, + { + "epoch": 1.8689765033851056, + "grad_norm": 0.40882796094160617, + "learning_rate": 3.914941224858619e-06, + "loss": 0.6188, + "step": 7040 + }, + { + "epoch": 1.8692420018584892, + "grad_norm": 0.4236903690793271, + "learning_rate": 3.914653382542316e-06, + "loss": 0.6061, + "step": 7041 + }, + { + "epoch": 1.869507500331873, + "grad_norm": 0.45522785024099915, + "learning_rate": 3.914365512636808e-06, + "loss": 0.5333, + "step": 7042 + }, + { + "epoch": 1.869772998805257, + "grad_norm": 0.40912051091098667, + "learning_rate": 3.91407761514771e-06, + "loss": 0.5738, + "step": 7043 + }, + { + "epoch": 1.8700384972786406, + "grad_norm": 0.42416505063028453, + "learning_rate": 3.913789690080636e-06, + "loss": 0.5897, + "step": 7044 + }, + { + "epoch": 1.8703039957520244, + "grad_norm": 0.45966633961184605, + "learning_rate": 3.913501737441202e-06, + "loss": 0.5864, + "step": 7045 + }, + { + "epoch": 1.8705694942254083, + "grad_norm": 0.46243260498498207, + "learning_rate": 3.913213757235023e-06, + "loss": 0.5317, + "step": 7046 + }, + { + "epoch": 1.870834992698792, + "grad_norm": 0.41045049137262035, + "learning_rate": 3.912925749467715e-06, + "loss": 0.584, + "step": 7047 + }, + { + "epoch": 1.8711004911721758, + "grad_norm": 0.4265956650495861, + "learning_rate": 3.912637714144897e-06, + "loss": 0.5848, + "step": 7048 + }, + { + "epoch": 1.8713659896455597, + "grad_norm": 0.42875699279050344, + "learning_rate": 3.9123496512721835e-06, + "loss": 0.5396, + "step": 7049 + }, + { + "epoch": 1.8716314881189433, + "grad_norm": 0.4485515056738511, + "learning_rate": 3.912061560855194e-06, + "loss": 0.5734, + "step": 7050 + }, + { + "epoch": 1.871896986592327, + "grad_norm": 0.3963442275472542, + "learning_rate": 3.911773442899547e-06, + "loss": 0.5685, + "step": 7051 + }, + { + "epoch": 1.872162485065711, + "grad_norm": 0.40815834074072116, + "learning_rate": 3.911485297410861e-06, + "loss": 0.5848, + "step": 7052 + }, + { + "epoch": 1.8724279835390947, + "grad_norm": 0.4177885606628742, + "learning_rate": 3.9111971243947555e-06, + "loss": 0.6002, + "step": 7053 + }, + { + "epoch": 1.8726934820124783, + "grad_norm": 0.3982488207452386, + "learning_rate": 3.910908923856851e-06, + "loss": 0.5551, + "step": 7054 + }, + { + "epoch": 1.8729589804858622, + "grad_norm": 0.4073962251072794, + "learning_rate": 3.910620695802768e-06, + "loss": 0.6004, + "step": 7055 + }, + { + "epoch": 1.873224478959246, + "grad_norm": 0.4427886926565442, + "learning_rate": 3.9103324402381285e-06, + "loss": 0.5414, + "step": 7056 + }, + { + "epoch": 1.8734899774326297, + "grad_norm": 0.41073338129047515, + "learning_rate": 3.910044157168552e-06, + "loss": 0.5758, + "step": 7057 + }, + { + "epoch": 1.8737554759060135, + "grad_norm": 0.41259637289069107, + "learning_rate": 3.909755846599663e-06, + "loss": 0.573, + "step": 7058 + }, + { + "epoch": 1.8740209743793974, + "grad_norm": 0.4290440138050512, + "learning_rate": 3.909467508537082e-06, + "loss": 0.5761, + "step": 7059 + }, + { + "epoch": 1.874286472852781, + "grad_norm": 0.4248638059398351, + "learning_rate": 3.909179142986436e-06, + "loss": 0.5467, + "step": 7060 + }, + { + "epoch": 1.874551971326165, + "grad_norm": 0.4014542453981169, + "learning_rate": 3.908890749953344e-06, + "loss": 0.5637, + "step": 7061 + }, + { + "epoch": 1.8748174697995488, + "grad_norm": 0.41432015219835344, + "learning_rate": 3.908602329443434e-06, + "loss": 0.5928, + "step": 7062 + }, + { + "epoch": 1.8750829682729324, + "grad_norm": 0.42709130461589084, + "learning_rate": 3.90831388146233e-06, + "loss": 0.6149, + "step": 7063 + }, + { + "epoch": 1.875348466746316, + "grad_norm": 0.41988608307090197, + "learning_rate": 3.908025406015656e-06, + "loss": 0.6249, + "step": 7064 + }, + { + "epoch": 1.8756139652197001, + "grad_norm": 0.41175358541659673, + "learning_rate": 3.907736903109041e-06, + "loss": 0.5573, + "step": 7065 + }, + { + "epoch": 1.8758794636930838, + "grad_norm": 0.43369499182599835, + "learning_rate": 3.907448372748109e-06, + "loss": 0.575, + "step": 7066 + }, + { + "epoch": 1.8761449621664674, + "grad_norm": 0.38989814639866516, + "learning_rate": 3.907159814938487e-06, + "loss": 0.568, + "step": 7067 + }, + { + "epoch": 1.8764104606398513, + "grad_norm": 0.42665951948656816, + "learning_rate": 3.906871229685803e-06, + "loss": 0.5758, + "step": 7068 + }, + { + "epoch": 1.8766759591132351, + "grad_norm": 0.3906605363540033, + "learning_rate": 3.906582616995686e-06, + "loss": 0.6332, + "step": 7069 + }, + { + "epoch": 1.8769414575866188, + "grad_norm": 0.4502266554381831, + "learning_rate": 3.906293976873764e-06, + "loss": 0.5759, + "step": 7070 + }, + { + "epoch": 1.8772069560600027, + "grad_norm": 0.4401907751667689, + "learning_rate": 3.9060053093256664e-06, + "loss": 0.5683, + "step": 7071 + }, + { + "epoch": 1.8774724545333865, + "grad_norm": 0.3887070679862068, + "learning_rate": 3.905716614357023e-06, + "loss": 0.5801, + "step": 7072 + }, + { + "epoch": 1.8777379530067702, + "grad_norm": 0.426074423699116, + "learning_rate": 3.905427891973463e-06, + "loss": 0.582, + "step": 7073 + }, + { + "epoch": 1.878003451480154, + "grad_norm": 0.4311525127479367, + "learning_rate": 3.905139142180619e-06, + "loss": 0.5939, + "step": 7074 + }, + { + "epoch": 1.8782689499535379, + "grad_norm": 0.4289219634781661, + "learning_rate": 3.904850364984121e-06, + "loss": 0.5718, + "step": 7075 + }, + { + "epoch": 1.8785344484269215, + "grad_norm": 0.41533920716406125, + "learning_rate": 3.904561560389601e-06, + "loss": 0.5811, + "step": 7076 + }, + { + "epoch": 1.8787999469003052, + "grad_norm": 0.41417032333409726, + "learning_rate": 3.9042727284026915e-06, + "loss": 0.5811, + "step": 7077 + }, + { + "epoch": 1.8790654453736892, + "grad_norm": 0.4119648391998343, + "learning_rate": 3.903983869029025e-06, + "loss": 0.5629, + "step": 7078 + }, + { + "epoch": 1.8793309438470729, + "grad_norm": 0.4338637584952911, + "learning_rate": 3.9036949822742364e-06, + "loss": 0.5871, + "step": 7079 + }, + { + "epoch": 1.8795964423204565, + "grad_norm": 0.4248311757657316, + "learning_rate": 3.903406068143959e-06, + "loss": 0.5396, + "step": 7080 + }, + { + "epoch": 1.8798619407938404, + "grad_norm": 0.4436117104678151, + "learning_rate": 3.903117126643828e-06, + "loss": 0.5837, + "step": 7081 + }, + { + "epoch": 1.8801274392672243, + "grad_norm": 0.40926219767764765, + "learning_rate": 3.902828157779475e-06, + "loss": 0.5778, + "step": 7082 + }, + { + "epoch": 1.880392937740608, + "grad_norm": 0.44262614321451976, + "learning_rate": 3.90253916155654e-06, + "loss": 0.5845, + "step": 7083 + }, + { + "epoch": 1.8806584362139918, + "grad_norm": 0.4208902684691188, + "learning_rate": 3.902250137980656e-06, + "loss": 0.5783, + "step": 7084 + }, + { + "epoch": 1.8809239346873756, + "grad_norm": 0.4314078678057738, + "learning_rate": 3.901961087057463e-06, + "loss": 0.5801, + "step": 7085 + }, + { + "epoch": 1.8811894331607593, + "grad_norm": 0.4070660062455029, + "learning_rate": 3.901672008792595e-06, + "loss": 0.5939, + "step": 7086 + }, + { + "epoch": 1.8814549316341431, + "grad_norm": 0.41904395539529654, + "learning_rate": 3.901382903191691e-06, + "loss": 0.5527, + "step": 7087 + }, + { + "epoch": 1.881720430107527, + "grad_norm": 0.41560623587894185, + "learning_rate": 3.901093770260389e-06, + "loss": 0.5851, + "step": 7088 + }, + { + "epoch": 1.8819859285809106, + "grad_norm": 0.4192905403772401, + "learning_rate": 3.900804610004329e-06, + "loss": 0.6059, + "step": 7089 + }, + { + "epoch": 1.8822514270542945, + "grad_norm": 0.43038459333408474, + "learning_rate": 3.900515422429149e-06, + "loss": 0.6114, + "step": 7090 + }, + { + "epoch": 1.8825169255276784, + "grad_norm": 0.41069500608507187, + "learning_rate": 3.900226207540489e-06, + "loss": 0.5919, + "step": 7091 + }, + { + "epoch": 1.882782424001062, + "grad_norm": 0.41503715019740184, + "learning_rate": 3.899936965343989e-06, + "loss": 0.6078, + "step": 7092 + }, + { + "epoch": 1.8830479224744456, + "grad_norm": 0.42023327605317484, + "learning_rate": 3.899647695845291e-06, + "loss": 0.5638, + "step": 7093 + }, + { + "epoch": 1.8833134209478295, + "grad_norm": 0.39592249521843, + "learning_rate": 3.899358399050036e-06, + "loss": 0.5636, + "step": 7094 + }, + { + "epoch": 1.8835789194212134, + "grad_norm": 0.41660178126431174, + "learning_rate": 3.899069074963866e-06, + "loss": 0.5839, + "step": 7095 + }, + { + "epoch": 1.883844417894597, + "grad_norm": 0.4080299981450624, + "learning_rate": 3.8987797235924225e-06, + "loss": 0.6028, + "step": 7096 + }, + { + "epoch": 1.8841099163679809, + "grad_norm": 0.40829228703046844, + "learning_rate": 3.898490344941352e-06, + "loss": 0.5658, + "step": 7097 + }, + { + "epoch": 1.8843754148413647, + "grad_norm": 0.3937678374315934, + "learning_rate": 3.898200939016295e-06, + "loss": 0.5685, + "step": 7098 + }, + { + "epoch": 1.8846409133147484, + "grad_norm": 0.42225679619736156, + "learning_rate": 3.897911505822894e-06, + "loss": 0.5641, + "step": 7099 + }, + { + "epoch": 1.8849064117881322, + "grad_norm": 0.4124490971575039, + "learning_rate": 3.897622045366798e-06, + "loss": 0.5664, + "step": 7100 + }, + { + "epoch": 1.885171910261516, + "grad_norm": 0.41383272080816186, + "learning_rate": 3.8973325576536504e-06, + "loss": 0.604, + "step": 7101 + }, + { + "epoch": 1.8854374087348997, + "grad_norm": 0.42296562095170526, + "learning_rate": 3.897043042689095e-06, + "loss": 0.5908, + "step": 7102 + }, + { + "epoch": 1.8857029072082836, + "grad_norm": 0.41033353709716786, + "learning_rate": 3.896753500478781e-06, + "loss": 0.5726, + "step": 7103 + }, + { + "epoch": 1.8859684056816675, + "grad_norm": 0.42002942515730557, + "learning_rate": 3.896463931028354e-06, + "loss": 0.6214, + "step": 7104 + }, + { + "epoch": 1.886233904155051, + "grad_norm": 0.40460757940480924, + "learning_rate": 3.896174334343461e-06, + "loss": 0.5417, + "step": 7105 + }, + { + "epoch": 1.8864994026284347, + "grad_norm": 0.4135518491071666, + "learning_rate": 3.89588471042975e-06, + "loss": 0.6152, + "step": 7106 + }, + { + "epoch": 1.8867649011018188, + "grad_norm": 0.4054098715824253, + "learning_rate": 3.89559505929287e-06, + "loss": 0.5941, + "step": 7107 + }, + { + "epoch": 1.8870303995752025, + "grad_norm": 0.4074179156026263, + "learning_rate": 3.895305380938468e-06, + "loss": 0.5917, + "step": 7108 + }, + { + "epoch": 1.887295898048586, + "grad_norm": 0.4141455513704588, + "learning_rate": 3.895015675372195e-06, + "loss": 0.576, + "step": 7109 + }, + { + "epoch": 1.88756139652197, + "grad_norm": 0.4056926170099754, + "learning_rate": 3.894725942599701e-06, + "loss": 0.5842, + "step": 7110 + }, + { + "epoch": 1.8878268949953538, + "grad_norm": 0.3904370757020439, + "learning_rate": 3.894436182626636e-06, + "loss": 0.5772, + "step": 7111 + }, + { + "epoch": 1.8880923934687375, + "grad_norm": 0.4068417976627062, + "learning_rate": 3.894146395458653e-06, + "loss": 0.5864, + "step": 7112 + }, + { + "epoch": 1.8883578919421213, + "grad_norm": 0.42834493965508247, + "learning_rate": 3.893856581101399e-06, + "loss": 0.5723, + "step": 7113 + }, + { + "epoch": 1.8886233904155052, + "grad_norm": 0.4209024790297688, + "learning_rate": 3.893566739560532e-06, + "loss": 0.5457, + "step": 7114 + }, + { + "epoch": 1.8888888888888888, + "grad_norm": 0.4108870116465058, + "learning_rate": 3.893276870841699e-06, + "loss": 0.5806, + "step": 7115 + }, + { + "epoch": 1.8891543873622727, + "grad_norm": 0.4253025282359474, + "learning_rate": 3.892986974950557e-06, + "loss": 0.5417, + "step": 7116 + }, + { + "epoch": 1.8894198858356566, + "grad_norm": 0.41170489180390835, + "learning_rate": 3.892697051892758e-06, + "loss": 0.5709, + "step": 7117 + }, + { + "epoch": 1.8896853843090402, + "grad_norm": 0.4286715874308777, + "learning_rate": 3.892407101673957e-06, + "loss": 0.6024, + "step": 7118 + }, + { + "epoch": 1.8899508827824238, + "grad_norm": 0.4213431871891988, + "learning_rate": 3.8921171242998076e-06, + "loss": 0.5818, + "step": 7119 + }, + { + "epoch": 1.890216381255808, + "grad_norm": 0.4134217887042566, + "learning_rate": 3.891827119775966e-06, + "loss": 0.6229, + "step": 7120 + }, + { + "epoch": 1.8904818797291916, + "grad_norm": 0.39975942266424874, + "learning_rate": 3.891537088108088e-06, + "loss": 0.5957, + "step": 7121 + }, + { + "epoch": 1.8907473782025752, + "grad_norm": 0.4100904851031382, + "learning_rate": 3.8912470293018295e-06, + "loss": 0.5524, + "step": 7122 + }, + { + "epoch": 1.891012876675959, + "grad_norm": 0.41251017499776654, + "learning_rate": 3.890956943362848e-06, + "loss": 0.5927, + "step": 7123 + }, + { + "epoch": 1.891278375149343, + "grad_norm": 0.4120074103450618, + "learning_rate": 3.8906668302967995e-06, + "loss": 0.5797, + "step": 7124 + }, + { + "epoch": 1.8915438736227266, + "grad_norm": 0.4217471409968703, + "learning_rate": 3.8903766901093435e-06, + "loss": 0.625, + "step": 7125 + }, + { + "epoch": 1.8918093720961104, + "grad_norm": 0.40144946257709435, + "learning_rate": 3.8900865228061385e-06, + "loss": 0.5918, + "step": 7126 + }, + { + "epoch": 1.8920748705694943, + "grad_norm": 0.41413698412971367, + "learning_rate": 3.889796328392842e-06, + "loss": 0.5882, + "step": 7127 + }, + { + "epoch": 1.892340369042878, + "grad_norm": 0.41863856537905497, + "learning_rate": 3.8895061068751135e-06, + "loss": 0.5854, + "step": 7128 + }, + { + "epoch": 1.8926058675162618, + "grad_norm": 0.40670889262518367, + "learning_rate": 3.889215858258616e-06, + "loss": 0.5806, + "step": 7129 + }, + { + "epoch": 1.8928713659896457, + "grad_norm": 0.4153909377778881, + "learning_rate": 3.888925582549006e-06, + "loss": 0.6093, + "step": 7130 + }, + { + "epoch": 1.8931368644630293, + "grad_norm": 0.4258020703053314, + "learning_rate": 3.888635279751947e-06, + "loss": 0.5642, + "step": 7131 + }, + { + "epoch": 1.893402362936413, + "grad_norm": 0.415145681811216, + "learning_rate": 3.8883449498731e-06, + "loss": 0.5801, + "step": 7132 + }, + { + "epoch": 1.893667861409797, + "grad_norm": 0.4192543394298992, + "learning_rate": 3.8880545929181275e-06, + "loss": 0.5722, + "step": 7133 + }, + { + "epoch": 1.8939333598831807, + "grad_norm": 0.4102842553446317, + "learning_rate": 3.8877642088926915e-06, + "loss": 0.5668, + "step": 7134 + }, + { + "epoch": 1.8941988583565643, + "grad_norm": 0.408315022426324, + "learning_rate": 3.887473797802456e-06, + "loss": 0.5662, + "step": 7135 + }, + { + "epoch": 1.8944643568299482, + "grad_norm": 0.4280394709291742, + "learning_rate": 3.887183359653084e-06, + "loss": 0.5617, + "step": 7136 + }, + { + "epoch": 1.894729855303332, + "grad_norm": 0.41449501383434817, + "learning_rate": 3.88689289445024e-06, + "loss": 0.6042, + "step": 7137 + }, + { + "epoch": 1.8949953537767157, + "grad_norm": 0.4119354111836354, + "learning_rate": 3.88660240219959e-06, + "loss": 0.6229, + "step": 7138 + }, + { + "epoch": 1.8952608522500995, + "grad_norm": 0.42889587467980234, + "learning_rate": 3.886311882906797e-06, + "loss": 0.5963, + "step": 7139 + }, + { + "epoch": 1.8955263507234834, + "grad_norm": 0.42957855699604774, + "learning_rate": 3.886021336577528e-06, + "loss": 0.6243, + "step": 7140 + }, + { + "epoch": 1.895791849196867, + "grad_norm": 0.3983587462236109, + "learning_rate": 3.88573076321745e-06, + "loss": 0.5632, + "step": 7141 + }, + { + "epoch": 1.896057347670251, + "grad_norm": 0.4088288548310979, + "learning_rate": 3.885440162832228e-06, + "loss": 0.5727, + "step": 7142 + }, + { + "epoch": 1.8963228461436348, + "grad_norm": 0.40718356625853985, + "learning_rate": 3.885149535427532e-06, + "loss": 0.5832, + "step": 7143 + }, + { + "epoch": 1.8965883446170184, + "grad_norm": 0.4079566258436637, + "learning_rate": 3.884858881009029e-06, + "loss": 0.5967, + "step": 7144 + }, + { + "epoch": 1.8968538430904023, + "grad_norm": 0.4313563491976267, + "learning_rate": 3.8845681995823855e-06, + "loss": 0.5713, + "step": 7145 + }, + { + "epoch": 1.8971193415637861, + "grad_norm": 0.40342349424624524, + "learning_rate": 3.884277491153273e-06, + "loss": 0.5959, + "step": 7146 + }, + { + "epoch": 1.8973848400371698, + "grad_norm": 0.4015402096251144, + "learning_rate": 3.883986755727361e-06, + "loss": 0.5433, + "step": 7147 + }, + { + "epoch": 1.8976503385105534, + "grad_norm": 0.39872304151407295, + "learning_rate": 3.883695993310317e-06, + "loss": 0.5821, + "step": 7148 + }, + { + "epoch": 1.8979158369839373, + "grad_norm": 0.40450619407227917, + "learning_rate": 3.883405203907814e-06, + "loss": 0.5778, + "step": 7149 + }, + { + "epoch": 1.8981813354573212, + "grad_norm": 0.4206019621409669, + "learning_rate": 3.883114387525522e-06, + "loss": 0.5926, + "step": 7150 + }, + { + "epoch": 1.8984468339307048, + "grad_norm": 0.4187351264285589, + "learning_rate": 3.882823544169114e-06, + "loss": 0.563, + "step": 7151 + }, + { + "epoch": 1.8987123324040887, + "grad_norm": 0.40080680699785154, + "learning_rate": 3.8825326738442605e-06, + "loss": 0.6069, + "step": 7152 + }, + { + "epoch": 1.8989778308774725, + "grad_norm": 0.41560257005499435, + "learning_rate": 3.882241776556634e-06, + "loss": 0.5726, + "step": 7153 + }, + { + "epoch": 1.8992433293508562, + "grad_norm": 0.43318930252869076, + "learning_rate": 3.88195085231191e-06, + "loss": 0.5534, + "step": 7154 + }, + { + "epoch": 1.89950882782424, + "grad_norm": 0.4056416871609871, + "learning_rate": 3.8816599011157595e-06, + "loss": 0.6022, + "step": 7155 + }, + { + "epoch": 1.8997743262976239, + "grad_norm": 0.40568965832114234, + "learning_rate": 3.8813689229738595e-06, + "loss": 0.5757, + "step": 7156 + }, + { + "epoch": 1.9000398247710075, + "grad_norm": 0.41616048594392585, + "learning_rate": 3.881077917891882e-06, + "loss": 0.5699, + "step": 7157 + }, + { + "epoch": 1.9003053232443914, + "grad_norm": 0.42909154285676004, + "learning_rate": 3.880786885875505e-06, + "loss": 0.5895, + "step": 7158 + }, + { + "epoch": 1.9005708217177752, + "grad_norm": 0.4225142793651282, + "learning_rate": 3.880495826930402e-06, + "loss": 0.5605, + "step": 7159 + }, + { + "epoch": 1.900836320191159, + "grad_norm": 0.4052258347579196, + "learning_rate": 3.88020474106225e-06, + "loss": 0.5623, + "step": 7160 + }, + { + "epoch": 1.9011018186645425, + "grad_norm": 0.416099027524383, + "learning_rate": 3.879913628276727e-06, + "loss": 0.5802, + "step": 7161 + }, + { + "epoch": 1.9013673171379266, + "grad_norm": 0.40948217753710037, + "learning_rate": 3.879622488579509e-06, + "loss": 0.5894, + "step": 7162 + }, + { + "epoch": 1.9016328156113103, + "grad_norm": 0.41299216749806356, + "learning_rate": 3.879331321976275e-06, + "loss": 0.6044, + "step": 7163 + }, + { + "epoch": 1.901898314084694, + "grad_norm": 0.40904011412817093, + "learning_rate": 3.879040128472703e-06, + "loss": 0.5801, + "step": 7164 + }, + { + "epoch": 1.9021638125580778, + "grad_norm": 0.4023578922308481, + "learning_rate": 3.878748908074472e-06, + "loss": 0.5735, + "step": 7165 + }, + { + "epoch": 1.9024293110314616, + "grad_norm": 0.43283246343719994, + "learning_rate": 3.878457660787262e-06, + "loss": 0.5849, + "step": 7166 + }, + { + "epoch": 1.9026948095048453, + "grad_norm": 0.40883413686724884, + "learning_rate": 3.878166386616752e-06, + "loss": 0.5444, + "step": 7167 + }, + { + "epoch": 1.9029603079782291, + "grad_norm": 0.40954773542159334, + "learning_rate": 3.8778750855686224e-06, + "loss": 0.5801, + "step": 7168 + }, + { + "epoch": 1.903225806451613, + "grad_norm": 0.4047815189510977, + "learning_rate": 3.877583757648556e-06, + "loss": 0.601, + "step": 7169 + }, + { + "epoch": 1.9034913049249966, + "grad_norm": 0.4209748280684184, + "learning_rate": 3.877292402862233e-06, + "loss": 0.5653, + "step": 7170 + }, + { + "epoch": 1.9037568033983805, + "grad_norm": 0.410220557831242, + "learning_rate": 3.877001021215337e-06, + "loss": 0.5674, + "step": 7171 + }, + { + "epoch": 1.9040223018717644, + "grad_norm": 0.41436491621168636, + "learning_rate": 3.876709612713548e-06, + "loss": 0.5737, + "step": 7172 + }, + { + "epoch": 1.904287800345148, + "grad_norm": 0.4252049888717145, + "learning_rate": 3.876418177362552e-06, + "loss": 0.61, + "step": 7173 + }, + { + "epoch": 1.9045532988185316, + "grad_norm": 0.40611264795670127, + "learning_rate": 3.876126715168031e-06, + "loss": 0.5285, + "step": 7174 + }, + { + "epoch": 1.9048187972919157, + "grad_norm": 0.40313354278754837, + "learning_rate": 3.87583522613567e-06, + "loss": 0.6079, + "step": 7175 + }, + { + "epoch": 1.9050842957652994, + "grad_norm": 0.41684222460139536, + "learning_rate": 3.875543710271153e-06, + "loss": 0.6068, + "step": 7176 + }, + { + "epoch": 1.905349794238683, + "grad_norm": 0.4214278278462318, + "learning_rate": 3.875252167580166e-06, + "loss": 0.5499, + "step": 7177 + }, + { + "epoch": 1.9056152927120669, + "grad_norm": 0.4101153577327562, + "learning_rate": 3.874960598068394e-06, + "loss": 0.5739, + "step": 7178 + }, + { + "epoch": 1.9058807911854507, + "grad_norm": 0.41426093026913, + "learning_rate": 3.874669001741525e-06, + "loss": 0.5946, + "step": 7179 + }, + { + "epoch": 1.9061462896588344, + "grad_norm": 0.40268257071900093, + "learning_rate": 3.8743773786052444e-06, + "loss": 0.5602, + "step": 7180 + }, + { + "epoch": 1.9064117881322182, + "grad_norm": 0.4120395510416962, + "learning_rate": 3.874085728665239e-06, + "loss": 0.5847, + "step": 7181 + }, + { + "epoch": 1.906677286605602, + "grad_norm": 0.41739352348945763, + "learning_rate": 3.8737940519271985e-06, + "loss": 0.5723, + "step": 7182 + }, + { + "epoch": 1.9069427850789857, + "grad_norm": 0.39973271480277534, + "learning_rate": 3.873502348396809e-06, + "loss": 0.5676, + "step": 7183 + }, + { + "epoch": 1.9072082835523696, + "grad_norm": 0.4089954045043844, + "learning_rate": 3.873210618079762e-06, + "loss": 0.5866, + "step": 7184 + }, + { + "epoch": 1.9074737820257535, + "grad_norm": 0.4219370075883623, + "learning_rate": 3.872918860981746e-06, + "loss": 0.605, + "step": 7185 + }, + { + "epoch": 1.907739280499137, + "grad_norm": 0.41240840288629904, + "learning_rate": 3.87262707710845e-06, + "loss": 0.557, + "step": 7186 + }, + { + "epoch": 1.9080047789725207, + "grad_norm": 0.3991788954103238, + "learning_rate": 3.872335266465566e-06, + "loss": 0.5306, + "step": 7187 + }, + { + "epoch": 1.9082702774459048, + "grad_norm": 0.4198281710401784, + "learning_rate": 3.872043429058783e-06, + "loss": 0.5764, + "step": 7188 + }, + { + "epoch": 1.9085357759192885, + "grad_norm": 0.40761366136593047, + "learning_rate": 3.871751564893794e-06, + "loss": 0.5876, + "step": 7189 + }, + { + "epoch": 1.908801274392672, + "grad_norm": 0.4151686069673462, + "learning_rate": 3.8714596739762926e-06, + "loss": 0.6116, + "step": 7190 + }, + { + "epoch": 1.909066772866056, + "grad_norm": 0.4120251930310793, + "learning_rate": 3.871167756311968e-06, + "loss": 0.5752, + "step": 7191 + }, + { + "epoch": 1.9093322713394398, + "grad_norm": 0.3938978993848605, + "learning_rate": 3.870875811906515e-06, + "loss": 0.5936, + "step": 7192 + }, + { + "epoch": 1.9095977698128235, + "grad_norm": 0.41884482446354926, + "learning_rate": 3.870583840765628e-06, + "loss": 0.5622, + "step": 7193 + }, + { + "epoch": 1.9098632682862073, + "grad_norm": 0.4100973745888629, + "learning_rate": 3.8702918428949994e-06, + "loss": 0.5504, + "step": 7194 + }, + { + "epoch": 1.9101287667595912, + "grad_norm": 0.4072821592512897, + "learning_rate": 3.869999818300326e-06, + "loss": 0.5643, + "step": 7195 + }, + { + "epoch": 1.9103942652329748, + "grad_norm": 0.41219489722838315, + "learning_rate": 3.869707766987301e-06, + "loss": 0.6151, + "step": 7196 + }, + { + "epoch": 1.9106597637063587, + "grad_norm": 0.417313775674746, + "learning_rate": 3.869415688961621e-06, + "loss": 0.5773, + "step": 7197 + }, + { + "epoch": 1.9109252621797426, + "grad_norm": 0.4041787787380582, + "learning_rate": 3.869123584228982e-06, + "loss": 0.5854, + "step": 7198 + }, + { + "epoch": 1.9111907606531262, + "grad_norm": 0.4236120671735632, + "learning_rate": 3.868831452795081e-06, + "loss": 0.5773, + "step": 7199 + }, + { + "epoch": 1.91145625912651, + "grad_norm": 0.40693164692222916, + "learning_rate": 3.8685392946656145e-06, + "loss": 0.6105, + "step": 7200 + }, + { + "epoch": 1.911721757599894, + "grad_norm": 0.40337635707669184, + "learning_rate": 3.868247109846282e-06, + "loss": 0.566, + "step": 7201 + }, + { + "epoch": 1.9119872560732776, + "grad_norm": 0.4172642275968336, + "learning_rate": 3.86795489834278e-06, + "loss": 0.5838, + "step": 7202 + }, + { + "epoch": 1.9122527545466612, + "grad_norm": 0.40433774663092875, + "learning_rate": 3.867662660160809e-06, + "loss": 0.5072, + "step": 7203 + }, + { + "epoch": 1.9125182530200453, + "grad_norm": 0.3943275096641571, + "learning_rate": 3.8673703953060685e-06, + "loss": 0.567, + "step": 7204 + }, + { + "epoch": 1.912783751493429, + "grad_norm": 0.41790873958201535, + "learning_rate": 3.867078103784256e-06, + "loss": 0.6085, + "step": 7205 + }, + { + "epoch": 1.9130492499668126, + "grad_norm": 0.4233498400202297, + "learning_rate": 3.866785785601073e-06, + "loss": 0.5965, + "step": 7206 + }, + { + "epoch": 1.9133147484401964, + "grad_norm": 0.4231518175194749, + "learning_rate": 3.8664934407622205e-06, + "loss": 0.5811, + "step": 7207 + }, + { + "epoch": 1.9135802469135803, + "grad_norm": 0.40647426726853086, + "learning_rate": 3.866201069273401e-06, + "loss": 0.6193, + "step": 7208 + }, + { + "epoch": 1.913845745386964, + "grad_norm": 0.4248919244281186, + "learning_rate": 3.865908671140315e-06, + "loss": 0.5679, + "step": 7209 + }, + { + "epoch": 1.9141112438603478, + "grad_norm": 0.4042962506070454, + "learning_rate": 3.865616246368667e-06, + "loss": 0.5449, + "step": 7210 + }, + { + "epoch": 1.9143767423337317, + "grad_norm": 0.40072582673144136, + "learning_rate": 3.865323794964156e-06, + "loss": 0.5574, + "step": 7211 + }, + { + "epoch": 1.9146422408071153, + "grad_norm": 0.4110386395783789, + "learning_rate": 3.86503131693249e-06, + "loss": 0.5768, + "step": 7212 + }, + { + "epoch": 1.9149077392804992, + "grad_norm": 0.4209434484598474, + "learning_rate": 3.864738812279372e-06, + "loss": 0.5626, + "step": 7213 + }, + { + "epoch": 1.915173237753883, + "grad_norm": 0.4026041665023483, + "learning_rate": 3.8644462810105035e-06, + "loss": 0.6029, + "step": 7214 + }, + { + "epoch": 1.9154387362272667, + "grad_norm": 0.41299039364937057, + "learning_rate": 3.8641537231315935e-06, + "loss": 0.6005, + "step": 7215 + }, + { + "epoch": 1.9157042347006503, + "grad_norm": 0.4173446802498935, + "learning_rate": 3.863861138648345e-06, + "loss": 0.6042, + "step": 7216 + }, + { + "epoch": 1.9159697331740344, + "grad_norm": 0.41590426949559445, + "learning_rate": 3.863568527566465e-06, + "loss": 0.5762, + "step": 7217 + }, + { + "epoch": 1.916235231647418, + "grad_norm": 0.4075294254854495, + "learning_rate": 3.863275889891661e-06, + "loss": 0.5298, + "step": 7218 + }, + { + "epoch": 1.9165007301208017, + "grad_norm": 0.4076307919559676, + "learning_rate": 3.862983225629638e-06, + "loss": 0.5942, + "step": 7219 + }, + { + "epoch": 1.9167662285941856, + "grad_norm": 0.41292162118299036, + "learning_rate": 3.862690534786105e-06, + "loss": 0.5463, + "step": 7220 + }, + { + "epoch": 1.9170317270675694, + "grad_norm": 0.40924475782259645, + "learning_rate": 3.862397817366771e-06, + "loss": 0.6026, + "step": 7221 + }, + { + "epoch": 1.917297225540953, + "grad_norm": 0.41077001876865915, + "learning_rate": 3.862105073377344e-06, + "loss": 0.5265, + "step": 7222 + }, + { + "epoch": 1.917562724014337, + "grad_norm": 0.4157999763905054, + "learning_rate": 3.861812302823532e-06, + "loss": 0.5912, + "step": 7223 + }, + { + "epoch": 1.9178282224877208, + "grad_norm": 0.41209908263462247, + "learning_rate": 3.861519505711047e-06, + "loss": 0.5868, + "step": 7224 + }, + { + "epoch": 1.9180937209611044, + "grad_norm": 0.41152275665315047, + "learning_rate": 3.861226682045597e-06, + "loss": 0.5714, + "step": 7225 + }, + { + "epoch": 1.9183592194344883, + "grad_norm": 0.39744687340693086, + "learning_rate": 3.860933831832894e-06, + "loss": 0.6113, + "step": 7226 + }, + { + "epoch": 1.9186247179078721, + "grad_norm": 0.4188903427788763, + "learning_rate": 3.860640955078649e-06, + "loss": 0.6169, + "step": 7227 + }, + { + "epoch": 1.9188902163812558, + "grad_norm": 0.4197760929248185, + "learning_rate": 3.860348051788575e-06, + "loss": 0.5944, + "step": 7228 + }, + { + "epoch": 1.9191557148546394, + "grad_norm": 0.4179022404322318, + "learning_rate": 3.860055121968382e-06, + "loss": 0.5663, + "step": 7229 + }, + { + "epoch": 1.9194212133280235, + "grad_norm": 0.4126394658987617, + "learning_rate": 3.859762165623785e-06, + "loss": 0.5862, + "step": 7230 + }, + { + "epoch": 1.9196867118014072, + "grad_norm": 0.4190873015660085, + "learning_rate": 3.859469182760496e-06, + "loss": 0.6027, + "step": 7231 + }, + { + "epoch": 1.9199522102747908, + "grad_norm": 0.40684845842720996, + "learning_rate": 3.8591761733842295e-06, + "loss": 0.5854, + "step": 7232 + }, + { + "epoch": 1.9202177087481747, + "grad_norm": 0.4156621535515605, + "learning_rate": 3.8588831375007e-06, + "loss": 0.5641, + "step": 7233 + }, + { + "epoch": 1.9204832072215585, + "grad_norm": 0.4109784466585704, + "learning_rate": 3.858590075115623e-06, + "loss": 0.5998, + "step": 7234 + }, + { + "epoch": 1.9207487056949422, + "grad_norm": 0.40599372124872196, + "learning_rate": 3.858296986234711e-06, + "loss": 0.5791, + "step": 7235 + }, + { + "epoch": 1.921014204168326, + "grad_norm": 0.42788123544363277, + "learning_rate": 3.858003870863684e-06, + "loss": 0.544, + "step": 7236 + }, + { + "epoch": 1.9212797026417099, + "grad_norm": 0.4058644815726423, + "learning_rate": 3.857710729008256e-06, + "loss": 0.5603, + "step": 7237 + }, + { + "epoch": 1.9215452011150935, + "grad_norm": 0.4137391214144147, + "learning_rate": 3.857417560674144e-06, + "loss": 0.5525, + "step": 7238 + }, + { + "epoch": 1.9218106995884774, + "grad_norm": 0.4048127906917372, + "learning_rate": 3.857124365867066e-06, + "loss": 0.5757, + "step": 7239 + }, + { + "epoch": 1.9220761980618613, + "grad_norm": 0.41206914099775677, + "learning_rate": 3.8568311445927405e-06, + "loss": 0.5959, + "step": 7240 + }, + { + "epoch": 1.922341696535245, + "grad_norm": 0.423977109176925, + "learning_rate": 3.856537896856887e-06, + "loss": 0.5684, + "step": 7241 + }, + { + "epoch": 1.9226071950086288, + "grad_norm": 0.4226704449740402, + "learning_rate": 3.8562446226652215e-06, + "loss": 0.5879, + "step": 7242 + }, + { + "epoch": 1.9228726934820126, + "grad_norm": 0.41120450032532635, + "learning_rate": 3.855951322023465e-06, + "loss": 0.5457, + "step": 7243 + }, + { + "epoch": 1.9231381919553963, + "grad_norm": 0.40977560259430057, + "learning_rate": 3.855657994937339e-06, + "loss": 0.5688, + "step": 7244 + }, + { + "epoch": 1.92340369042878, + "grad_norm": 0.40903560979960996, + "learning_rate": 3.855364641412561e-06, + "loss": 0.5836, + "step": 7245 + }, + { + "epoch": 1.9236691889021638, + "grad_norm": 0.4013008960528774, + "learning_rate": 3.855071261454855e-06, + "loss": 0.5365, + "step": 7246 + }, + { + "epoch": 1.9239346873755476, + "grad_norm": 0.4381337528387058, + "learning_rate": 3.854777855069942e-06, + "loss": 0.5856, + "step": 7247 + }, + { + "epoch": 1.9242001858489313, + "grad_norm": 0.41795327366659757, + "learning_rate": 3.854484422263544e-06, + "loss": 0.5938, + "step": 7248 + }, + { + "epoch": 1.9244656843223151, + "grad_norm": 0.4187167192709015, + "learning_rate": 3.8541909630413825e-06, + "loss": 0.5443, + "step": 7249 + }, + { + "epoch": 1.924731182795699, + "grad_norm": 0.40990239990344823, + "learning_rate": 3.853897477409182e-06, + "loss": 0.5993, + "step": 7250 + }, + { + "epoch": 1.9249966812690826, + "grad_norm": 0.41379049674742924, + "learning_rate": 3.853603965372665e-06, + "loss": 0.5921, + "step": 7251 + }, + { + "epoch": 1.9252621797424665, + "grad_norm": 0.40372316307430534, + "learning_rate": 3.853310426937557e-06, + "loss": 0.5946, + "step": 7252 + }, + { + "epoch": 1.9255276782158504, + "grad_norm": 0.4219761214784973, + "learning_rate": 3.853016862109583e-06, + "loss": 0.5674, + "step": 7253 + }, + { + "epoch": 1.925793176689234, + "grad_norm": 0.407804451793417, + "learning_rate": 3.852723270894467e-06, + "loss": 0.5489, + "step": 7254 + }, + { + "epoch": 1.9260586751626179, + "grad_norm": 0.44483804256399523, + "learning_rate": 3.852429653297934e-06, + "loss": 0.5722, + "step": 7255 + }, + { + "epoch": 1.9263241736360017, + "grad_norm": 0.41375184177786095, + "learning_rate": 3.852136009325713e-06, + "loss": 0.5655, + "step": 7256 + }, + { + "epoch": 1.9265896721093854, + "grad_norm": 0.40349311891037937, + "learning_rate": 3.851842338983529e-06, + "loss": 0.5718, + "step": 7257 + }, + { + "epoch": 1.926855170582769, + "grad_norm": 0.42041434968372976, + "learning_rate": 3.851548642277109e-06, + "loss": 0.5895, + "step": 7258 + }, + { + "epoch": 1.927120669056153, + "grad_norm": 0.4037138590739769, + "learning_rate": 3.851254919212182e-06, + "loss": 0.5639, + "step": 7259 + }, + { + "epoch": 1.9273861675295367, + "grad_norm": 0.39997450320468025, + "learning_rate": 3.850961169794475e-06, + "loss": 0.5952, + "step": 7260 + }, + { + "epoch": 1.9276516660029204, + "grad_norm": 0.4084183576950089, + "learning_rate": 3.850667394029718e-06, + "loss": 0.6001, + "step": 7261 + }, + { + "epoch": 1.9279171644763042, + "grad_norm": 0.41479993199213955, + "learning_rate": 3.850373591923639e-06, + "loss": 0.5794, + "step": 7262 + }, + { + "epoch": 1.928182662949688, + "grad_norm": 0.4210915276461565, + "learning_rate": 3.850079763481969e-06, + "loss": 0.6374, + "step": 7263 + }, + { + "epoch": 1.9284481614230717, + "grad_norm": 0.41585093653916605, + "learning_rate": 3.849785908710438e-06, + "loss": 0.6002, + "step": 7264 + }, + { + "epoch": 1.9287136598964556, + "grad_norm": 0.4031317207835994, + "learning_rate": 3.849492027614777e-06, + "loss": 0.5674, + "step": 7265 + }, + { + "epoch": 1.9289791583698395, + "grad_norm": 0.41265470691605544, + "learning_rate": 3.8491981202007175e-06, + "loss": 0.5441, + "step": 7266 + }, + { + "epoch": 1.929244656843223, + "grad_norm": 0.41596383701151046, + "learning_rate": 3.8489041864739914e-06, + "loss": 0.5913, + "step": 7267 + }, + { + "epoch": 1.929510155316607, + "grad_norm": 0.4041481327039089, + "learning_rate": 3.84861022644033e-06, + "loss": 0.5745, + "step": 7268 + }, + { + "epoch": 1.9297756537899908, + "grad_norm": 0.42350082351462753, + "learning_rate": 3.848316240105469e-06, + "loss": 0.6313, + "step": 7269 + }, + { + "epoch": 1.9300411522633745, + "grad_norm": 0.40789671091642965, + "learning_rate": 3.8480222274751386e-06, + "loss": 0.5952, + "step": 7270 + }, + { + "epoch": 1.9303066507367581, + "grad_norm": 0.41692011131455387, + "learning_rate": 3.847728188555074e-06, + "loss": 0.5473, + "step": 7271 + }, + { + "epoch": 1.9305721492101422, + "grad_norm": 0.4214585485488159, + "learning_rate": 3.847434123351011e-06, + "loss": 0.574, + "step": 7272 + }, + { + "epoch": 1.9308376476835258, + "grad_norm": 0.3976340395555284, + "learning_rate": 3.847140031868683e-06, + "loss": 0.5672, + "step": 7273 + }, + { + "epoch": 1.9311031461569095, + "grad_norm": 0.4136880234069284, + "learning_rate": 3.846845914113826e-06, + "loss": 0.5907, + "step": 7274 + }, + { + "epoch": 1.9313686446302933, + "grad_norm": 0.4202107596106995, + "learning_rate": 3.8465517700921766e-06, + "loss": 0.5998, + "step": 7275 + }, + { + "epoch": 1.9316341431036772, + "grad_norm": 0.42833834639822044, + "learning_rate": 3.8462575998094695e-06, + "loss": 0.5901, + "step": 7276 + }, + { + "epoch": 1.9318996415770608, + "grad_norm": 0.41506693799550215, + "learning_rate": 3.845963403271444e-06, + "loss": 0.5528, + "step": 7277 + }, + { + "epoch": 1.9321651400504447, + "grad_norm": 0.3990362379776796, + "learning_rate": 3.845669180483836e-06, + "loss": 0.577, + "step": 7278 + }, + { + "epoch": 1.9324306385238286, + "grad_norm": 0.4219362670948583, + "learning_rate": 3.845374931452385e-06, + "loss": 0.569, + "step": 7279 + }, + { + "epoch": 1.9326961369972122, + "grad_norm": 0.4143561297914913, + "learning_rate": 3.845080656182829e-06, + "loss": 0.5832, + "step": 7280 + }, + { + "epoch": 1.932961635470596, + "grad_norm": 0.40720635082135936, + "learning_rate": 3.844786354680906e-06, + "loss": 0.5841, + "step": 7281 + }, + { + "epoch": 1.93322713394398, + "grad_norm": 0.4026420910491919, + "learning_rate": 3.844492026952356e-06, + "loss": 0.6044, + "step": 7282 + }, + { + "epoch": 1.9334926324173636, + "grad_norm": 0.40863571946330857, + "learning_rate": 3.8441976730029205e-06, + "loss": 0.5766, + "step": 7283 + }, + { + "epoch": 1.9337581308907472, + "grad_norm": 0.432909068619264, + "learning_rate": 3.843903292838339e-06, + "loss": 0.5737, + "step": 7284 + }, + { + "epoch": 1.9340236293641313, + "grad_norm": 0.42447373542135586, + "learning_rate": 3.843608886464353e-06, + "loss": 0.5932, + "step": 7285 + }, + { + "epoch": 1.934289127837515, + "grad_norm": 0.405714743475015, + "learning_rate": 3.843314453886704e-06, + "loss": 0.5928, + "step": 7286 + }, + { + "epoch": 1.9345546263108986, + "grad_norm": 0.4230486993583029, + "learning_rate": 3.843019995111134e-06, + "loss": 0.534, + "step": 7287 + }, + { + "epoch": 1.9348201247842824, + "grad_norm": 0.41842226727641196, + "learning_rate": 3.842725510143387e-06, + "loss": 0.5724, + "step": 7288 + }, + { + "epoch": 1.9350856232576663, + "grad_norm": 0.431399790817969, + "learning_rate": 3.842430998989203e-06, + "loss": 0.5919, + "step": 7289 + }, + { + "epoch": 1.93535112173105, + "grad_norm": 0.42631657849719096, + "learning_rate": 3.842136461654329e-06, + "loss": 0.6032, + "step": 7290 + }, + { + "epoch": 1.9356166202044338, + "grad_norm": 0.39359174551393095, + "learning_rate": 3.841841898144507e-06, + "loss": 0.5687, + "step": 7291 + }, + { + "epoch": 1.9358821186778177, + "grad_norm": 0.42555020533302723, + "learning_rate": 3.841547308465483e-06, + "loss": 0.5823, + "step": 7292 + }, + { + "epoch": 1.9361476171512013, + "grad_norm": 0.41936295779697286, + "learning_rate": 3.841252692623002e-06, + "loss": 0.5861, + "step": 7293 + }, + { + "epoch": 1.9364131156245852, + "grad_norm": 0.39913802978986646, + "learning_rate": 3.840958050622809e-06, + "loss": 0.5512, + "step": 7294 + }, + { + "epoch": 1.936678614097969, + "grad_norm": 0.42064337016114994, + "learning_rate": 3.840663382470651e-06, + "loss": 0.5483, + "step": 7295 + }, + { + "epoch": 1.9369441125713527, + "grad_norm": 0.39942168559428537, + "learning_rate": 3.840368688172276e-06, + "loss": 0.5832, + "step": 7296 + }, + { + "epoch": 1.9372096110447365, + "grad_norm": 0.40085220384694215, + "learning_rate": 3.840073967733427e-06, + "loss": 0.5325, + "step": 7297 + }, + { + "epoch": 1.9374751095181204, + "grad_norm": 0.41087090139393423, + "learning_rate": 3.839779221159856e-06, + "loss": 0.5977, + "step": 7298 + }, + { + "epoch": 1.937740607991504, + "grad_norm": 0.41611621392507747, + "learning_rate": 3.839484448457311e-06, + "loss": 0.5487, + "step": 7299 + }, + { + "epoch": 1.9380061064648877, + "grad_norm": 0.40748647357914997, + "learning_rate": 3.839189649631538e-06, + "loss": 0.5579, + "step": 7300 + }, + { + "epoch": 1.9382716049382716, + "grad_norm": 0.4065733952357567, + "learning_rate": 3.8388948246882885e-06, + "loss": 0.5952, + "step": 7301 + }, + { + "epoch": 1.9385371034116554, + "grad_norm": 0.4162546430091261, + "learning_rate": 3.838599973633312e-06, + "loss": 0.5625, + "step": 7302 + }, + { + "epoch": 1.938802601885039, + "grad_norm": 0.4703268687495602, + "learning_rate": 3.838305096472358e-06, + "loss": 0.5804, + "step": 7303 + }, + { + "epoch": 1.939068100358423, + "grad_norm": 0.41770832090007876, + "learning_rate": 3.8380101932111775e-06, + "loss": 0.605, + "step": 7304 + }, + { + "epoch": 1.9393335988318068, + "grad_norm": 0.41438696227764454, + "learning_rate": 3.837715263855524e-06, + "loss": 0.5796, + "step": 7305 + }, + { + "epoch": 1.9395990973051904, + "grad_norm": 0.4578819497358745, + "learning_rate": 3.837420308411146e-06, + "loss": 0.5712, + "step": 7306 + }, + { + "epoch": 1.9398645957785743, + "grad_norm": 0.4286288086263282, + "learning_rate": 3.837125326883797e-06, + "loss": 0.5593, + "step": 7307 + }, + { + "epoch": 1.9401300942519581, + "grad_norm": 0.4055658793994388, + "learning_rate": 3.836830319279232e-06, + "loss": 0.5451, + "step": 7308 + }, + { + "epoch": 1.9403955927253418, + "grad_norm": 0.4236891770860224, + "learning_rate": 3.836535285603201e-06, + "loss": 0.6062, + "step": 7309 + }, + { + "epoch": 1.9406610911987257, + "grad_norm": 0.42610792745289283, + "learning_rate": 3.836240225861461e-06, + "loss": 0.5928, + "step": 7310 + }, + { + "epoch": 1.9409265896721095, + "grad_norm": 0.41706220234604163, + "learning_rate": 3.835945140059764e-06, + "loss": 0.6066, + "step": 7311 + }, + { + "epoch": 1.9411920881454932, + "grad_norm": 0.4757271152707851, + "learning_rate": 3.8356500282038655e-06, + "loss": 0.5494, + "step": 7312 + }, + { + "epoch": 1.9414575866188768, + "grad_norm": 0.41967861085295155, + "learning_rate": 3.835354890299522e-06, + "loss": 0.5801, + "step": 7313 + }, + { + "epoch": 1.9417230850922609, + "grad_norm": 0.4215218200591324, + "learning_rate": 3.835059726352488e-06, + "loss": 0.5693, + "step": 7314 + }, + { + "epoch": 1.9419885835656445, + "grad_norm": 0.4319910953381267, + "learning_rate": 3.834764536368521e-06, + "loss": 0.5452, + "step": 7315 + }, + { + "epoch": 1.9422540820390282, + "grad_norm": 0.4790875349315884, + "learning_rate": 3.834469320353378e-06, + "loss": 0.542, + "step": 7316 + }, + { + "epoch": 1.942519580512412, + "grad_norm": 0.418557472080714, + "learning_rate": 3.834174078312816e-06, + "loss": 0.6156, + "step": 7317 + }, + { + "epoch": 1.9427850789857959, + "grad_norm": 0.42487266979654265, + "learning_rate": 3.833878810252592e-06, + "loss": 0.5927, + "step": 7318 + }, + { + "epoch": 1.9430505774591795, + "grad_norm": 0.4671579322941817, + "learning_rate": 3.833583516178465e-06, + "loss": 0.5861, + "step": 7319 + }, + { + "epoch": 1.9433160759325634, + "grad_norm": 0.4521129657543881, + "learning_rate": 3.833288196096194e-06, + "loss": 0.5552, + "step": 7320 + }, + { + "epoch": 1.9435815744059473, + "grad_norm": 0.41080678976372376, + "learning_rate": 3.83299285001154e-06, + "loss": 0.595, + "step": 7321 + }, + { + "epoch": 1.943847072879331, + "grad_norm": 0.4057780120432856, + "learning_rate": 3.8326974779302605e-06, + "loss": 0.5358, + "step": 7322 + }, + { + "epoch": 1.9441125713527148, + "grad_norm": 0.42065625410479307, + "learning_rate": 3.8324020798581184e-06, + "loss": 0.5659, + "step": 7323 + }, + { + "epoch": 1.9443780698260986, + "grad_norm": 0.43485851420317584, + "learning_rate": 3.832106655800872e-06, + "loss": 0.5927, + "step": 7324 + }, + { + "epoch": 1.9446435682994823, + "grad_norm": 0.42274232802905415, + "learning_rate": 3.831811205764286e-06, + "loss": 0.5807, + "step": 7325 + }, + { + "epoch": 1.944909066772866, + "grad_norm": 0.42485315848677196, + "learning_rate": 3.83151572975412e-06, + "loss": 0.5638, + "step": 7326 + }, + { + "epoch": 1.94517456524625, + "grad_norm": 0.4208865440074488, + "learning_rate": 3.831220227776136e-06, + "loss": 0.5513, + "step": 7327 + }, + { + "epoch": 1.9454400637196336, + "grad_norm": 0.42121111915072895, + "learning_rate": 3.830924699836099e-06, + "loss": 0.5613, + "step": 7328 + }, + { + "epoch": 1.9457055621930173, + "grad_norm": 0.4239437252746114, + "learning_rate": 3.830629145939772e-06, + "loss": 0.5748, + "step": 7329 + }, + { + "epoch": 1.9459710606664011, + "grad_norm": 0.41232702374219565, + "learning_rate": 3.830333566092918e-06, + "loss": 0.5673, + "step": 7330 + }, + { + "epoch": 1.946236559139785, + "grad_norm": 0.42393449027238267, + "learning_rate": 3.830037960301303e-06, + "loss": 0.5655, + "step": 7331 + }, + { + "epoch": 1.9465020576131686, + "grad_norm": 0.4003471598463813, + "learning_rate": 3.829742328570691e-06, + "loss": 0.5824, + "step": 7332 + }, + { + "epoch": 1.9467675560865525, + "grad_norm": 0.42178886221355294, + "learning_rate": 3.829446670906848e-06, + "loss": 0.5879, + "step": 7333 + }, + { + "epoch": 1.9470330545599364, + "grad_norm": 0.4258184891800165, + "learning_rate": 3.82915098731554e-06, + "loss": 0.5587, + "step": 7334 + }, + { + "epoch": 1.94729855303332, + "grad_norm": 0.4323379824169179, + "learning_rate": 3.828855277802533e-06, + "loss": 0.539, + "step": 7335 + }, + { + "epoch": 1.9475640515067039, + "grad_norm": 0.408000934961471, + "learning_rate": 3.828559542373594e-06, + "loss": 0.579, + "step": 7336 + }, + { + "epoch": 1.9478295499800877, + "grad_norm": 0.3996728167826378, + "learning_rate": 3.828263781034492e-06, + "loss": 0.5707, + "step": 7337 + }, + { + "epoch": 1.9480950484534714, + "grad_norm": 0.4002980116914289, + "learning_rate": 3.827967993790994e-06, + "loss": 0.5923, + "step": 7338 + }, + { + "epoch": 1.948360546926855, + "grad_norm": 0.42657760748181905, + "learning_rate": 3.827672180648868e-06, + "loss": 0.5513, + "step": 7339 + }, + { + "epoch": 1.948626045400239, + "grad_norm": 0.42031230724856017, + "learning_rate": 3.827376341613884e-06, + "loss": 0.5797, + "step": 7340 + }, + { + "epoch": 1.9488915438736227, + "grad_norm": 0.41445161550057874, + "learning_rate": 3.8270804766918125e-06, + "loss": 0.6078, + "step": 7341 + }, + { + "epoch": 1.9491570423470064, + "grad_norm": 0.4178440241879826, + "learning_rate": 3.826784585888421e-06, + "loss": 0.6078, + "step": 7342 + }, + { + "epoch": 1.9494225408203902, + "grad_norm": 0.4132298969428845, + "learning_rate": 3.826488669209483e-06, + "loss": 0.6086, + "step": 7343 + }, + { + "epoch": 1.949688039293774, + "grad_norm": 0.40184854981401397, + "learning_rate": 3.826192726660767e-06, + "loss": 0.593, + "step": 7344 + }, + { + "epoch": 1.9499535377671577, + "grad_norm": 0.4267417598297509, + "learning_rate": 3.825896758248046e-06, + "loss": 0.6028, + "step": 7345 + }, + { + "epoch": 1.9502190362405416, + "grad_norm": 0.42069510722893005, + "learning_rate": 3.825600763977091e-06, + "loss": 0.6005, + "step": 7346 + }, + { + "epoch": 1.9504845347139255, + "grad_norm": 0.40830595833856426, + "learning_rate": 3.825304743853676e-06, + "loss": 0.5894, + "step": 7347 + }, + { + "epoch": 1.950750033187309, + "grad_norm": 0.4081436091207105, + "learning_rate": 3.825008697883574e-06, + "loss": 0.5825, + "step": 7348 + }, + { + "epoch": 1.951015531660693, + "grad_norm": 0.4236863647340996, + "learning_rate": 3.824712626072558e-06, + "loss": 0.5817, + "step": 7349 + }, + { + "epoch": 1.9512810301340768, + "grad_norm": 0.4310682385563616, + "learning_rate": 3.824416528426403e-06, + "loss": 0.5522, + "step": 7350 + }, + { + "epoch": 1.9515465286074605, + "grad_norm": 0.4084756942330639, + "learning_rate": 3.824120404950883e-06, + "loss": 0.5844, + "step": 7351 + }, + { + "epoch": 1.9518120270808443, + "grad_norm": 0.40926606038917146, + "learning_rate": 3.8238242556517725e-06, + "loss": 0.5794, + "step": 7352 + }, + { + "epoch": 1.9520775255542282, + "grad_norm": 0.42301127135326394, + "learning_rate": 3.823528080534847e-06, + "loss": 0.5991, + "step": 7353 + }, + { + "epoch": 1.9523430240276118, + "grad_norm": 0.4138429565378456, + "learning_rate": 3.823231879605885e-06, + "loss": 0.5498, + "step": 7354 + }, + { + "epoch": 1.9526085225009955, + "grad_norm": 0.39313562489739623, + "learning_rate": 3.8229356528706605e-06, + "loss": 0.575, + "step": 7355 + }, + { + "epoch": 1.9528740209743793, + "grad_norm": 0.41320416235125834, + "learning_rate": 3.822639400334952e-06, + "loss": 0.586, + "step": 7356 + }, + { + "epoch": 1.9531395194477632, + "grad_norm": 0.41092052923304684, + "learning_rate": 3.822343122004536e-06, + "loss": 0.5798, + "step": 7357 + }, + { + "epoch": 1.9534050179211468, + "grad_norm": 0.4064466354432723, + "learning_rate": 3.8220468178851925e-06, + "loss": 0.552, + "step": 7358 + }, + { + "epoch": 1.9536705163945307, + "grad_norm": 0.41978745107814436, + "learning_rate": 3.821750487982698e-06, + "loss": 0.5922, + "step": 7359 + }, + { + "epoch": 1.9539360148679146, + "grad_norm": 0.4120256704044728, + "learning_rate": 3.821454132302834e-06, + "loss": 0.5852, + "step": 7360 + }, + { + "epoch": 1.9542015133412982, + "grad_norm": 0.4190237629622631, + "learning_rate": 3.8211577508513784e-06, + "loss": 0.5776, + "step": 7361 + }, + { + "epoch": 1.954467011814682, + "grad_norm": 0.40117207222015244, + "learning_rate": 3.820861343634113e-06, + "loss": 0.574, + "step": 7362 + }, + { + "epoch": 1.954732510288066, + "grad_norm": 0.42058702842285467, + "learning_rate": 3.820564910656815e-06, + "loss": 0.5945, + "step": 7363 + }, + { + "epoch": 1.9549980087614496, + "grad_norm": 0.40265179966644293, + "learning_rate": 3.82026845192527e-06, + "loss": 0.5835, + "step": 7364 + }, + { + "epoch": 1.9552635072348334, + "grad_norm": 0.42302484410861746, + "learning_rate": 3.819971967445256e-06, + "loss": 0.5984, + "step": 7365 + }, + { + "epoch": 1.9555290057082173, + "grad_norm": 0.4035889090298571, + "learning_rate": 3.819675457222558e-06, + "loss": 0.5887, + "step": 7366 + }, + { + "epoch": 1.955794504181601, + "grad_norm": 0.4194479260135337, + "learning_rate": 3.8193789212629575e-06, + "loss": 0.6092, + "step": 7367 + }, + { + "epoch": 1.9560600026549846, + "grad_norm": 0.40413352930301033, + "learning_rate": 3.819082359572237e-06, + "loss": 0.595, + "step": 7368 + }, + { + "epoch": 1.9563255011283687, + "grad_norm": 0.4184555770271347, + "learning_rate": 3.818785772156181e-06, + "loss": 0.5598, + "step": 7369 + }, + { + "epoch": 1.9565909996017523, + "grad_norm": 0.4227090579697663, + "learning_rate": 3.818489159020573e-06, + "loss": 0.5609, + "step": 7370 + }, + { + "epoch": 1.956856498075136, + "grad_norm": 0.4088025144247253, + "learning_rate": 3.8181925201712e-06, + "loss": 0.5938, + "step": 7371 + }, + { + "epoch": 1.9571219965485198, + "grad_norm": 0.4180422561093033, + "learning_rate": 3.817895855613843e-06, + "loss": 0.6161, + "step": 7372 + }, + { + "epoch": 1.9573874950219037, + "grad_norm": 0.409885794078535, + "learning_rate": 3.817599165354292e-06, + "loss": 0.5921, + "step": 7373 + }, + { + "epoch": 1.9576529934952873, + "grad_norm": 0.4098468966114667, + "learning_rate": 3.8173024493983305e-06, + "loss": 0.5138, + "step": 7374 + }, + { + "epoch": 1.9579184919686712, + "grad_norm": 0.40772967420333917, + "learning_rate": 3.817005707751746e-06, + "loss": 0.619, + "step": 7375 + }, + { + "epoch": 1.958183990442055, + "grad_norm": 0.420969508034548, + "learning_rate": 3.816708940420324e-06, + "loss": 0.5709, + "step": 7376 + }, + { + "epoch": 1.9584494889154387, + "grad_norm": 0.41034849017729425, + "learning_rate": 3.816412147409856e-06, + "loss": 0.5908, + "step": 7377 + }, + { + "epoch": 1.9587149873888225, + "grad_norm": 0.407992208602232, + "learning_rate": 3.816115328726127e-06, + "loss": 0.596, + "step": 7378 + }, + { + "epoch": 1.9589804858622064, + "grad_norm": 0.4055234760779293, + "learning_rate": 3.815818484374927e-06, + "loss": 0.605, + "step": 7379 + }, + { + "epoch": 1.95924598433559, + "grad_norm": 0.4098993209244575, + "learning_rate": 3.815521614362046e-06, + "loss": 0.5998, + "step": 7380 + }, + { + "epoch": 1.9595114828089737, + "grad_norm": 0.4103240554784892, + "learning_rate": 3.81522471869327e-06, + "loss": 0.5734, + "step": 7381 + }, + { + "epoch": 1.9597769812823578, + "grad_norm": 0.40183648352397106, + "learning_rate": 3.814927797374393e-06, + "loss": 0.5776, + "step": 7382 + }, + { + "epoch": 1.9600424797557414, + "grad_norm": 0.4269050270802489, + "learning_rate": 3.814630850411205e-06, + "loss": 0.5676, + "step": 7383 + }, + { + "epoch": 1.960307978229125, + "grad_norm": 0.42774758345991337, + "learning_rate": 3.8143338778094963e-06, + "loss": 0.586, + "step": 7384 + }, + { + "epoch": 1.960573476702509, + "grad_norm": 0.4030192793021057, + "learning_rate": 3.8140368795750594e-06, + "loss": 0.5994, + "step": 7385 + }, + { + "epoch": 1.9608389751758928, + "grad_norm": 0.42557558089397585, + "learning_rate": 3.8137398557136852e-06, + "loss": 0.5457, + "step": 7386 + }, + { + "epoch": 1.9611044736492764, + "grad_norm": 0.4345490112943575, + "learning_rate": 3.813442806231167e-06, + "loss": 0.5549, + "step": 7387 + }, + { + "epoch": 1.9613699721226603, + "grad_norm": 0.419544798137635, + "learning_rate": 3.813145731133299e-06, + "loss": 0.5727, + "step": 7388 + }, + { + "epoch": 1.9616354705960442, + "grad_norm": 0.4180544259536902, + "learning_rate": 3.812848630425874e-06, + "loss": 0.5615, + "step": 7389 + }, + { + "epoch": 1.9619009690694278, + "grad_norm": 0.4114378365322946, + "learning_rate": 3.812551504114686e-06, + "loss": 0.5964, + "step": 7390 + }, + { + "epoch": 1.9621664675428117, + "grad_norm": 0.4224188280197434, + "learning_rate": 3.8122543522055303e-06, + "loss": 0.5948, + "step": 7391 + }, + { + "epoch": 1.9624319660161955, + "grad_norm": 0.42865940324872326, + "learning_rate": 3.811957174704201e-06, + "loss": 0.6207, + "step": 7392 + }, + { + "epoch": 1.9626974644895792, + "grad_norm": 0.43074459774381124, + "learning_rate": 3.8116599716164947e-06, + "loss": 0.5657, + "step": 7393 + }, + { + "epoch": 1.9629629629629628, + "grad_norm": 0.41221153172612696, + "learning_rate": 3.811362742948209e-06, + "loss": 0.5774, + "step": 7394 + }, + { + "epoch": 1.9632284614363469, + "grad_norm": 0.41844579762903084, + "learning_rate": 3.811065488705138e-06, + "loss": 0.5979, + "step": 7395 + }, + { + "epoch": 1.9634939599097305, + "grad_norm": 0.4212079361322282, + "learning_rate": 3.8107682088930797e-06, + "loss": 0.5965, + "step": 7396 + }, + { + "epoch": 1.9637594583831142, + "grad_norm": 0.42673139201545884, + "learning_rate": 3.810470903517832e-06, + "loss": 0.5969, + "step": 7397 + }, + { + "epoch": 1.964024956856498, + "grad_norm": 0.4032462664849555, + "learning_rate": 3.8101735725851937e-06, + "loss": 0.5742, + "step": 7398 + }, + { + "epoch": 1.964290455329882, + "grad_norm": 0.41851530448704083, + "learning_rate": 3.8098762161009624e-06, + "loss": 0.5887, + "step": 7399 + }, + { + "epoch": 1.9645559538032655, + "grad_norm": 0.40742839364011607, + "learning_rate": 3.809578834070938e-06, + "loss": 0.5805, + "step": 7400 + }, + { + "epoch": 1.9648214522766494, + "grad_norm": 0.4200450734704836, + "learning_rate": 3.809281426500919e-06, + "loss": 0.5806, + "step": 7401 + }, + { + "epoch": 1.9650869507500333, + "grad_norm": 0.3931166683697361, + "learning_rate": 3.8089839933967076e-06, + "loss": 0.5923, + "step": 7402 + }, + { + "epoch": 1.965352449223417, + "grad_norm": 0.4154707503645699, + "learning_rate": 3.8086865347641035e-06, + "loss": 0.5891, + "step": 7403 + }, + { + "epoch": 1.9656179476968008, + "grad_norm": 0.41784168985414244, + "learning_rate": 3.808389050608907e-06, + "loss": 0.5571, + "step": 7404 + }, + { + "epoch": 1.9658834461701846, + "grad_norm": 0.427872826689121, + "learning_rate": 3.8080915409369207e-06, + "loss": 0.6034, + "step": 7405 + }, + { + "epoch": 1.9661489446435683, + "grad_norm": 0.4186891550456353, + "learning_rate": 3.807794005753947e-06, + "loss": 0.5597, + "step": 7406 + }, + { + "epoch": 1.9664144431169521, + "grad_norm": 0.4142455121414843, + "learning_rate": 3.807496445065788e-06, + "loss": 0.5941, + "step": 7407 + }, + { + "epoch": 1.966679941590336, + "grad_norm": 0.416150868149289, + "learning_rate": 3.8071988588782466e-06, + "loss": 0.5948, + "step": 7408 + }, + { + "epoch": 1.9669454400637196, + "grad_norm": 0.43637830656131715, + "learning_rate": 3.8069012471971278e-06, + "loss": 0.6111, + "step": 7409 + }, + { + "epoch": 1.9672109385371033, + "grad_norm": 0.42120288693073155, + "learning_rate": 3.806603610028234e-06, + "loss": 0.5805, + "step": 7410 + }, + { + "epoch": 1.9674764370104874, + "grad_norm": 0.43376020874526666, + "learning_rate": 3.8063059473773705e-06, + "loss": 0.5339, + "step": 7411 + }, + { + "epoch": 1.967741935483871, + "grad_norm": 0.4124621309741732, + "learning_rate": 3.806008259250343e-06, + "loss": 0.5867, + "step": 7412 + }, + { + "epoch": 1.9680074339572546, + "grad_norm": 0.4206963355803583, + "learning_rate": 3.805710545652957e-06, + "loss": 0.5793, + "step": 7413 + }, + { + "epoch": 1.9682729324306385, + "grad_norm": 0.41916107887977166, + "learning_rate": 3.8054128065910186e-06, + "loss": 0.6174, + "step": 7414 + }, + { + "epoch": 1.9685384309040224, + "grad_norm": 0.424104433428084, + "learning_rate": 3.8051150420703337e-06, + "loss": 0.5376, + "step": 7415 + }, + { + "epoch": 1.968803929377406, + "grad_norm": 0.4225016672918098, + "learning_rate": 3.8048172520967103e-06, + "loss": 0.5778, + "step": 7416 + }, + { + "epoch": 1.9690694278507899, + "grad_norm": 0.41054173141752887, + "learning_rate": 3.804519436675956e-06, + "loss": 0.5925, + "step": 7417 + }, + { + "epoch": 1.9693349263241737, + "grad_norm": 0.41224055467123305, + "learning_rate": 3.8042215958138784e-06, + "loss": 0.6037, + "step": 7418 + }, + { + "epoch": 1.9696004247975574, + "grad_norm": 0.43703809178912356, + "learning_rate": 3.8039237295162858e-06, + "loss": 0.6011, + "step": 7419 + }, + { + "epoch": 1.9698659232709412, + "grad_norm": 0.43038227767228815, + "learning_rate": 3.8036258377889882e-06, + "loss": 0.6118, + "step": 7420 + }, + { + "epoch": 1.970131421744325, + "grad_norm": 0.4288096208914957, + "learning_rate": 3.8033279206377956e-06, + "loss": 0.5887, + "step": 7421 + }, + { + "epoch": 1.9703969202177087, + "grad_norm": 0.40989466331715, + "learning_rate": 3.8030299780685164e-06, + "loss": 0.5657, + "step": 7422 + }, + { + "epoch": 1.9706624186910924, + "grad_norm": 0.4177392528650334, + "learning_rate": 3.8027320100869635e-06, + "loss": 0.6003, + "step": 7423 + }, + { + "epoch": 1.9709279171644765, + "grad_norm": 0.431206722063452, + "learning_rate": 3.802434016698946e-06, + "loss": 0.5818, + "step": 7424 + }, + { + "epoch": 1.97119341563786, + "grad_norm": 0.4086312185451687, + "learning_rate": 3.8021359979102767e-06, + "loss": 0.5778, + "step": 7425 + }, + { + "epoch": 1.9714589141112437, + "grad_norm": 0.4211439415890832, + "learning_rate": 3.801837953726767e-06, + "loss": 0.562, + "step": 7426 + }, + { + "epoch": 1.9717244125846276, + "grad_norm": 0.4336611730411598, + "learning_rate": 3.8015398841542297e-06, + "loss": 0.5601, + "step": 7427 + }, + { + "epoch": 1.9719899110580115, + "grad_norm": 0.4125105807222685, + "learning_rate": 3.8012417891984776e-06, + "loss": 0.5843, + "step": 7428 + }, + { + "epoch": 1.972255409531395, + "grad_norm": 0.4246092297005021, + "learning_rate": 3.800943668865326e-06, + "loss": 0.571, + "step": 7429 + }, + { + "epoch": 1.972520908004779, + "grad_norm": 0.43149052929987475, + "learning_rate": 3.800645523160586e-06, + "loss": 0.5911, + "step": 7430 + }, + { + "epoch": 1.9727864064781628, + "grad_norm": 0.41525898191498506, + "learning_rate": 3.800347352090074e-06, + "loss": 0.5449, + "step": 7431 + }, + { + "epoch": 1.9730519049515465, + "grad_norm": 0.42474713314117263, + "learning_rate": 3.8000491556596055e-06, + "loss": 0.5774, + "step": 7432 + }, + { + "epoch": 1.9733174034249303, + "grad_norm": 0.4226163744300414, + "learning_rate": 3.799750933874995e-06, + "loss": 0.5947, + "step": 7433 + }, + { + "epoch": 1.9735829018983142, + "grad_norm": 0.4190942395315396, + "learning_rate": 3.79945268674206e-06, + "loss": 0.5871, + "step": 7434 + }, + { + "epoch": 1.9738484003716978, + "grad_norm": 0.4447372453983853, + "learning_rate": 3.7991544142666146e-06, + "loss": 0.5689, + "step": 7435 + }, + { + "epoch": 1.9741138988450815, + "grad_norm": 0.41663101946232156, + "learning_rate": 3.798856116454478e-06, + "loss": 0.613, + "step": 7436 + }, + { + "epoch": 1.9743793973184656, + "grad_norm": 0.42124953480960353, + "learning_rate": 3.798557793311466e-06, + "loss": 0.577, + "step": 7437 + }, + { + "epoch": 1.9746448957918492, + "grad_norm": 0.4125730878123955, + "learning_rate": 3.7982594448433983e-06, + "loss": 0.5628, + "step": 7438 + }, + { + "epoch": 1.9749103942652328, + "grad_norm": 0.43410623056739295, + "learning_rate": 3.797961071056093e-06, + "loss": 0.6187, + "step": 7439 + }, + { + "epoch": 1.9751758927386167, + "grad_norm": 0.461363510074443, + "learning_rate": 3.7976626719553688e-06, + "loss": 0.5689, + "step": 7440 + }, + { + "epoch": 1.9754413912120006, + "grad_norm": 0.4235374205365759, + "learning_rate": 3.797364247547045e-06, + "loss": 0.5703, + "step": 7441 + }, + { + "epoch": 1.9757068896853842, + "grad_norm": 0.4126239334919761, + "learning_rate": 3.7970657978369418e-06, + "loss": 0.5861, + "step": 7442 + }, + { + "epoch": 1.975972388158768, + "grad_norm": 0.4186589837459904, + "learning_rate": 3.79676732283088e-06, + "loss": 0.5426, + "step": 7443 + }, + { + "epoch": 1.976237886632152, + "grad_norm": 0.4823851203797616, + "learning_rate": 3.79646882253468e-06, + "loss": 0.5653, + "step": 7444 + }, + { + "epoch": 1.9765033851055356, + "grad_norm": 0.41591811961301756, + "learning_rate": 3.796170296954164e-06, + "loss": 0.5903, + "step": 7445 + }, + { + "epoch": 1.9767688835789194, + "grad_norm": 0.4081284254401549, + "learning_rate": 3.7958717460951546e-06, + "loss": 0.549, + "step": 7446 + }, + { + "epoch": 1.9770343820523033, + "grad_norm": 0.41894988927461274, + "learning_rate": 3.7955731699634717e-06, + "loss": 0.6042, + "step": 7447 + }, + { + "epoch": 1.977299880525687, + "grad_norm": 0.4370634235233121, + "learning_rate": 3.79527456856494e-06, + "loss": 0.5551, + "step": 7448 + }, + { + "epoch": 1.9775653789990706, + "grad_norm": 0.4120978249653195, + "learning_rate": 3.7949759419053833e-06, + "loss": 0.5542, + "step": 7449 + }, + { + "epoch": 1.9778308774724547, + "grad_norm": 0.41561231759868, + "learning_rate": 3.7946772899906253e-06, + "loss": 0.6164, + "step": 7450 + }, + { + "epoch": 1.9780963759458383, + "grad_norm": 0.42482719222816673, + "learning_rate": 3.7943786128264902e-06, + "loss": 0.5818, + "step": 7451 + }, + { + "epoch": 1.978361874419222, + "grad_norm": 0.4144044880639814, + "learning_rate": 3.794079910418803e-06, + "loss": 0.6004, + "step": 7452 + }, + { + "epoch": 1.9786273728926058, + "grad_norm": 0.4226922735449433, + "learning_rate": 3.7937811827733882e-06, + "loss": 0.5761, + "step": 7453 + }, + { + "epoch": 1.9788928713659897, + "grad_norm": 0.4112301463198234, + "learning_rate": 3.793482429896074e-06, + "loss": 0.5876, + "step": 7454 + }, + { + "epoch": 1.9791583698393733, + "grad_norm": 0.4325794906865588, + "learning_rate": 3.7931836517926844e-06, + "loss": 0.6001, + "step": 7455 + }, + { + "epoch": 1.9794238683127572, + "grad_norm": 0.41626570417621117, + "learning_rate": 3.7928848484690477e-06, + "loss": 0.5428, + "step": 7456 + }, + { + "epoch": 1.979689366786141, + "grad_norm": 0.3996050822708342, + "learning_rate": 3.792586019930991e-06, + "loss": 0.5544, + "step": 7457 + }, + { + "epoch": 1.9799548652595247, + "grad_norm": 0.4215100294319676, + "learning_rate": 3.792287166184342e-06, + "loss": 0.5889, + "step": 7458 + }, + { + "epoch": 1.9802203637329086, + "grad_norm": 0.4110296134987471, + "learning_rate": 3.791988287234929e-06, + "loss": 0.6003, + "step": 7459 + }, + { + "epoch": 1.9804858622062924, + "grad_norm": 0.4076559851840653, + "learning_rate": 3.7916893830885816e-06, + "loss": 0.5866, + "step": 7460 + }, + { + "epoch": 1.980751360679676, + "grad_norm": 0.41650298212304376, + "learning_rate": 3.7913904537511283e-06, + "loss": 0.5948, + "step": 7461 + }, + { + "epoch": 1.98101685915306, + "grad_norm": 0.41599462841395546, + "learning_rate": 3.791091499228399e-06, + "loss": 0.5718, + "step": 7462 + }, + { + "epoch": 1.9812823576264438, + "grad_norm": 0.41144312088330043, + "learning_rate": 3.790792519526225e-06, + "loss": 0.5425, + "step": 7463 + }, + { + "epoch": 1.9815478560998274, + "grad_norm": 0.39046794691059317, + "learning_rate": 3.790493514650436e-06, + "loss": 0.5973, + "step": 7464 + }, + { + "epoch": 1.981813354573211, + "grad_norm": 0.4153129533702801, + "learning_rate": 3.790194484606864e-06, + "loss": 0.5623, + "step": 7465 + }, + { + "epoch": 1.9820788530465951, + "grad_norm": 0.39437436556230643, + "learning_rate": 3.78989542940134e-06, + "loss": 0.534, + "step": 7466 + }, + { + "epoch": 1.9823443515199788, + "grad_norm": 0.4131985715877372, + "learning_rate": 3.789596349039697e-06, + "loss": 0.5939, + "step": 7467 + }, + { + "epoch": 1.9826098499933624, + "grad_norm": 0.40769899343581045, + "learning_rate": 3.7892972435277686e-06, + "loss": 0.5719, + "step": 7468 + }, + { + "epoch": 1.9828753484667463, + "grad_norm": 0.4191367188228909, + "learning_rate": 3.7889981128713873e-06, + "loss": 0.5787, + "step": 7469 + }, + { + "epoch": 1.9831408469401302, + "grad_norm": 0.4086534301464835, + "learning_rate": 3.7886989570763856e-06, + "loss": 0.5825, + "step": 7470 + }, + { + "epoch": 1.9834063454135138, + "grad_norm": 0.4079483021682168, + "learning_rate": 3.7883997761485993e-06, + "loss": 0.5523, + "step": 7471 + }, + { + "epoch": 1.9836718438868977, + "grad_norm": 0.4221876143467679, + "learning_rate": 3.7881005700938635e-06, + "loss": 0.5938, + "step": 7472 + }, + { + "epoch": 1.9839373423602815, + "grad_norm": 0.4106825970474645, + "learning_rate": 3.7878013389180126e-06, + "loss": 0.5667, + "step": 7473 + }, + { + "epoch": 1.9842028408336652, + "grad_norm": 0.4304947278749791, + "learning_rate": 3.7875020826268818e-06, + "loss": 0.5774, + "step": 7474 + }, + { + "epoch": 1.984468339307049, + "grad_norm": 0.41587302972158063, + "learning_rate": 3.7872028012263084e-06, + "loss": 0.5689, + "step": 7475 + }, + { + "epoch": 1.9847338377804329, + "grad_norm": 0.43202715931612407, + "learning_rate": 3.7869034947221285e-06, + "loss": 0.5928, + "step": 7476 + }, + { + "epoch": 1.9849993362538165, + "grad_norm": 0.4194965374504465, + "learning_rate": 3.78660416312018e-06, + "loss": 0.55, + "step": 7477 + }, + { + "epoch": 1.9852648347272002, + "grad_norm": 0.4310260489747963, + "learning_rate": 3.786304806426301e-06, + "loss": 0.6205, + "step": 7478 + }, + { + "epoch": 1.9855303332005843, + "grad_norm": 0.4281765102576451, + "learning_rate": 3.7860054246463272e-06, + "loss": 0.5639, + "step": 7479 + }, + { + "epoch": 1.985795831673968, + "grad_norm": 0.40540803995042807, + "learning_rate": 3.7857060177860992e-06, + "loss": 0.5753, + "step": 7480 + }, + { + "epoch": 1.9860613301473515, + "grad_norm": 0.40309097399726107, + "learning_rate": 3.7854065858514573e-06, + "loss": 0.6094, + "step": 7481 + }, + { + "epoch": 1.9863268286207354, + "grad_norm": 0.40909241825909437, + "learning_rate": 3.7851071288482387e-06, + "loss": 0.5524, + "step": 7482 + }, + { + "epoch": 1.9865923270941193, + "grad_norm": 0.4100476787226209, + "learning_rate": 3.7848076467822846e-06, + "loss": 0.5629, + "step": 7483 + }, + { + "epoch": 1.986857825567503, + "grad_norm": 0.4431898317902111, + "learning_rate": 3.7845081396594354e-06, + "loss": 0.5379, + "step": 7484 + }, + { + "epoch": 1.9871233240408868, + "grad_norm": 0.4052830763386127, + "learning_rate": 3.7842086074855334e-06, + "loss": 0.5875, + "step": 7485 + }, + { + "epoch": 1.9873888225142706, + "grad_norm": 0.4146115790317509, + "learning_rate": 3.783909050266419e-06, + "loss": 0.5928, + "step": 7486 + }, + { + "epoch": 1.9876543209876543, + "grad_norm": 0.41350662762554063, + "learning_rate": 3.783609468007934e-06, + "loss": 0.55, + "step": 7487 + }, + { + "epoch": 1.9879198194610381, + "grad_norm": 0.415554981790555, + "learning_rate": 3.783309860715921e-06, + "loss": 0.5588, + "step": 7488 + }, + { + "epoch": 1.988185317934422, + "grad_norm": 0.43987157320306064, + "learning_rate": 3.7830102283962254e-06, + "loss": 0.5456, + "step": 7489 + }, + { + "epoch": 1.9884508164078056, + "grad_norm": 0.41114900283976163, + "learning_rate": 3.782710571054688e-06, + "loss": 0.6099, + "step": 7490 + }, + { + "epoch": 1.9887163148811893, + "grad_norm": 0.40792923356523525, + "learning_rate": 3.7824108886971535e-06, + "loss": 0.5866, + "step": 7491 + }, + { + "epoch": 1.9889818133545734, + "grad_norm": 0.417019160180324, + "learning_rate": 3.7821111813294674e-06, + "loss": 0.57, + "step": 7492 + }, + { + "epoch": 1.989247311827957, + "grad_norm": 0.4273576445563474, + "learning_rate": 3.7818114489574737e-06, + "loss": 0.6266, + "step": 7493 + }, + { + "epoch": 1.9895128103013406, + "grad_norm": 0.43704009096819274, + "learning_rate": 3.7815116915870187e-06, + "loss": 0.5903, + "step": 7494 + }, + { + "epoch": 1.9897783087747245, + "grad_norm": 0.4329871314136986, + "learning_rate": 3.781211909223948e-06, + "loss": 0.6059, + "step": 7495 + }, + { + "epoch": 1.9900438072481084, + "grad_norm": 0.4230724535294457, + "learning_rate": 3.7809121018741078e-06, + "loss": 0.5653, + "step": 7496 + }, + { + "epoch": 1.990309305721492, + "grad_norm": 0.41918402761396256, + "learning_rate": 3.7806122695433456e-06, + "loss": 0.5826, + "step": 7497 + }, + { + "epoch": 1.9905748041948759, + "grad_norm": 0.41636714488334237, + "learning_rate": 3.780312412237509e-06, + "loss": 0.5638, + "step": 7498 + }, + { + "epoch": 1.9908403026682597, + "grad_norm": 0.4156181713627736, + "learning_rate": 3.7800125299624444e-06, + "loss": 0.583, + "step": 7499 + }, + { + "epoch": 1.9911058011416434, + "grad_norm": 0.40901610155378626, + "learning_rate": 3.779712622724003e-06, + "loss": 0.5923, + "step": 7500 + }, + { + "epoch": 1.9913712996150272, + "grad_norm": 0.4420018060181688, + "learning_rate": 3.7794126905280317e-06, + "loss": 0.5948, + "step": 7501 + }, + { + "epoch": 1.991636798088411, + "grad_norm": 0.41691576555377097, + "learning_rate": 3.77911273338038e-06, + "loss": 0.6296, + "step": 7502 + }, + { + "epoch": 1.9919022965617947, + "grad_norm": 0.4292098675997834, + "learning_rate": 3.7788127512868984e-06, + "loss": 0.5879, + "step": 7503 + }, + { + "epoch": 1.9921677950351786, + "grad_norm": 0.4052871928629502, + "learning_rate": 3.778512744253437e-06, + "loss": 0.5857, + "step": 7504 + }, + { + "epoch": 1.9924332935085625, + "grad_norm": 0.3994916515356325, + "learning_rate": 3.778212712285847e-06, + "loss": 0.5584, + "step": 7505 + }, + { + "epoch": 1.992698791981946, + "grad_norm": 0.41964470575903884, + "learning_rate": 3.777912655389979e-06, + "loss": 0.5491, + "step": 7506 + }, + { + "epoch": 1.9929642904553297, + "grad_norm": 0.40849686894282194, + "learning_rate": 3.777612573571686e-06, + "loss": 0.5834, + "step": 7507 + }, + { + "epoch": 1.9932297889287136, + "grad_norm": 0.4076866839137105, + "learning_rate": 3.777312466836819e-06, + "loss": 0.6027, + "step": 7508 + }, + { + "epoch": 1.9934952874020975, + "grad_norm": 0.4012289472764235, + "learning_rate": 3.7770123351912325e-06, + "loss": 0.5425, + "step": 7509 + }, + { + "epoch": 1.9937607858754811, + "grad_norm": 0.42824139054135735, + "learning_rate": 3.776712178640778e-06, + "loss": 0.5956, + "step": 7510 + }, + { + "epoch": 1.994026284348865, + "grad_norm": 0.4058150445018642, + "learning_rate": 3.7764119971913098e-06, + "loss": 0.5885, + "step": 7511 + }, + { + "epoch": 1.9942917828222488, + "grad_norm": 0.42464102247810515, + "learning_rate": 3.7761117908486833e-06, + "loss": 0.5425, + "step": 7512 + }, + { + "epoch": 1.9945572812956325, + "grad_norm": 0.4153732021015508, + "learning_rate": 3.7758115596187517e-06, + "loss": 0.5942, + "step": 7513 + }, + { + "epoch": 1.9948227797690163, + "grad_norm": 0.42507348054777094, + "learning_rate": 3.7755113035073707e-06, + "loss": 0.5615, + "step": 7514 + }, + { + "epoch": 1.9950882782424002, + "grad_norm": 0.41011630486263034, + "learning_rate": 3.7752110225203974e-06, + "loss": 0.575, + "step": 7515 + }, + { + "epoch": 1.9953537767157838, + "grad_norm": 0.452965704832396, + "learning_rate": 3.7749107166636867e-06, + "loss": 0.5906, + "step": 7516 + }, + { + "epoch": 1.9956192751891677, + "grad_norm": 0.41711257779944655, + "learning_rate": 3.774610385943095e-06, + "loss": 0.5813, + "step": 7517 + }, + { + "epoch": 1.9958847736625516, + "grad_norm": 0.4103512349127069, + "learning_rate": 3.77431003036448e-06, + "loss": 0.5871, + "step": 7518 + }, + { + "epoch": 1.9961502721359352, + "grad_norm": 0.40903660395809843, + "learning_rate": 3.7740096499336996e-06, + "loss": 0.5533, + "step": 7519 + }, + { + "epoch": 1.9964157706093189, + "grad_norm": 0.39756787945628164, + "learning_rate": 3.7737092446566115e-06, + "loss": 0.5727, + "step": 7520 + }, + { + "epoch": 1.996681269082703, + "grad_norm": 0.4148081666930947, + "learning_rate": 3.773408814539075e-06, + "loss": 0.5934, + "step": 7521 + }, + { + "epoch": 1.9969467675560866, + "grad_norm": 0.4042675946497851, + "learning_rate": 3.773108359586948e-06, + "loss": 0.595, + "step": 7522 + }, + { + "epoch": 1.9972122660294702, + "grad_norm": 0.42682277902814, + "learning_rate": 3.772807879806091e-06, + "loss": 0.5483, + "step": 7523 + }, + { + "epoch": 1.997477764502854, + "grad_norm": 0.394793331758505, + "learning_rate": 3.772507375202365e-06, + "loss": 0.5506, + "step": 7524 + }, + { + "epoch": 1.997743262976238, + "grad_norm": 0.4138675275200553, + "learning_rate": 3.7722068457816287e-06, + "loss": 0.6088, + "step": 7525 + }, + { + "epoch": 1.9980087614496216, + "grad_norm": 0.4196800846173291, + "learning_rate": 3.771906291549744e-06, + "loss": 0.5843, + "step": 7526 + }, + { + "epoch": 1.9982742599230054, + "grad_norm": 0.3985408842440907, + "learning_rate": 3.7716057125125727e-06, + "loss": 0.5751, + "step": 7527 + }, + { + "epoch": 1.9985397583963893, + "grad_norm": 0.4011213842007995, + "learning_rate": 3.7713051086759764e-06, + "loss": 0.6004, + "step": 7528 + }, + { + "epoch": 1.998805256869773, + "grad_norm": 0.41196775153150056, + "learning_rate": 3.771004480045818e-06, + "loss": 0.5886, + "step": 7529 + }, + { + "epoch": 1.9990707553431568, + "grad_norm": 0.40961083270779247, + "learning_rate": 3.7707038266279604e-06, + "loss": 0.5957, + "step": 7530 + }, + { + "epoch": 1.9993362538165407, + "grad_norm": 0.41781163583119213, + "learning_rate": 3.7704031484282666e-06, + "loss": 0.5324, + "step": 7531 + }, + { + "epoch": 1.9996017522899243, + "grad_norm": 0.4136015002730909, + "learning_rate": 3.7701024454526015e-06, + "loss": 0.578, + "step": 7532 + }, + { + "epoch": 1.999867250763308, + "grad_norm": 0.4167394435791282, + "learning_rate": 3.769801717706828e-06, + "loss": 0.5867, + "step": 7533 + }, + { + "epoch": 2.0, + "grad_norm": 0.6582931606981949, + "learning_rate": 3.769500965196813e-06, + "loss": 0.5582, + "step": 7534 + }, + { + "epoch": 2.0002654984733836, + "grad_norm": 0.39849785806989524, + "learning_rate": 3.7692001879284203e-06, + "loss": 0.5269, + "step": 7535 + }, + { + "epoch": 2.0005309969467677, + "grad_norm": 0.4071731370376904, + "learning_rate": 3.7688993859075164e-06, + "loss": 0.5787, + "step": 7536 + }, + { + "epoch": 2.0007964954201514, + "grad_norm": 0.4060741465652973, + "learning_rate": 3.7685985591399677e-06, + "loss": 0.5649, + "step": 7537 + }, + { + "epoch": 2.001061993893535, + "grad_norm": 0.41035203247693974, + "learning_rate": 3.768297707631642e-06, + "loss": 0.5462, + "step": 7538 + }, + { + "epoch": 2.001327492366919, + "grad_norm": 0.40969406178316803, + "learning_rate": 3.7679968313884042e-06, + "loss": 0.5734, + "step": 7539 + }, + { + "epoch": 2.0015929908403027, + "grad_norm": 0.40768245156178695, + "learning_rate": 3.7676959304161238e-06, + "loss": 0.5512, + "step": 7540 + }, + { + "epoch": 2.0018584893136864, + "grad_norm": 0.4021573870578567, + "learning_rate": 3.76739500472067e-06, + "loss": 0.5539, + "step": 7541 + }, + { + "epoch": 2.00212398778707, + "grad_norm": 0.4124405753842156, + "learning_rate": 3.7670940543079093e-06, + "loss": 0.5433, + "step": 7542 + }, + { + "epoch": 2.002389486260454, + "grad_norm": 0.4147953159519462, + "learning_rate": 3.7667930791837126e-06, + "loss": 0.5792, + "step": 7543 + }, + { + "epoch": 2.0026549847338377, + "grad_norm": 0.4091672369166906, + "learning_rate": 3.7664920793539493e-06, + "loss": 0.5764, + "step": 7544 + }, + { + "epoch": 2.0029204832072214, + "grad_norm": 0.4199860337356155, + "learning_rate": 3.7661910548244896e-06, + "loss": 0.5781, + "step": 7545 + }, + { + "epoch": 2.0031859816806055, + "grad_norm": 0.41147385773224143, + "learning_rate": 3.765890005601204e-06, + "loss": 0.5667, + "step": 7546 + }, + { + "epoch": 2.003451480153989, + "grad_norm": 0.41587528066035606, + "learning_rate": 3.765588931689964e-06, + "loss": 0.5434, + "step": 7547 + }, + { + "epoch": 2.0037169786273727, + "grad_norm": 0.42208180926256395, + "learning_rate": 3.7652878330966414e-06, + "loss": 0.5435, + "step": 7548 + }, + { + "epoch": 2.003982477100757, + "grad_norm": 0.4387065946199643, + "learning_rate": 3.764986709827107e-06, + "loss": 0.607, + "step": 7549 + }, + { + "epoch": 2.0042479755741405, + "grad_norm": 0.4301699883843542, + "learning_rate": 3.764685561887236e-06, + "loss": 0.5479, + "step": 7550 + }, + { + "epoch": 2.004513474047524, + "grad_norm": 0.4192885969648279, + "learning_rate": 3.764384389282899e-06, + "loss": 0.6061, + "step": 7551 + }, + { + "epoch": 2.004778972520908, + "grad_norm": 0.42042741745900486, + "learning_rate": 3.7640831920199706e-06, + "loss": 0.5549, + "step": 7552 + }, + { + "epoch": 2.005044470994292, + "grad_norm": 0.4189877292495763, + "learning_rate": 3.7637819701043256e-06, + "loss": 0.5336, + "step": 7553 + }, + { + "epoch": 2.0053099694676755, + "grad_norm": 0.4283044764271871, + "learning_rate": 3.763480723541838e-06, + "loss": 0.5944, + "step": 7554 + }, + { + "epoch": 2.005575467941059, + "grad_norm": 0.41820593600901707, + "learning_rate": 3.7631794523383823e-06, + "loss": 0.5788, + "step": 7555 + }, + { + "epoch": 2.005840966414443, + "grad_norm": 0.4093590565435108, + "learning_rate": 3.762878156499835e-06, + "loss": 0.5061, + "step": 7556 + }, + { + "epoch": 2.006106464887827, + "grad_norm": 0.4257495887949887, + "learning_rate": 3.762576836032071e-06, + "loss": 0.5894, + "step": 7557 + }, + { + "epoch": 2.0063719633612105, + "grad_norm": 0.42369780156780357, + "learning_rate": 3.7622754909409674e-06, + "loss": 0.6089, + "step": 7558 + }, + { + "epoch": 2.0066374618345946, + "grad_norm": 0.4171308834477733, + "learning_rate": 3.761974121232401e-06, + "loss": 0.5569, + "step": 7559 + }, + { + "epoch": 2.006902960307978, + "grad_norm": 0.4261698926162277, + "learning_rate": 3.76167272691225e-06, + "loss": 0.5541, + "step": 7560 + }, + { + "epoch": 2.007168458781362, + "grad_norm": 0.42610925731238486, + "learning_rate": 3.7613713079863914e-06, + "loss": 0.5565, + "step": 7561 + }, + { + "epoch": 2.007433957254746, + "grad_norm": 0.4239100599712335, + "learning_rate": 3.7610698644607036e-06, + "loss": 0.6016, + "step": 7562 + }, + { + "epoch": 2.0076994557281296, + "grad_norm": 0.42451315438866494, + "learning_rate": 3.7607683963410665e-06, + "loss": 0.5724, + "step": 7563 + }, + { + "epoch": 2.007964954201513, + "grad_norm": 0.4550468384610608, + "learning_rate": 3.7604669036333584e-06, + "loss": 0.5782, + "step": 7564 + }, + { + "epoch": 2.0082304526748973, + "grad_norm": 0.42960257659456064, + "learning_rate": 3.7601653863434596e-06, + "loss": 0.5576, + "step": 7565 + }, + { + "epoch": 2.008495951148281, + "grad_norm": 0.4165642861767433, + "learning_rate": 3.7598638444772505e-06, + "loss": 0.575, + "step": 7566 + }, + { + "epoch": 2.0087614496216646, + "grad_norm": 0.41955441713771147, + "learning_rate": 3.7595622780406116e-06, + "loss": 0.5729, + "step": 7567 + }, + { + "epoch": 2.0090269480950487, + "grad_norm": 0.4142957685437747, + "learning_rate": 3.759260687039424e-06, + "loss": 0.5511, + "step": 7568 + }, + { + "epoch": 2.0092924465684323, + "grad_norm": 0.42324566322808216, + "learning_rate": 3.75895907147957e-06, + "loss": 0.569, + "step": 7569 + }, + { + "epoch": 2.009557945041816, + "grad_norm": 0.42120555280639016, + "learning_rate": 3.758657431366932e-06, + "loss": 0.5956, + "step": 7570 + }, + { + "epoch": 2.0098234435151996, + "grad_norm": 0.41309407656392527, + "learning_rate": 3.758355766707392e-06, + "loss": 0.5809, + "step": 7571 + }, + { + "epoch": 2.0100889419885837, + "grad_norm": 0.41890565553502174, + "learning_rate": 3.7580540775068334e-06, + "loss": 0.5573, + "step": 7572 + }, + { + "epoch": 2.0103544404619673, + "grad_norm": 0.42468503747452674, + "learning_rate": 3.757752363771141e-06, + "loss": 0.5866, + "step": 7573 + }, + { + "epoch": 2.010619938935351, + "grad_norm": 0.40785280191753054, + "learning_rate": 3.757450625506197e-06, + "loss": 0.5332, + "step": 7574 + }, + { + "epoch": 2.010885437408735, + "grad_norm": 0.42729196912372314, + "learning_rate": 3.7571488627178875e-06, + "loss": 0.5881, + "step": 7575 + }, + { + "epoch": 2.0111509358821187, + "grad_norm": 0.4189552892007665, + "learning_rate": 3.7568470754120967e-06, + "loss": 0.5898, + "step": 7576 + }, + { + "epoch": 2.0114164343555023, + "grad_norm": 0.4228253571150553, + "learning_rate": 3.7565452635947106e-06, + "loss": 0.6099, + "step": 7577 + }, + { + "epoch": 2.0116819328288864, + "grad_norm": 0.41617814443775253, + "learning_rate": 3.7562434272716157e-06, + "loss": 0.5865, + "step": 7578 + }, + { + "epoch": 2.01194743130227, + "grad_norm": 0.420106666536967, + "learning_rate": 3.7559415664486985e-06, + "loss": 0.5606, + "step": 7579 + }, + { + "epoch": 2.0122129297756537, + "grad_norm": 0.4164899990776468, + "learning_rate": 3.7556396811318453e-06, + "loss": 0.5773, + "step": 7580 + }, + { + "epoch": 2.0124784282490378, + "grad_norm": 0.41961464901743245, + "learning_rate": 3.7553377713269447e-06, + "loss": 0.5457, + "step": 7581 + }, + { + "epoch": 2.0127439267224214, + "grad_norm": 0.4139187721865026, + "learning_rate": 3.7550358370398825e-06, + "loss": 0.5719, + "step": 7582 + }, + { + "epoch": 2.013009425195805, + "grad_norm": 0.42218109155313555, + "learning_rate": 3.7547338782765496e-06, + "loss": 0.552, + "step": 7583 + }, + { + "epoch": 2.0132749236691887, + "grad_norm": 0.41731766090243194, + "learning_rate": 3.754431895042834e-06, + "loss": 0.5403, + "step": 7584 + }, + { + "epoch": 2.013540422142573, + "grad_norm": 0.41572247097844384, + "learning_rate": 3.754129887344624e-06, + "loss": 0.5516, + "step": 7585 + }, + { + "epoch": 2.0138059206159564, + "grad_norm": 0.41925982552092605, + "learning_rate": 3.753827855187811e-06, + "loss": 0.5741, + "step": 7586 + }, + { + "epoch": 2.01407141908934, + "grad_norm": 0.4108585298729098, + "learning_rate": 3.753525798578286e-06, + "loss": 0.5357, + "step": 7587 + }, + { + "epoch": 2.014336917562724, + "grad_norm": 0.41471959591468255, + "learning_rate": 3.7532237175219378e-06, + "loss": 0.54, + "step": 7588 + }, + { + "epoch": 2.014602416036108, + "grad_norm": 0.4281300748310313, + "learning_rate": 3.7529216120246585e-06, + "loss": 0.5714, + "step": 7589 + }, + { + "epoch": 2.0148679145094914, + "grad_norm": 0.4236491915516071, + "learning_rate": 3.752619482092341e-06, + "loss": 0.5955, + "step": 7590 + }, + { + "epoch": 2.0151334129828755, + "grad_norm": 0.41502115587310157, + "learning_rate": 3.7523173277308763e-06, + "loss": 0.5411, + "step": 7591 + }, + { + "epoch": 2.015398911456259, + "grad_norm": 0.4137020625036817, + "learning_rate": 3.752015148946157e-06, + "loss": 0.5631, + "step": 7592 + }, + { + "epoch": 2.015664409929643, + "grad_norm": 0.4325603342359071, + "learning_rate": 3.7517129457440772e-06, + "loss": 0.5581, + "step": 7593 + }, + { + "epoch": 2.015929908403027, + "grad_norm": 0.4201421730784346, + "learning_rate": 3.7514107181305302e-06, + "loss": 0.5733, + "step": 7594 + }, + { + "epoch": 2.0161954068764105, + "grad_norm": 0.40944023257344, + "learning_rate": 3.7511084661114094e-06, + "loss": 0.5389, + "step": 7595 + }, + { + "epoch": 2.016460905349794, + "grad_norm": 0.4126118131152411, + "learning_rate": 3.7508061896926117e-06, + "loss": 0.5064, + "step": 7596 + }, + { + "epoch": 2.016726403823178, + "grad_norm": 0.42269380346032576, + "learning_rate": 3.7505038888800304e-06, + "loss": 0.6093, + "step": 7597 + }, + { + "epoch": 2.016991902296562, + "grad_norm": 0.4287442720345595, + "learning_rate": 3.750201563679561e-06, + "loss": 0.5907, + "step": 7598 + }, + { + "epoch": 2.0172574007699455, + "grad_norm": 0.45242643608015465, + "learning_rate": 3.7498992140971007e-06, + "loss": 0.53, + "step": 7599 + }, + { + "epoch": 2.017522899243329, + "grad_norm": 0.42679580149521024, + "learning_rate": 3.749596840138545e-06, + "loss": 0.5527, + "step": 7600 + }, + { + "epoch": 2.0177883977167133, + "grad_norm": 0.4170698822851295, + "learning_rate": 3.7492944418097913e-06, + "loss": 0.5646, + "step": 7601 + }, + { + "epoch": 2.018053896190097, + "grad_norm": 0.4246478310886877, + "learning_rate": 3.7489920191167376e-06, + "loss": 0.5789, + "step": 7602 + }, + { + "epoch": 2.0183193946634805, + "grad_norm": 0.4403978176363262, + "learning_rate": 3.7486895720652818e-06, + "loss": 0.5738, + "step": 7603 + }, + { + "epoch": 2.0185848931368646, + "grad_norm": 0.4371435385363269, + "learning_rate": 3.7483871006613207e-06, + "loss": 0.523, + "step": 7604 + }, + { + "epoch": 2.0188503916102483, + "grad_norm": 0.3992124329816631, + "learning_rate": 3.7480846049107555e-06, + "loss": 0.5333, + "step": 7605 + }, + { + "epoch": 2.019115890083632, + "grad_norm": 0.4100451739526877, + "learning_rate": 3.7477820848194846e-06, + "loss": 0.5503, + "step": 7606 + }, + { + "epoch": 2.019381388557016, + "grad_norm": 0.4123828841803448, + "learning_rate": 3.7474795403934083e-06, + "loss": 0.6124, + "step": 7607 + }, + { + "epoch": 2.0196468870303996, + "grad_norm": 0.43135332673488136, + "learning_rate": 3.747176971638426e-06, + "loss": 0.6032, + "step": 7608 + }, + { + "epoch": 2.0199123855037833, + "grad_norm": 0.42000556467921024, + "learning_rate": 3.746874378560439e-06, + "loss": 0.5929, + "step": 7609 + }, + { + "epoch": 2.0201778839771674, + "grad_norm": 0.46319565109114114, + "learning_rate": 3.7465717611653495e-06, + "loss": 0.5828, + "step": 7610 + }, + { + "epoch": 2.020443382450551, + "grad_norm": 0.4413532638846013, + "learning_rate": 3.746269119459058e-06, + "loss": 0.5787, + "step": 7611 + }, + { + "epoch": 2.0207088809239346, + "grad_norm": 0.41573442279836215, + "learning_rate": 3.745966453447467e-06, + "loss": 0.5654, + "step": 7612 + }, + { + "epoch": 2.0209743793973183, + "grad_norm": 0.4162260772594856, + "learning_rate": 3.7456637631364802e-06, + "loss": 0.5592, + "step": 7613 + }, + { + "epoch": 2.0212398778707024, + "grad_norm": 0.4410681515808186, + "learning_rate": 3.7453610485319993e-06, + "loss": 0.55, + "step": 7614 + }, + { + "epoch": 2.021505376344086, + "grad_norm": 0.4495847194498986, + "learning_rate": 3.7450583096399294e-06, + "loss": 0.5343, + "step": 7615 + }, + { + "epoch": 2.0217708748174696, + "grad_norm": 0.42121033902836386, + "learning_rate": 3.744755546466174e-06, + "loss": 0.5929, + "step": 7616 + }, + { + "epoch": 2.0220363732908537, + "grad_norm": 0.43407158503299403, + "learning_rate": 3.744452759016638e-06, + "loss": 0.6031, + "step": 7617 + }, + { + "epoch": 2.0223018717642374, + "grad_norm": 0.4186600240145897, + "learning_rate": 3.7441499472972254e-06, + "loss": 0.5669, + "step": 7618 + }, + { + "epoch": 2.022567370237621, + "grad_norm": 0.41865460694290674, + "learning_rate": 3.7438471113138434e-06, + "loss": 0.5922, + "step": 7619 + }, + { + "epoch": 2.022832868711005, + "grad_norm": 0.42194320821898607, + "learning_rate": 3.7435442510723967e-06, + "loss": 0.5646, + "step": 7620 + }, + { + "epoch": 2.0230983671843887, + "grad_norm": 0.4317384900363479, + "learning_rate": 3.7432413665787924e-06, + "loss": 0.5969, + "step": 7621 + }, + { + "epoch": 2.0233638656577724, + "grad_norm": 0.4221989337973331, + "learning_rate": 3.742938457838938e-06, + "loss": 0.5728, + "step": 7622 + }, + { + "epoch": 2.0236293641311565, + "grad_norm": 0.4302089253587822, + "learning_rate": 3.74263552485874e-06, + "loss": 0.5962, + "step": 7623 + }, + { + "epoch": 2.02389486260454, + "grad_norm": 0.4169788331119294, + "learning_rate": 3.742332567644107e-06, + "loss": 0.5741, + "step": 7624 + }, + { + "epoch": 2.0241603610779237, + "grad_norm": 0.4127579688579369, + "learning_rate": 3.7420295862009474e-06, + "loss": 0.5491, + "step": 7625 + }, + { + "epoch": 2.0244258595513074, + "grad_norm": 0.4240942212642293, + "learning_rate": 3.7417265805351694e-06, + "loss": 0.5652, + "step": 7626 + }, + { + "epoch": 2.0246913580246915, + "grad_norm": 0.4283571800703669, + "learning_rate": 3.7414235506526837e-06, + "loss": 0.5731, + "step": 7627 + }, + { + "epoch": 2.024956856498075, + "grad_norm": 0.4460787575655464, + "learning_rate": 3.7411204965593982e-06, + "loss": 0.5805, + "step": 7628 + }, + { + "epoch": 2.0252223549714587, + "grad_norm": 0.40790318427616307, + "learning_rate": 3.7408174182612247e-06, + "loss": 0.5861, + "step": 7629 + }, + { + "epoch": 2.025487853444843, + "grad_norm": 0.41931174937974974, + "learning_rate": 3.740514315764073e-06, + "loss": 0.552, + "step": 7630 + }, + { + "epoch": 2.0257533519182265, + "grad_norm": 0.418633780463944, + "learning_rate": 3.7402111890738545e-06, + "loss": 0.5542, + "step": 7631 + }, + { + "epoch": 2.02601885039161, + "grad_norm": 0.43929183786810916, + "learning_rate": 3.7399080381964825e-06, + "loss": 0.539, + "step": 7632 + }, + { + "epoch": 2.026284348864994, + "grad_norm": 0.4331059734537798, + "learning_rate": 3.7396048631378674e-06, + "loss": 0.5622, + "step": 7633 + }, + { + "epoch": 2.026549847338378, + "grad_norm": 0.4301383493546203, + "learning_rate": 3.7393016639039228e-06, + "loss": 0.5667, + "step": 7634 + }, + { + "epoch": 2.0268153458117615, + "grad_norm": 0.4248557675957671, + "learning_rate": 3.7389984405005604e-06, + "loss": 0.5844, + "step": 7635 + }, + { + "epoch": 2.0270808442851456, + "grad_norm": 0.4203227396536002, + "learning_rate": 3.7386951929336957e-06, + "loss": 0.5519, + "step": 7636 + }, + { + "epoch": 2.027346342758529, + "grad_norm": 0.4305163204228117, + "learning_rate": 3.7383919212092416e-06, + "loss": 0.5751, + "step": 7637 + }, + { + "epoch": 2.027611841231913, + "grad_norm": 0.43216988356933955, + "learning_rate": 3.7380886253331126e-06, + "loss": 0.5831, + "step": 7638 + }, + { + "epoch": 2.0278773397052965, + "grad_norm": 0.4377616383381146, + "learning_rate": 3.737785305311225e-06, + "loss": 0.5081, + "step": 7639 + }, + { + "epoch": 2.0281428381786806, + "grad_norm": 0.41417289928853834, + "learning_rate": 3.7374819611494927e-06, + "loss": 0.554, + "step": 7640 + }, + { + "epoch": 2.028408336652064, + "grad_norm": 0.4190053409408561, + "learning_rate": 3.737178592853832e-06, + "loss": 0.6212, + "step": 7641 + }, + { + "epoch": 2.028673835125448, + "grad_norm": 0.4745876794533292, + "learning_rate": 3.73687520043016e-06, + "loss": 0.5296, + "step": 7642 + }, + { + "epoch": 2.028939333598832, + "grad_norm": 0.4146131427047848, + "learning_rate": 3.7365717838843928e-06, + "loss": 0.5037, + "step": 7643 + }, + { + "epoch": 2.0292048320722156, + "grad_norm": 0.41213155652429123, + "learning_rate": 3.7362683432224484e-06, + "loss": 0.5598, + "step": 7644 + }, + { + "epoch": 2.029470330545599, + "grad_norm": 0.4267860938556258, + "learning_rate": 3.7359648784502444e-06, + "loss": 0.5625, + "step": 7645 + }, + { + "epoch": 2.0297358290189833, + "grad_norm": 0.43792749386608554, + "learning_rate": 3.7356613895736993e-06, + "loss": 0.5766, + "step": 7646 + }, + { + "epoch": 2.030001327492367, + "grad_norm": 0.44112459213729, + "learning_rate": 3.7353578765987315e-06, + "loss": 0.5795, + "step": 7647 + }, + { + "epoch": 2.0302668259657506, + "grad_norm": 0.44261619652241374, + "learning_rate": 3.7350543395312604e-06, + "loss": 0.531, + "step": 7648 + }, + { + "epoch": 2.0305323244391347, + "grad_norm": 0.4360306095858179, + "learning_rate": 3.734750778377206e-06, + "loss": 0.5928, + "step": 7649 + }, + { + "epoch": 2.0307978229125183, + "grad_norm": 0.4333461417705298, + "learning_rate": 3.734447193142488e-06, + "loss": 0.5462, + "step": 7650 + }, + { + "epoch": 2.031063321385902, + "grad_norm": 0.4666508118346647, + "learning_rate": 3.734143583833027e-06, + "loss": 0.559, + "step": 7651 + }, + { + "epoch": 2.0313288198592856, + "grad_norm": 0.43882769553941153, + "learning_rate": 3.7338399504547445e-06, + "loss": 0.5748, + "step": 7652 + }, + { + "epoch": 2.0315943183326697, + "grad_norm": 0.4194428029826446, + "learning_rate": 3.733536293013563e-06, + "loss": 0.5978, + "step": 7653 + }, + { + "epoch": 2.0318598168060533, + "grad_norm": 0.417744108275885, + "learning_rate": 3.7332326115154025e-06, + "loss": 0.5844, + "step": 7654 + }, + { + "epoch": 2.032125315279437, + "grad_norm": 0.4354069542579492, + "learning_rate": 3.7329289059661868e-06, + "loss": 0.5549, + "step": 7655 + }, + { + "epoch": 2.032390813752821, + "grad_norm": 0.42014331447022146, + "learning_rate": 3.732625176371839e-06, + "loss": 0.5597, + "step": 7656 + }, + { + "epoch": 2.0326563122262047, + "grad_norm": 0.41969329872377203, + "learning_rate": 3.732321422738282e-06, + "loss": 0.5816, + "step": 7657 + }, + { + "epoch": 2.0329218106995883, + "grad_norm": 0.44417633924115807, + "learning_rate": 3.73201764507144e-06, + "loss": 0.5443, + "step": 7658 + }, + { + "epoch": 2.0331873091729724, + "grad_norm": 0.43760628593704454, + "learning_rate": 3.7317138433772377e-06, + "loss": 0.5481, + "step": 7659 + }, + { + "epoch": 2.033452807646356, + "grad_norm": 0.43643749682735955, + "learning_rate": 3.731410017661599e-06, + "loss": 0.562, + "step": 7660 + }, + { + "epoch": 2.0337183061197397, + "grad_norm": 0.42125702993078595, + "learning_rate": 3.7311061679304506e-06, + "loss": 0.5775, + "step": 7661 + }, + { + "epoch": 2.033983804593124, + "grad_norm": 0.412025601256085, + "learning_rate": 3.730802294189718e-06, + "loss": 0.5401, + "step": 7662 + }, + { + "epoch": 2.0342493030665074, + "grad_norm": 0.40227071163334316, + "learning_rate": 3.730498396445326e-06, + "loss": 0.5538, + "step": 7663 + }, + { + "epoch": 2.034514801539891, + "grad_norm": 0.42522090826179904, + "learning_rate": 3.7301944747032037e-06, + "loss": 0.5707, + "step": 7664 + }, + { + "epoch": 2.034780300013275, + "grad_norm": 0.4076057269084683, + "learning_rate": 3.7298905289692768e-06, + "loss": 0.5489, + "step": 7665 + }, + { + "epoch": 2.035045798486659, + "grad_norm": 0.4261798306346251, + "learning_rate": 3.7295865592494727e-06, + "loss": 0.5942, + "step": 7666 + }, + { + "epoch": 2.0353112969600424, + "grad_norm": 0.40597294431936004, + "learning_rate": 3.7292825655497208e-06, + "loss": 0.5898, + "step": 7667 + }, + { + "epoch": 2.035576795433426, + "grad_norm": 0.42170416130366817, + "learning_rate": 3.728978547875948e-06, + "loss": 0.5609, + "step": 7668 + }, + { + "epoch": 2.03584229390681, + "grad_norm": 0.42465680129814404, + "learning_rate": 3.7286745062340856e-06, + "loss": 0.5939, + "step": 7669 + }, + { + "epoch": 2.036107792380194, + "grad_norm": 0.4260096417406519, + "learning_rate": 3.7283704406300613e-06, + "loss": 0.5669, + "step": 7670 + }, + { + "epoch": 2.0363732908535774, + "grad_norm": 0.4371695031094179, + "learning_rate": 3.728066351069807e-06, + "loss": 0.6047, + "step": 7671 + }, + { + "epoch": 2.0366387893269615, + "grad_norm": 0.43228718040309333, + "learning_rate": 3.7277622375592504e-06, + "loss": 0.5872, + "step": 7672 + }, + { + "epoch": 2.036904287800345, + "grad_norm": 0.42126725579026864, + "learning_rate": 3.727458100104325e-06, + "loss": 0.5564, + "step": 7673 + }, + { + "epoch": 2.037169786273729, + "grad_norm": 0.4203582358892254, + "learning_rate": 3.727153938710962e-06, + "loss": 0.56, + "step": 7674 + }, + { + "epoch": 2.037435284747113, + "grad_norm": 0.43054509101938965, + "learning_rate": 3.7268497533850912e-06, + "loss": 0.5924, + "step": 7675 + }, + { + "epoch": 2.0377007832204965, + "grad_norm": 0.4403891041563899, + "learning_rate": 3.726545544132647e-06, + "loss": 0.561, + "step": 7676 + }, + { + "epoch": 2.03796628169388, + "grad_norm": 0.4300042180236443, + "learning_rate": 3.7262413109595614e-06, + "loss": 0.5722, + "step": 7677 + }, + { + "epoch": 2.0382317801672643, + "grad_norm": 0.41465669546975464, + "learning_rate": 3.725937053871768e-06, + "loss": 0.5469, + "step": 7678 + }, + { + "epoch": 2.038497278640648, + "grad_norm": 0.41521138930570706, + "learning_rate": 3.7256327728752004e-06, + "loss": 0.5241, + "step": 7679 + }, + { + "epoch": 2.0387627771140315, + "grad_norm": 0.4333990338459932, + "learning_rate": 3.7253284679757936e-06, + "loss": 0.5455, + "step": 7680 + }, + { + "epoch": 2.039028275587415, + "grad_norm": 0.4424437364284227, + "learning_rate": 3.7250241391794807e-06, + "loss": 0.579, + "step": 7681 + }, + { + "epoch": 2.0392937740607993, + "grad_norm": 0.4205536813721641, + "learning_rate": 3.724719786492198e-06, + "loss": 0.5407, + "step": 7682 + }, + { + "epoch": 2.039559272534183, + "grad_norm": 0.4336418021408473, + "learning_rate": 3.7244154099198805e-06, + "loss": 0.5719, + "step": 7683 + }, + { + "epoch": 2.0398247710075665, + "grad_norm": 0.4185342525626771, + "learning_rate": 3.7241110094684648e-06, + "loss": 0.5998, + "step": 7684 + }, + { + "epoch": 2.0400902694809506, + "grad_norm": 0.4092990278385108, + "learning_rate": 3.7238065851438875e-06, + "loss": 0.5307, + "step": 7685 + }, + { + "epoch": 2.0403557679543343, + "grad_norm": 0.42220281642527613, + "learning_rate": 3.7235021369520846e-06, + "loss": 0.5948, + "step": 7686 + }, + { + "epoch": 2.040621266427718, + "grad_norm": 0.41946934430187743, + "learning_rate": 3.7231976648989952e-06, + "loss": 0.555, + "step": 7687 + }, + { + "epoch": 2.040886764901102, + "grad_norm": 0.4155330687715104, + "learning_rate": 3.722893168990557e-06, + "loss": 0.5624, + "step": 7688 + }, + { + "epoch": 2.0411522633744856, + "grad_norm": 0.43694318526983805, + "learning_rate": 3.7225886492327066e-06, + "loss": 0.5874, + "step": 7689 + }, + { + "epoch": 2.0414177618478693, + "grad_norm": 0.42589338686301553, + "learning_rate": 3.722284105631384e-06, + "loss": 0.601, + "step": 7690 + }, + { + "epoch": 2.0416832603212534, + "grad_norm": 0.41972012350617455, + "learning_rate": 3.72197953819253e-06, + "loss": 0.552, + "step": 7691 + }, + { + "epoch": 2.041948758794637, + "grad_norm": 0.4254822587465102, + "learning_rate": 3.7216749469220824e-06, + "loss": 0.5359, + "step": 7692 + }, + { + "epoch": 2.0422142572680206, + "grad_norm": 0.42636959739717756, + "learning_rate": 3.7213703318259818e-06, + "loss": 0.5773, + "step": 7693 + }, + { + "epoch": 2.0424797557414043, + "grad_norm": 0.43572473475673823, + "learning_rate": 3.7210656929101703e-06, + "loss": 0.6001, + "step": 7694 + }, + { + "epoch": 2.0427452542147884, + "grad_norm": 0.4228827725201327, + "learning_rate": 3.7207610301805873e-06, + "loss": 0.5574, + "step": 7695 + }, + { + "epoch": 2.043010752688172, + "grad_norm": 0.4397596116298814, + "learning_rate": 3.720456343643175e-06, + "loss": 0.5829, + "step": 7696 + }, + { + "epoch": 2.0432762511615556, + "grad_norm": 0.4259909422937582, + "learning_rate": 3.720151633303876e-06, + "loss": 0.5973, + "step": 7697 + }, + { + "epoch": 2.0435417496349397, + "grad_norm": 0.42250652170575065, + "learning_rate": 3.719846899168633e-06, + "loss": 0.5779, + "step": 7698 + }, + { + "epoch": 2.0438072481083234, + "grad_norm": 0.42582558909710705, + "learning_rate": 3.7195421412433886e-06, + "loss": 0.5898, + "step": 7699 + }, + { + "epoch": 2.044072746581707, + "grad_norm": 0.4374999968666794, + "learning_rate": 3.719237359534087e-06, + "loss": 0.6003, + "step": 7700 + }, + { + "epoch": 2.044338245055091, + "grad_norm": 0.43402508356364894, + "learning_rate": 3.718932554046671e-06, + "loss": 0.5316, + "step": 7701 + }, + { + "epoch": 2.0446037435284747, + "grad_norm": 0.4217374193641303, + "learning_rate": 3.718627724787086e-06, + "loss": 0.5294, + "step": 7702 + }, + { + "epoch": 2.0448692420018584, + "grad_norm": 0.4357271044509355, + "learning_rate": 3.718322871761277e-06, + "loss": 0.5812, + "step": 7703 + }, + { + "epoch": 2.0451347404752425, + "grad_norm": 0.418012641235714, + "learning_rate": 3.7180179949751883e-06, + "loss": 0.5752, + "step": 7704 + }, + { + "epoch": 2.045400238948626, + "grad_norm": 0.4301091068066781, + "learning_rate": 3.7177130944347666e-06, + "loss": 0.5386, + "step": 7705 + }, + { + "epoch": 2.0456657374220097, + "grad_norm": 0.4203131046534478, + "learning_rate": 3.7174081701459586e-06, + "loss": 0.5622, + "step": 7706 + }, + { + "epoch": 2.0459312358953934, + "grad_norm": 0.4240063413130722, + "learning_rate": 3.7171032221147103e-06, + "loss": 0.5582, + "step": 7707 + }, + { + "epoch": 2.0461967343687775, + "grad_norm": 0.4060322554264638, + "learning_rate": 3.716798250346969e-06, + "loss": 0.5161, + "step": 7708 + }, + { + "epoch": 2.046462232842161, + "grad_norm": 0.43352299455194343, + "learning_rate": 3.716493254848683e-06, + "loss": 0.5981, + "step": 7709 + }, + { + "epoch": 2.0467277313155448, + "grad_norm": 0.4211014646050209, + "learning_rate": 3.7161882356257993e-06, + "loss": 0.5362, + "step": 7710 + }, + { + "epoch": 2.046993229788929, + "grad_norm": 0.4293299229493533, + "learning_rate": 3.7158831926842687e-06, + "loss": 0.6055, + "step": 7711 + }, + { + "epoch": 2.0472587282623125, + "grad_norm": 0.41760152208003454, + "learning_rate": 3.715578126030037e-06, + "loss": 0.5693, + "step": 7712 + }, + { + "epoch": 2.047524226735696, + "grad_norm": 0.4243942442997636, + "learning_rate": 3.715273035669057e-06, + "loss": 0.556, + "step": 7713 + }, + { + "epoch": 2.04778972520908, + "grad_norm": 0.4173919697997775, + "learning_rate": 3.7149679216072765e-06, + "loss": 0.5945, + "step": 7714 + }, + { + "epoch": 2.048055223682464, + "grad_norm": 0.45992829964457643, + "learning_rate": 3.7146627838506472e-06, + "loss": 0.5588, + "step": 7715 + }, + { + "epoch": 2.0483207221558475, + "grad_norm": 0.43886136896037264, + "learning_rate": 3.714357622405119e-06, + "loss": 0.5657, + "step": 7716 + }, + { + "epoch": 2.0485862206292316, + "grad_norm": 0.42856309454913505, + "learning_rate": 3.714052437276645e-06, + "loss": 0.5857, + "step": 7717 + }, + { + "epoch": 2.048851719102615, + "grad_norm": 0.4245673323923108, + "learning_rate": 3.713747228471175e-06, + "loss": 0.578, + "step": 7718 + }, + { + "epoch": 2.049117217575999, + "grad_norm": 0.42822911570742994, + "learning_rate": 3.7134419959946626e-06, + "loss": 0.6105, + "step": 7719 + }, + { + "epoch": 2.049382716049383, + "grad_norm": 0.43342355652095027, + "learning_rate": 3.7131367398530605e-06, + "loss": 0.5639, + "step": 7720 + }, + { + "epoch": 2.0496482145227666, + "grad_norm": 0.42306124914564985, + "learning_rate": 3.7128314600523217e-06, + "loss": 0.53, + "step": 7721 + }, + { + "epoch": 2.04991371299615, + "grad_norm": 0.429524960789082, + "learning_rate": 3.7125261565983996e-06, + "loss": 0.5773, + "step": 7722 + }, + { + "epoch": 2.050179211469534, + "grad_norm": 0.4165027448814866, + "learning_rate": 3.7122208294972484e-06, + "loss": 0.5814, + "step": 7723 + }, + { + "epoch": 2.050444709942918, + "grad_norm": 0.41478576095573133, + "learning_rate": 3.7119154787548233e-06, + "loss": 0.5573, + "step": 7724 + }, + { + "epoch": 2.0507102084163016, + "grad_norm": 0.42798827161094444, + "learning_rate": 3.711610104377079e-06, + "loss": 0.6165, + "step": 7725 + }, + { + "epoch": 2.0509757068896852, + "grad_norm": 0.4381756160787756, + "learning_rate": 3.7113047063699716e-06, + "loss": 0.5448, + "step": 7726 + }, + { + "epoch": 2.0512412053630693, + "grad_norm": 0.4380547496917146, + "learning_rate": 3.710999284739456e-06, + "loss": 0.5532, + "step": 7727 + }, + { + "epoch": 2.051506703836453, + "grad_norm": 0.4091692603052029, + "learning_rate": 3.7106938394914905e-06, + "loss": 0.5262, + "step": 7728 + }, + { + "epoch": 2.0517722023098366, + "grad_norm": 0.42371305527673586, + "learning_rate": 3.7103883706320298e-06, + "loss": 0.5913, + "step": 7729 + }, + { + "epoch": 2.0520377007832207, + "grad_norm": 0.44482620761765956, + "learning_rate": 3.710082878167032e-06, + "loss": 0.6009, + "step": 7730 + }, + { + "epoch": 2.0523031992566043, + "grad_norm": 0.4478590625609402, + "learning_rate": 3.709777362102456e-06, + "loss": 0.6007, + "step": 7731 + }, + { + "epoch": 2.052568697729988, + "grad_norm": 0.4200693991034797, + "learning_rate": 3.709471822444259e-06, + "loss": 0.5158, + "step": 7732 + }, + { + "epoch": 2.052834196203372, + "grad_norm": 0.4274649048330147, + "learning_rate": 3.7091662591984e-06, + "loss": 0.5679, + "step": 7733 + }, + { + "epoch": 2.0530996946767557, + "grad_norm": 0.4265328375630852, + "learning_rate": 3.7088606723708393e-06, + "loss": 0.541, + "step": 7734 + }, + { + "epoch": 2.0533651931501393, + "grad_norm": 0.43518243420586755, + "learning_rate": 3.7085550619675347e-06, + "loss": 0.5586, + "step": 7735 + }, + { + "epoch": 2.053630691623523, + "grad_norm": 0.4214372646265983, + "learning_rate": 3.708249427994448e-06, + "loss": 0.5604, + "step": 7736 + }, + { + "epoch": 2.053896190096907, + "grad_norm": 0.41798223480817026, + "learning_rate": 3.7079437704575395e-06, + "loss": 0.5592, + "step": 7737 + }, + { + "epoch": 2.0541616885702907, + "grad_norm": 0.42885288716245795, + "learning_rate": 3.707638089362769e-06, + "loss": 0.5929, + "step": 7738 + }, + { + "epoch": 2.0544271870436743, + "grad_norm": 0.4152669744642473, + "learning_rate": 3.7073323847160992e-06, + "loss": 0.6087, + "step": 7739 + }, + { + "epoch": 2.0546926855170584, + "grad_norm": 0.4269617943515965, + "learning_rate": 3.7070266565234917e-06, + "loss": 0.5771, + "step": 7740 + }, + { + "epoch": 2.054958183990442, + "grad_norm": 0.43199520898848653, + "learning_rate": 3.706720904790909e-06, + "loss": 0.6017, + "step": 7741 + }, + { + "epoch": 2.0552236824638257, + "grad_norm": 0.4200441556777171, + "learning_rate": 3.7064151295243146e-06, + "loss": 0.5731, + "step": 7742 + }, + { + "epoch": 2.05548918093721, + "grad_norm": 0.4302128136855923, + "learning_rate": 3.706109330729671e-06, + "loss": 0.566, + "step": 7743 + }, + { + "epoch": 2.0557546794105934, + "grad_norm": 0.4157932270518941, + "learning_rate": 3.7058035084129427e-06, + "loss": 0.5234, + "step": 7744 + }, + { + "epoch": 2.056020177883977, + "grad_norm": 0.42383304192797683, + "learning_rate": 3.705497662580093e-06, + "loss": 0.578, + "step": 7745 + }, + { + "epoch": 2.056285676357361, + "grad_norm": 0.418269199905343, + "learning_rate": 3.7051917932370886e-06, + "loss": 0.5404, + "step": 7746 + }, + { + "epoch": 2.056551174830745, + "grad_norm": 0.43038760469113735, + "learning_rate": 3.704885900389892e-06, + "loss": 0.5853, + "step": 7747 + }, + { + "epoch": 2.0568166733041284, + "grad_norm": 0.42837077521366473, + "learning_rate": 3.7045799840444712e-06, + "loss": 0.5683, + "step": 7748 + }, + { + "epoch": 2.057082171777512, + "grad_norm": 0.4159337958849824, + "learning_rate": 3.7042740442067914e-06, + "loss": 0.5873, + "step": 7749 + }, + { + "epoch": 2.057347670250896, + "grad_norm": 0.4155364626869827, + "learning_rate": 3.7039680808828187e-06, + "loss": 0.5045, + "step": 7750 + }, + { + "epoch": 2.05761316872428, + "grad_norm": 0.4147872071239813, + "learning_rate": 3.7036620940785205e-06, + "loss": 0.5425, + "step": 7751 + }, + { + "epoch": 2.0578786671976634, + "grad_norm": 0.43504772272129594, + "learning_rate": 3.703356083799865e-06, + "loss": 0.5861, + "step": 7752 + }, + { + "epoch": 2.0581441656710475, + "grad_norm": 0.4417888768490918, + "learning_rate": 3.703050050052819e-06, + "loss": 0.5967, + "step": 7753 + }, + { + "epoch": 2.058409664144431, + "grad_norm": 0.44110579582481657, + "learning_rate": 3.702743992843352e-06, + "loss": 0.6009, + "step": 7754 + }, + { + "epoch": 2.058675162617815, + "grad_norm": 0.44121555242124766, + "learning_rate": 3.7024379121774316e-06, + "loss": 0.5667, + "step": 7755 + }, + { + "epoch": 2.058940661091199, + "grad_norm": 0.4413542512011281, + "learning_rate": 3.702131808061028e-06, + "loss": 0.5714, + "step": 7756 + }, + { + "epoch": 2.0592061595645825, + "grad_norm": 0.41821395731660016, + "learning_rate": 3.701825680500112e-06, + "loss": 0.5289, + "step": 7757 + }, + { + "epoch": 2.059471658037966, + "grad_norm": 0.4264440248501459, + "learning_rate": 3.701519529500651e-06, + "loss": 0.5664, + "step": 7758 + }, + { + "epoch": 2.0597371565113503, + "grad_norm": 0.4348680370036617, + "learning_rate": 3.701213355068618e-06, + "loss": 0.5764, + "step": 7759 + }, + { + "epoch": 2.060002654984734, + "grad_norm": 0.4169045795485788, + "learning_rate": 3.7009071572099837e-06, + "loss": 0.5458, + "step": 7760 + }, + { + "epoch": 2.0602681534581175, + "grad_norm": 0.4212789827782027, + "learning_rate": 3.7006009359307193e-06, + "loss": 0.5645, + "step": 7761 + }, + { + "epoch": 2.060533651931501, + "grad_norm": 0.4294729359979502, + "learning_rate": 3.7002946912367964e-06, + "loss": 0.5873, + "step": 7762 + }, + { + "epoch": 2.0607991504048853, + "grad_norm": 0.42833423953440547, + "learning_rate": 3.6999884231341897e-06, + "loss": 0.5573, + "step": 7763 + }, + { + "epoch": 2.061064648878269, + "grad_norm": 0.4129329112730173, + "learning_rate": 3.6996821316288694e-06, + "loss": 0.5151, + "step": 7764 + }, + { + "epoch": 2.0613301473516525, + "grad_norm": 0.4136739284238691, + "learning_rate": 3.6993758167268106e-06, + "loss": 0.5752, + "step": 7765 + }, + { + "epoch": 2.0615956458250366, + "grad_norm": 0.4277226213962251, + "learning_rate": 3.6990694784339874e-06, + "loss": 0.5866, + "step": 7766 + }, + { + "epoch": 2.0618611442984203, + "grad_norm": 0.45117832437480265, + "learning_rate": 3.698763116756373e-06, + "loss": 0.5296, + "step": 7767 + }, + { + "epoch": 2.062126642771804, + "grad_norm": 0.4359516593196853, + "learning_rate": 3.6984567316999424e-06, + "loss": 0.5816, + "step": 7768 + }, + { + "epoch": 2.062392141245188, + "grad_norm": 0.42430459215163924, + "learning_rate": 3.698150323270672e-06, + "loss": 0.5559, + "step": 7769 + }, + { + "epoch": 2.0626576397185716, + "grad_norm": 0.4316193966556973, + "learning_rate": 3.697843891474536e-06, + "loss": 0.5435, + "step": 7770 + }, + { + "epoch": 2.0629231381919553, + "grad_norm": 0.4201709266798109, + "learning_rate": 3.6975374363175107e-06, + "loss": 0.6033, + "step": 7771 + }, + { + "epoch": 2.0631886366653394, + "grad_norm": 0.41667461001322864, + "learning_rate": 3.697230957805575e-06, + "loss": 0.6042, + "step": 7772 + }, + { + "epoch": 2.063454135138723, + "grad_norm": 0.4070286357155365, + "learning_rate": 3.696924455944703e-06, + "loss": 0.5628, + "step": 7773 + }, + { + "epoch": 2.0637196336121066, + "grad_norm": 0.4240294318916209, + "learning_rate": 3.6966179307408747e-06, + "loss": 0.6006, + "step": 7774 + }, + { + "epoch": 2.0639851320854907, + "grad_norm": 0.4333631719587378, + "learning_rate": 3.6963113822000663e-06, + "loss": 0.6029, + "step": 7775 + }, + { + "epoch": 2.0642506305588744, + "grad_norm": 0.4266711901374407, + "learning_rate": 3.6960048103282563e-06, + "loss": 0.5604, + "step": 7776 + }, + { + "epoch": 2.064516129032258, + "grad_norm": 0.4217592571285186, + "learning_rate": 3.6956982151314252e-06, + "loss": 0.5784, + "step": 7777 + }, + { + "epoch": 2.0647816275056416, + "grad_norm": 0.4178207373455119, + "learning_rate": 3.69539159661555e-06, + "loss": 0.5623, + "step": 7778 + }, + { + "epoch": 2.0650471259790257, + "grad_norm": 0.43476193943169533, + "learning_rate": 3.695084954786613e-06, + "loss": 0.5765, + "step": 7779 + }, + { + "epoch": 2.0653126244524094, + "grad_norm": 0.41752260794030094, + "learning_rate": 3.694778289650593e-06, + "loss": 0.5727, + "step": 7780 + }, + { + "epoch": 2.065578122925793, + "grad_norm": 0.42570125662112296, + "learning_rate": 3.6944716012134707e-06, + "loss": 0.5573, + "step": 7781 + }, + { + "epoch": 2.065843621399177, + "grad_norm": 0.41994692482972573, + "learning_rate": 3.6941648894812284e-06, + "loss": 0.55, + "step": 7782 + }, + { + "epoch": 2.0661091198725607, + "grad_norm": 0.42563655301362135, + "learning_rate": 3.693858154459846e-06, + "loss": 0.5887, + "step": 7783 + }, + { + "epoch": 2.0663746183459444, + "grad_norm": 0.42205018601159106, + "learning_rate": 3.6935513961553073e-06, + "loss": 0.5659, + "step": 7784 + }, + { + "epoch": 2.0666401168193285, + "grad_norm": 0.4376486616043209, + "learning_rate": 3.6932446145735933e-06, + "loss": 0.5667, + "step": 7785 + }, + { + "epoch": 2.066905615292712, + "grad_norm": 0.422678319460469, + "learning_rate": 3.6929378097206887e-06, + "loss": 0.5637, + "step": 7786 + }, + { + "epoch": 2.0671711137660957, + "grad_norm": 0.4365645980330758, + "learning_rate": 3.692630981602575e-06, + "loss": 0.5637, + "step": 7787 + }, + { + "epoch": 2.06743661223948, + "grad_norm": 0.42443119232306886, + "learning_rate": 3.6923241302252374e-06, + "loss": 0.5754, + "step": 7788 + }, + { + "epoch": 2.0677021107128635, + "grad_norm": 0.4271289335997637, + "learning_rate": 3.6920172555946603e-06, + "loss": 0.546, + "step": 7789 + }, + { + "epoch": 2.067967609186247, + "grad_norm": 0.42034105394703203, + "learning_rate": 3.691710357716828e-06, + "loss": 0.5659, + "step": 7790 + }, + { + "epoch": 2.0682331076596308, + "grad_norm": 0.43598933072187385, + "learning_rate": 3.691403436597726e-06, + "loss": 0.6016, + "step": 7791 + }, + { + "epoch": 2.068498606133015, + "grad_norm": 0.42939731646220797, + "learning_rate": 3.69109649224334e-06, + "loss": 0.576, + "step": 7792 + }, + { + "epoch": 2.0687641046063985, + "grad_norm": 0.42126264339623465, + "learning_rate": 3.6907895246596557e-06, + "loss": 0.5835, + "step": 7793 + }, + { + "epoch": 2.069029603079782, + "grad_norm": 0.41874512403417113, + "learning_rate": 3.69048253385266e-06, + "loss": 0.5481, + "step": 7794 + }, + { + "epoch": 2.069295101553166, + "grad_norm": 0.4430669140381732, + "learning_rate": 3.6901755198283407e-06, + "loss": 0.5892, + "step": 7795 + }, + { + "epoch": 2.06956060002655, + "grad_norm": 0.42390619153150105, + "learning_rate": 3.6898684825926845e-06, + "loss": 0.5641, + "step": 7796 + }, + { + "epoch": 2.0698260984999335, + "grad_norm": 0.4310311062309684, + "learning_rate": 3.6895614221516797e-06, + "loss": 0.6118, + "step": 7797 + }, + { + "epoch": 2.0700915969733176, + "grad_norm": 0.43030874812934844, + "learning_rate": 3.6892543385113137e-06, + "loss": 0.5567, + "step": 7798 + }, + { + "epoch": 2.070357095446701, + "grad_norm": 0.4218225175263902, + "learning_rate": 3.688947231677577e-06, + "loss": 0.5876, + "step": 7799 + }, + { + "epoch": 2.070622593920085, + "grad_norm": 0.4349945784479379, + "learning_rate": 3.688640101656459e-06, + "loss": 0.617, + "step": 7800 + }, + { + "epoch": 2.070888092393469, + "grad_norm": 0.4416526910478568, + "learning_rate": 3.688332948453948e-06, + "loss": 0.5852, + "step": 7801 + }, + { + "epoch": 2.0711535908668526, + "grad_norm": 0.4001023980887157, + "learning_rate": 3.6880257720760343e-06, + "loss": 0.5177, + "step": 7802 + }, + { + "epoch": 2.071419089340236, + "grad_norm": 0.4059228460822013, + "learning_rate": 3.6877185725287102e-06, + "loss": 0.5201, + "step": 7803 + }, + { + "epoch": 2.07168458781362, + "grad_norm": 0.4369884170155583, + "learning_rate": 3.6874113498179653e-06, + "loss": 0.5785, + "step": 7804 + }, + { + "epoch": 2.071950086287004, + "grad_norm": 0.4149170948361833, + "learning_rate": 3.6871041039497924e-06, + "loss": 0.5795, + "step": 7805 + }, + { + "epoch": 2.0722155847603876, + "grad_norm": 0.43479784841791536, + "learning_rate": 3.6867968349301825e-06, + "loss": 0.5827, + "step": 7806 + }, + { + "epoch": 2.0724810832337712, + "grad_norm": 0.42217766096651677, + "learning_rate": 3.686489542765128e-06, + "loss": 0.5708, + "step": 7807 + }, + { + "epoch": 2.0727465817071553, + "grad_norm": 0.441584051134115, + "learning_rate": 3.686182227460623e-06, + "loss": 0.5584, + "step": 7808 + }, + { + "epoch": 2.073012080180539, + "grad_norm": 0.4252942384765948, + "learning_rate": 3.6858748890226607e-06, + "loss": 0.5413, + "step": 7809 + }, + { + "epoch": 2.0732775786539226, + "grad_norm": 0.42717772899322654, + "learning_rate": 3.6855675274572343e-06, + "loss": 0.5658, + "step": 7810 + }, + { + "epoch": 2.0735430771273067, + "grad_norm": 0.4298944965076522, + "learning_rate": 3.685260142770338e-06, + "loss": 0.5544, + "step": 7811 + }, + { + "epoch": 2.0738085756006903, + "grad_norm": 0.43074005346617233, + "learning_rate": 3.6849527349679674e-06, + "loss": 0.6079, + "step": 7812 + }, + { + "epoch": 2.074074074074074, + "grad_norm": 0.4400285741011014, + "learning_rate": 3.684645304056117e-06, + "loss": 0.5894, + "step": 7813 + }, + { + "epoch": 2.074339572547458, + "grad_norm": 0.4306261837250573, + "learning_rate": 3.6843378500407824e-06, + "loss": 0.5645, + "step": 7814 + }, + { + "epoch": 2.0746050710208417, + "grad_norm": 0.4503920983394631, + "learning_rate": 3.6840303729279604e-06, + "loss": 0.5602, + "step": 7815 + }, + { + "epoch": 2.0748705694942253, + "grad_norm": 0.4197862097696484, + "learning_rate": 3.6837228727236464e-06, + "loss": 0.5854, + "step": 7816 + }, + { + "epoch": 2.075136067967609, + "grad_norm": 0.4202525082448422, + "learning_rate": 3.6834153494338387e-06, + "loss": 0.602, + "step": 7817 + }, + { + "epoch": 2.075401566440993, + "grad_norm": 0.43072344293661036, + "learning_rate": 3.6831078030645346e-06, + "loss": 0.5567, + "step": 7818 + }, + { + "epoch": 2.0756670649143767, + "grad_norm": 0.4122247956530686, + "learning_rate": 3.6828002336217307e-06, + "loss": 0.5537, + "step": 7819 + }, + { + "epoch": 2.0759325633877603, + "grad_norm": 0.4281749161669776, + "learning_rate": 3.6824926411114265e-06, + "loss": 0.566, + "step": 7820 + }, + { + "epoch": 2.0761980618611444, + "grad_norm": 0.4358615646343696, + "learning_rate": 3.682185025539621e-06, + "loss": 0.6005, + "step": 7821 + }, + { + "epoch": 2.076463560334528, + "grad_norm": 0.421684392003355, + "learning_rate": 3.6818773869123127e-06, + "loss": 0.5749, + "step": 7822 + }, + { + "epoch": 2.0767290588079117, + "grad_norm": 0.4264060883500459, + "learning_rate": 3.6815697252355016e-06, + "loss": 0.5729, + "step": 7823 + }, + { + "epoch": 2.076994557281296, + "grad_norm": 0.4322489423194596, + "learning_rate": 3.6812620405151876e-06, + "loss": 0.5565, + "step": 7824 + }, + { + "epoch": 2.0772600557546794, + "grad_norm": 0.4221332849925765, + "learning_rate": 3.6809543327573715e-06, + "loss": 0.5759, + "step": 7825 + }, + { + "epoch": 2.077525554228063, + "grad_norm": 0.4043112791709982, + "learning_rate": 3.6806466019680552e-06, + "loss": 0.5482, + "step": 7826 + }, + { + "epoch": 2.077791052701447, + "grad_norm": 0.44061677408904004, + "learning_rate": 3.6803388481532393e-06, + "loss": 0.5584, + "step": 7827 + }, + { + "epoch": 2.078056551174831, + "grad_norm": 0.4494377632037603, + "learning_rate": 3.6800310713189258e-06, + "loss": 0.5335, + "step": 7828 + }, + { + "epoch": 2.0783220496482144, + "grad_norm": 0.422506496395252, + "learning_rate": 3.679723271471117e-06, + "loss": 0.5445, + "step": 7829 + }, + { + "epoch": 2.0785875481215985, + "grad_norm": 0.42260268368289405, + "learning_rate": 3.6794154486158163e-06, + "loss": 0.5836, + "step": 7830 + }, + { + "epoch": 2.078853046594982, + "grad_norm": 0.4392693482202733, + "learning_rate": 3.6791076027590257e-06, + "loss": 0.617, + "step": 7831 + }, + { + "epoch": 2.079118545068366, + "grad_norm": 0.5520143619580332, + "learning_rate": 3.678799733906751e-06, + "loss": 0.5524, + "step": 7832 + }, + { + "epoch": 2.0793840435417494, + "grad_norm": 0.44238743355848575, + "learning_rate": 3.6784918420649952e-06, + "loss": 0.5948, + "step": 7833 + }, + { + "epoch": 2.0796495420151335, + "grad_norm": 0.4160650541061754, + "learning_rate": 3.6781839272397616e-06, + "loss": 0.5558, + "step": 7834 + }, + { + "epoch": 2.079915040488517, + "grad_norm": 0.4661944410342518, + "learning_rate": 3.677875989437058e-06, + "loss": 0.5924, + "step": 7835 + }, + { + "epoch": 2.080180538961901, + "grad_norm": 0.45399211395202205, + "learning_rate": 3.6775680286628886e-06, + "loss": 0.5928, + "step": 7836 + }, + { + "epoch": 2.080446037435285, + "grad_norm": 0.44154831088542723, + "learning_rate": 3.6772600449232594e-06, + "loss": 0.5499, + "step": 7837 + }, + { + "epoch": 2.0807115359086685, + "grad_norm": 0.43493627876892177, + "learning_rate": 3.676952038224177e-06, + "loss": 0.5467, + "step": 7838 + }, + { + "epoch": 2.080977034382052, + "grad_norm": 0.4406376060046193, + "learning_rate": 3.6766440085716478e-06, + "loss": 0.5637, + "step": 7839 + }, + { + "epoch": 2.0812425328554363, + "grad_norm": 0.44109584523635187, + "learning_rate": 3.6763359559716793e-06, + "loss": 0.5541, + "step": 7840 + }, + { + "epoch": 2.08150803132882, + "grad_norm": 0.4297389256959757, + "learning_rate": 3.676027880430281e-06, + "loss": 0.5512, + "step": 7841 + }, + { + "epoch": 2.0817735298022035, + "grad_norm": 0.42799420454007564, + "learning_rate": 3.675719781953458e-06, + "loss": 0.5652, + "step": 7842 + }, + { + "epoch": 2.0820390282755876, + "grad_norm": 0.4288910378779645, + "learning_rate": 3.6754116605472205e-06, + "loss": 0.5994, + "step": 7843 + }, + { + "epoch": 2.0823045267489713, + "grad_norm": 0.4278300480846039, + "learning_rate": 3.6751035162175784e-06, + "loss": 0.5502, + "step": 7844 + }, + { + "epoch": 2.082570025222355, + "grad_norm": 0.43631333428729263, + "learning_rate": 3.67479534897054e-06, + "loss": 0.5565, + "step": 7845 + }, + { + "epoch": 2.0828355236957385, + "grad_norm": 0.45569801737704285, + "learning_rate": 3.6744871588121173e-06, + "loss": 0.577, + "step": 7846 + }, + { + "epoch": 2.0831010221691226, + "grad_norm": 0.433607166444531, + "learning_rate": 3.674178945748318e-06, + "loss": 0.6072, + "step": 7847 + }, + { + "epoch": 2.0833665206425063, + "grad_norm": 0.42771059784218063, + "learning_rate": 3.673870709785155e-06, + "loss": 0.5176, + "step": 7848 + }, + { + "epoch": 2.08363201911589, + "grad_norm": 0.4327113037110931, + "learning_rate": 3.6735624509286393e-06, + "loss": 0.5737, + "step": 7849 + }, + { + "epoch": 2.083897517589274, + "grad_norm": 0.4236840089192895, + "learning_rate": 3.6732541691847816e-06, + "loss": 0.582, + "step": 7850 + }, + { + "epoch": 2.0841630160626576, + "grad_norm": 0.4187295235442086, + "learning_rate": 3.6729458645595957e-06, + "loss": 0.589, + "step": 7851 + }, + { + "epoch": 2.0844285145360413, + "grad_norm": 0.44070146895733775, + "learning_rate": 3.6726375370590927e-06, + "loss": 0.5637, + "step": 7852 + }, + { + "epoch": 2.0846940130094254, + "grad_norm": 0.4443438621289547, + "learning_rate": 3.6723291866892872e-06, + "loss": 0.5556, + "step": 7853 + }, + { + "epoch": 2.084959511482809, + "grad_norm": 0.4342890464262517, + "learning_rate": 3.6720208134561918e-06, + "loss": 0.565, + "step": 7854 + }, + { + "epoch": 2.0852250099561926, + "grad_norm": 0.4302908075942359, + "learning_rate": 3.671712417365822e-06, + "loss": 0.584, + "step": 7855 + }, + { + "epoch": 2.0854905084295767, + "grad_norm": 0.460500205522762, + "learning_rate": 3.67140399842419e-06, + "loss": 0.5568, + "step": 7856 + }, + { + "epoch": 2.0857560069029604, + "grad_norm": 0.4284636655785662, + "learning_rate": 3.6710955566373125e-06, + "loss": 0.5904, + "step": 7857 + }, + { + "epoch": 2.086021505376344, + "grad_norm": 0.4313426045883189, + "learning_rate": 3.670787092011205e-06, + "loss": 0.5881, + "step": 7858 + }, + { + "epoch": 2.0862870038497277, + "grad_norm": 0.4252411605383489, + "learning_rate": 3.6704786045518815e-06, + "loss": 0.5788, + "step": 7859 + }, + { + "epoch": 2.0865525023231117, + "grad_norm": 0.42749198497027774, + "learning_rate": 3.6701700942653595e-06, + "loss": 0.5627, + "step": 7860 + }, + { + "epoch": 2.0868180007964954, + "grad_norm": 0.43148041896821854, + "learning_rate": 3.669861561157656e-06, + "loss": 0.5496, + "step": 7861 + }, + { + "epoch": 2.087083499269879, + "grad_norm": 0.4143656108805999, + "learning_rate": 3.669553005234787e-06, + "loss": 0.5581, + "step": 7862 + }, + { + "epoch": 2.087348997743263, + "grad_norm": 0.4189094219989511, + "learning_rate": 3.6692444265027716e-06, + "loss": 0.555, + "step": 7863 + }, + { + "epoch": 2.0876144962166467, + "grad_norm": 0.42629701565663236, + "learning_rate": 3.6689358249676276e-06, + "loss": 0.5688, + "step": 7864 + }, + { + "epoch": 2.0878799946900304, + "grad_norm": 0.42472258749475905, + "learning_rate": 3.668627200635372e-06, + "loss": 0.5918, + "step": 7865 + }, + { + "epoch": 2.0881454931634145, + "grad_norm": 0.4435142083748394, + "learning_rate": 3.6683185535120246e-06, + "loss": 0.5646, + "step": 7866 + }, + { + "epoch": 2.088410991636798, + "grad_norm": 0.4251172245310108, + "learning_rate": 3.668009883603606e-06, + "loss": 0.5582, + "step": 7867 + }, + { + "epoch": 2.0886764901101817, + "grad_norm": 0.44418341786848337, + "learning_rate": 3.667701190916134e-06, + "loss": 0.5756, + "step": 7868 + }, + { + "epoch": 2.088941988583566, + "grad_norm": 0.42879137804402984, + "learning_rate": 3.6673924754556293e-06, + "loss": 0.5742, + "step": 7869 + }, + { + "epoch": 2.0892074870569495, + "grad_norm": 0.4295736665423941, + "learning_rate": 3.667083737228114e-06, + "loss": 0.5691, + "step": 7870 + }, + { + "epoch": 2.089472985530333, + "grad_norm": 0.43522413336830995, + "learning_rate": 3.6667749762396075e-06, + "loss": 0.5852, + "step": 7871 + }, + { + "epoch": 2.0897384840037168, + "grad_norm": 0.4298377639649642, + "learning_rate": 3.666466192496133e-06, + "loss": 0.5524, + "step": 7872 + }, + { + "epoch": 2.090003982477101, + "grad_norm": 0.4354748938119789, + "learning_rate": 3.6661573860037112e-06, + "loss": 0.5595, + "step": 7873 + }, + { + "epoch": 2.0902694809504845, + "grad_norm": 0.4353282255972668, + "learning_rate": 3.6658485567683657e-06, + "loss": 0.6068, + "step": 7874 + }, + { + "epoch": 2.090534979423868, + "grad_norm": 0.43772776649650164, + "learning_rate": 3.665539704796119e-06, + "loss": 0.5852, + "step": 7875 + }, + { + "epoch": 2.090800477897252, + "grad_norm": 0.4304313700711266, + "learning_rate": 3.6652308300929935e-06, + "loss": 0.5547, + "step": 7876 + }, + { + "epoch": 2.091065976370636, + "grad_norm": 0.41683603307771433, + "learning_rate": 3.664921932665014e-06, + "loss": 0.5892, + "step": 7877 + }, + { + "epoch": 2.0913314748440195, + "grad_norm": 0.41729247567345373, + "learning_rate": 3.664613012518205e-06, + "loss": 0.5591, + "step": 7878 + }, + { + "epoch": 2.0915969733174036, + "grad_norm": 0.42378015548893694, + "learning_rate": 3.6643040696585898e-06, + "loss": 0.584, + "step": 7879 + }, + { + "epoch": 2.091862471790787, + "grad_norm": 0.4197127999612347, + "learning_rate": 3.663995104092195e-06, + "loss": 0.5585, + "step": 7880 + }, + { + "epoch": 2.092127970264171, + "grad_norm": 0.42554404644342747, + "learning_rate": 3.663686115825047e-06, + "loss": 0.5608, + "step": 7881 + }, + { + "epoch": 2.092393468737555, + "grad_norm": 0.425752726289791, + "learning_rate": 3.663377104863169e-06, + "loss": 0.5621, + "step": 7882 + }, + { + "epoch": 2.0926589672109386, + "grad_norm": 0.42509746453236497, + "learning_rate": 3.663068071212589e-06, + "loss": 0.5937, + "step": 7883 + }, + { + "epoch": 2.092924465684322, + "grad_norm": 0.4357672106439374, + "learning_rate": 3.662759014879335e-06, + "loss": 0.5692, + "step": 7884 + }, + { + "epoch": 2.0931899641577063, + "grad_norm": 0.448253511697435, + "learning_rate": 3.662449935869433e-06, + "loss": 0.5504, + "step": 7885 + }, + { + "epoch": 2.09345546263109, + "grad_norm": 0.43049396137508955, + "learning_rate": 3.662140834188911e-06, + "loss": 0.56, + "step": 7886 + }, + { + "epoch": 2.0937209611044736, + "grad_norm": 0.431989678115792, + "learning_rate": 3.6618317098437972e-06, + "loss": 0.6172, + "step": 7887 + }, + { + "epoch": 2.0939864595778572, + "grad_norm": 0.4206163013659491, + "learning_rate": 3.6615225628401202e-06, + "loss": 0.5342, + "step": 7888 + }, + { + "epoch": 2.0942519580512413, + "grad_norm": 0.4249195268651649, + "learning_rate": 3.661213393183909e-06, + "loss": 0.572, + "step": 7889 + }, + { + "epoch": 2.094517456524625, + "grad_norm": 0.43142726229964284, + "learning_rate": 3.6609042008811946e-06, + "loss": 0.6235, + "step": 7890 + }, + { + "epoch": 2.0947829549980086, + "grad_norm": 0.4209923733851621, + "learning_rate": 3.660594985938005e-06, + "loss": 0.5984, + "step": 7891 + }, + { + "epoch": 2.0950484534713927, + "grad_norm": 0.4089955513512835, + "learning_rate": 3.6602857483603713e-06, + "loss": 0.5504, + "step": 7892 + }, + { + "epoch": 2.0953139519447763, + "grad_norm": 0.4153052292989263, + "learning_rate": 3.659976488154326e-06, + "loss": 0.5446, + "step": 7893 + }, + { + "epoch": 2.09557945041816, + "grad_norm": 0.42494402334825065, + "learning_rate": 3.659667205325898e-06, + "loss": 0.6031, + "step": 7894 + }, + { + "epoch": 2.095844948891544, + "grad_norm": 0.406439501645814, + "learning_rate": 3.659357899881121e-06, + "loss": 0.527, + "step": 7895 + }, + { + "epoch": 2.0961104473649277, + "grad_norm": 0.42490501779063927, + "learning_rate": 3.6590485718260254e-06, + "loss": 0.5875, + "step": 7896 + }, + { + "epoch": 2.0963759458383113, + "grad_norm": 0.4256293857511059, + "learning_rate": 3.658739221166645e-06, + "loss": 0.5102, + "step": 7897 + }, + { + "epoch": 2.0966414443116954, + "grad_norm": 0.4174293269870777, + "learning_rate": 3.658429847909013e-06, + "loss": 0.5687, + "step": 7898 + }, + { + "epoch": 2.096906942785079, + "grad_norm": 0.42167866535279896, + "learning_rate": 3.6581204520591625e-06, + "loss": 0.5511, + "step": 7899 + }, + { + "epoch": 2.0971724412584627, + "grad_norm": 0.4173528229810813, + "learning_rate": 3.657811033623127e-06, + "loss": 0.5484, + "step": 7900 + }, + { + "epoch": 2.0974379397318463, + "grad_norm": 0.4248313656996904, + "learning_rate": 3.6575015926069423e-06, + "loss": 0.5458, + "step": 7901 + }, + { + "epoch": 2.0977034382052304, + "grad_norm": 0.4293679515800337, + "learning_rate": 3.6571921290166425e-06, + "loss": 0.5815, + "step": 7902 + }, + { + "epoch": 2.097968936678614, + "grad_norm": 0.4355740075608646, + "learning_rate": 3.6568826428582625e-06, + "loss": 0.5889, + "step": 7903 + }, + { + "epoch": 2.0982344351519977, + "grad_norm": 0.4243226525417147, + "learning_rate": 3.656573134137839e-06, + "loss": 0.5705, + "step": 7904 + }, + { + "epoch": 2.098499933625382, + "grad_norm": 0.4317945401813962, + "learning_rate": 3.6562636028614075e-06, + "loss": 0.5721, + "step": 7905 + }, + { + "epoch": 2.0987654320987654, + "grad_norm": 0.42922172303927425, + "learning_rate": 3.6559540490350044e-06, + "loss": 0.559, + "step": 7906 + }, + { + "epoch": 2.099030930572149, + "grad_norm": 0.4315668031094222, + "learning_rate": 3.655644472664667e-06, + "loss": 0.5646, + "step": 7907 + }, + { + "epoch": 2.099296429045533, + "grad_norm": 0.4244453093877998, + "learning_rate": 3.6553348737564328e-06, + "loss": 0.5725, + "step": 7908 + }, + { + "epoch": 2.099561927518917, + "grad_norm": 0.41901433325931725, + "learning_rate": 3.65502525231634e-06, + "loss": 0.5548, + "step": 7909 + }, + { + "epoch": 2.0998274259923004, + "grad_norm": 0.4280858028951082, + "learning_rate": 3.654715608350427e-06, + "loss": 0.5952, + "step": 7910 + }, + { + "epoch": 2.1000929244656845, + "grad_norm": 0.41480282933417045, + "learning_rate": 3.654405941864732e-06, + "loss": 0.5894, + "step": 7911 + }, + { + "epoch": 2.100358422939068, + "grad_norm": 0.4288332772602598, + "learning_rate": 3.6540962528652952e-06, + "loss": 0.5828, + "step": 7912 + }, + { + "epoch": 2.100623921412452, + "grad_norm": 0.4151636991315554, + "learning_rate": 3.653786541358156e-06, + "loss": 0.5817, + "step": 7913 + }, + { + "epoch": 2.100889419885836, + "grad_norm": 0.43319164793869924, + "learning_rate": 3.6534768073493533e-06, + "loss": 0.5824, + "step": 7914 + }, + { + "epoch": 2.1011549183592195, + "grad_norm": 0.4258782225111984, + "learning_rate": 3.653167050844929e-06, + "loss": 0.5446, + "step": 7915 + }, + { + "epoch": 2.101420416832603, + "grad_norm": 0.4300869467993502, + "learning_rate": 3.6528572718509237e-06, + "loss": 0.5743, + "step": 7916 + }, + { + "epoch": 2.101685915305987, + "grad_norm": 0.45577949026192444, + "learning_rate": 3.6525474703733787e-06, + "loss": 0.575, + "step": 7917 + }, + { + "epoch": 2.101951413779371, + "grad_norm": 0.43087875667575537, + "learning_rate": 3.6522376464183372e-06, + "loss": 0.5694, + "step": 7918 + }, + { + "epoch": 2.1022169122527545, + "grad_norm": 0.41371958916682094, + "learning_rate": 3.6519277999918394e-06, + "loss": 0.553, + "step": 7919 + }, + { + "epoch": 2.102482410726138, + "grad_norm": 0.4267650460591571, + "learning_rate": 3.6516179310999294e-06, + "loss": 0.5441, + "step": 7920 + }, + { + "epoch": 2.1027479091995223, + "grad_norm": 0.4278498949530199, + "learning_rate": 3.6513080397486506e-06, + "loss": 0.5389, + "step": 7921 + }, + { + "epoch": 2.103013407672906, + "grad_norm": 0.4210942039430507, + "learning_rate": 3.650998125944045e-06, + "loss": 0.535, + "step": 7922 + }, + { + "epoch": 2.1032789061462895, + "grad_norm": 0.42434874349698026, + "learning_rate": 3.6506881896921588e-06, + "loss": 0.5483, + "step": 7923 + }, + { + "epoch": 2.1035444046196736, + "grad_norm": 0.4087339471219136, + "learning_rate": 3.6503782309990354e-06, + "loss": 0.5322, + "step": 7924 + }, + { + "epoch": 2.1038099030930573, + "grad_norm": 0.422093877810464, + "learning_rate": 3.6500682498707197e-06, + "loss": 0.5711, + "step": 7925 + }, + { + "epoch": 2.104075401566441, + "grad_norm": 0.4198895471968135, + "learning_rate": 3.649758246313257e-06, + "loss": 0.5393, + "step": 7926 + }, + { + "epoch": 2.1043409000398245, + "grad_norm": 0.41453907000977114, + "learning_rate": 3.6494482203326944e-06, + "loss": 0.5553, + "step": 7927 + }, + { + "epoch": 2.1046063985132086, + "grad_norm": 0.4178900416487073, + "learning_rate": 3.649138171935076e-06, + "loss": 0.568, + "step": 7928 + }, + { + "epoch": 2.1048718969865923, + "grad_norm": 0.415799286109944, + "learning_rate": 3.6488281011264496e-06, + "loss": 0.5657, + "step": 7929 + }, + { + "epoch": 2.105137395459976, + "grad_norm": 0.42752419460187796, + "learning_rate": 3.6485180079128636e-06, + "loss": 0.5697, + "step": 7930 + }, + { + "epoch": 2.10540289393336, + "grad_norm": 0.42708227175139585, + "learning_rate": 3.6482078923003633e-06, + "loss": 0.549, + "step": 7931 + }, + { + "epoch": 2.1056683924067436, + "grad_norm": 0.44545415765637775, + "learning_rate": 3.6478977542949982e-06, + "loss": 0.544, + "step": 7932 + }, + { + "epoch": 2.1059338908801273, + "grad_norm": 0.431567309040952, + "learning_rate": 3.6475875939028166e-06, + "loss": 0.5629, + "step": 7933 + }, + { + "epoch": 2.1061993893535114, + "grad_norm": 0.43517500341419263, + "learning_rate": 3.6472774111298668e-06, + "loss": 0.5719, + "step": 7934 + }, + { + "epoch": 2.106464887826895, + "grad_norm": 0.43137560609523234, + "learning_rate": 3.646967205982198e-06, + "loss": 0.5716, + "step": 7935 + }, + { + "epoch": 2.1067303863002786, + "grad_norm": 0.4190466606296365, + "learning_rate": 3.646656978465861e-06, + "loss": 0.5606, + "step": 7936 + }, + { + "epoch": 2.1069958847736627, + "grad_norm": 0.43430164920882863, + "learning_rate": 3.646346728586906e-06, + "loss": 0.5418, + "step": 7937 + }, + { + "epoch": 2.1072613832470464, + "grad_norm": 0.4279186469353687, + "learning_rate": 3.6460364563513823e-06, + "loss": 0.5984, + "step": 7938 + }, + { + "epoch": 2.10752688172043, + "grad_norm": 0.4231647221027598, + "learning_rate": 3.645726161765342e-06, + "loss": 0.5735, + "step": 7939 + }, + { + "epoch": 2.107792380193814, + "grad_norm": 0.41170892065694004, + "learning_rate": 3.6454158448348363e-06, + "loss": 0.5453, + "step": 7940 + }, + { + "epoch": 2.1080578786671977, + "grad_norm": 0.44205172045602886, + "learning_rate": 3.6451055055659167e-06, + "loss": 0.5985, + "step": 7941 + }, + { + "epoch": 2.1083233771405814, + "grad_norm": 0.4211496955186089, + "learning_rate": 3.6447951439646368e-06, + "loss": 0.5833, + "step": 7942 + }, + { + "epoch": 2.108588875613965, + "grad_norm": 0.4298877651793863, + "learning_rate": 3.644484760037048e-06, + "loss": 0.5569, + "step": 7943 + }, + { + "epoch": 2.108854374087349, + "grad_norm": 0.43779770166616805, + "learning_rate": 3.6441743537892045e-06, + "loss": 0.6026, + "step": 7944 + }, + { + "epoch": 2.1091198725607327, + "grad_norm": 0.40915734122466724, + "learning_rate": 3.6438639252271597e-06, + "loss": 0.5454, + "step": 7945 + }, + { + "epoch": 2.1093853710341164, + "grad_norm": 0.42734547561812297, + "learning_rate": 3.6435534743569674e-06, + "loss": 0.5699, + "step": 7946 + }, + { + "epoch": 2.1096508695075005, + "grad_norm": 0.43949928742694055, + "learning_rate": 3.643243001184683e-06, + "loss": 0.5908, + "step": 7947 + }, + { + "epoch": 2.109916367980884, + "grad_norm": 0.42244903702406783, + "learning_rate": 3.6429325057163607e-06, + "loss": 0.5814, + "step": 7948 + }, + { + "epoch": 2.1101818664542678, + "grad_norm": 0.42671917355588357, + "learning_rate": 3.642621987958056e-06, + "loss": 0.5301, + "step": 7949 + }, + { + "epoch": 2.110447364927652, + "grad_norm": 0.4132984600526833, + "learning_rate": 3.642311447915825e-06, + "loss": 0.535, + "step": 7950 + }, + { + "epoch": 2.1107128634010355, + "grad_norm": 0.4375408792660125, + "learning_rate": 3.642000885595724e-06, + "loss": 0.5494, + "step": 7951 + }, + { + "epoch": 2.110978361874419, + "grad_norm": 0.42541073135700963, + "learning_rate": 3.6416903010038096e-06, + "loss": 0.5951, + "step": 7952 + }, + { + "epoch": 2.111243860347803, + "grad_norm": 0.4345740682350128, + "learning_rate": 3.641379694146139e-06, + "loss": 0.5559, + "step": 7953 + }, + { + "epoch": 2.111509358821187, + "grad_norm": 0.4186693681895498, + "learning_rate": 3.6410690650287696e-06, + "loss": 0.5488, + "step": 7954 + }, + { + "epoch": 2.1117748572945705, + "grad_norm": 0.41633769862776476, + "learning_rate": 3.640758413657759e-06, + "loss": 0.5742, + "step": 7955 + }, + { + "epoch": 2.112040355767954, + "grad_norm": 0.4243956064162778, + "learning_rate": 3.6404477400391675e-06, + "loss": 0.5534, + "step": 7956 + }, + { + "epoch": 2.112305854241338, + "grad_norm": 0.4266805913086236, + "learning_rate": 3.640137044179052e-06, + "loss": 0.5407, + "step": 7957 + }, + { + "epoch": 2.112571352714722, + "grad_norm": 0.4330549608013007, + "learning_rate": 3.6398263260834733e-06, + "loss": 0.5817, + "step": 7958 + }, + { + "epoch": 2.1128368511881055, + "grad_norm": 0.4100806467189942, + "learning_rate": 3.6395155857584898e-06, + "loss": 0.5624, + "step": 7959 + }, + { + "epoch": 2.1131023496614896, + "grad_norm": 0.43435924398440234, + "learning_rate": 3.639204823210163e-06, + "loss": 0.581, + "step": 7960 + }, + { + "epoch": 2.113367848134873, + "grad_norm": 0.4329545894381265, + "learning_rate": 3.638894038444552e-06, + "loss": 0.5702, + "step": 7961 + }, + { + "epoch": 2.113633346608257, + "grad_norm": 0.45340530202681095, + "learning_rate": 3.6385832314677193e-06, + "loss": 0.5568, + "step": 7962 + }, + { + "epoch": 2.113898845081641, + "grad_norm": 0.42964666595175455, + "learning_rate": 3.6382724022857254e-06, + "loss": 0.6124, + "step": 7963 + }, + { + "epoch": 2.1141643435550246, + "grad_norm": 0.419428067900188, + "learning_rate": 3.6379615509046327e-06, + "loss": 0.5548, + "step": 7964 + }, + { + "epoch": 2.1144298420284082, + "grad_norm": 0.4516037056068796, + "learning_rate": 3.6376506773305047e-06, + "loss": 0.5831, + "step": 7965 + }, + { + "epoch": 2.1146953405017923, + "grad_norm": 0.427759583741649, + "learning_rate": 3.6373397815694023e-06, + "loss": 0.561, + "step": 7966 + }, + { + "epoch": 2.114960838975176, + "grad_norm": 0.42126228929129966, + "learning_rate": 3.63702886362739e-06, + "loss": 0.5392, + "step": 7967 + }, + { + "epoch": 2.1152263374485596, + "grad_norm": 0.42500561873963555, + "learning_rate": 3.63671792351053e-06, + "loss": 0.5771, + "step": 7968 + }, + { + "epoch": 2.1154918359219437, + "grad_norm": 0.4279648708519892, + "learning_rate": 3.6364069612248874e-06, + "loss": 0.5985, + "step": 7969 + }, + { + "epoch": 2.1157573343953273, + "grad_norm": 0.4258078527664367, + "learning_rate": 3.636095976776527e-06, + "loss": 0.5845, + "step": 7970 + }, + { + "epoch": 2.116022832868711, + "grad_norm": 0.4229238300489938, + "learning_rate": 3.635784970171513e-06, + "loss": 0.5392, + "step": 7971 + }, + { + "epoch": 2.1162883313420946, + "grad_norm": 0.42217674975822295, + "learning_rate": 3.6354739414159114e-06, + "loss": 0.5678, + "step": 7972 + }, + { + "epoch": 2.1165538298154787, + "grad_norm": 0.4244005319842895, + "learning_rate": 3.6351628905157886e-06, + "loss": 0.5554, + "step": 7973 + }, + { + "epoch": 2.1168193282888623, + "grad_norm": 0.4328728459958939, + "learning_rate": 3.634851817477209e-06, + "loss": 0.5587, + "step": 7974 + }, + { + "epoch": 2.117084826762246, + "grad_norm": 0.42755433048338387, + "learning_rate": 3.634540722306241e-06, + "loss": 0.5676, + "step": 7975 + }, + { + "epoch": 2.11735032523563, + "grad_norm": 0.41638394714778576, + "learning_rate": 3.634229605008951e-06, + "loss": 0.5536, + "step": 7976 + }, + { + "epoch": 2.1176158237090137, + "grad_norm": 0.4244115106752363, + "learning_rate": 3.633918465591407e-06, + "loss": 0.5454, + "step": 7977 + }, + { + "epoch": 2.1178813221823973, + "grad_norm": 0.43650908093444724, + "learning_rate": 3.6336073040596752e-06, + "loss": 0.5715, + "step": 7978 + }, + { + "epoch": 2.1181468206557814, + "grad_norm": 0.4223321960262128, + "learning_rate": 3.633296120419827e-06, + "loss": 0.5519, + "step": 7979 + }, + { + "epoch": 2.118412319129165, + "grad_norm": 0.43854703367933506, + "learning_rate": 3.632984914677929e-06, + "loss": 0.5834, + "step": 7980 + }, + { + "epoch": 2.1186778176025487, + "grad_norm": 0.4151364291354364, + "learning_rate": 3.632673686840051e-06, + "loss": 0.5493, + "step": 7981 + }, + { + "epoch": 2.1189433160759323, + "grad_norm": 0.441090733203994, + "learning_rate": 3.632362436912263e-06, + "loss": 0.5394, + "step": 7982 + }, + { + "epoch": 2.1192088145493164, + "grad_norm": 0.4536107233261637, + "learning_rate": 3.632051164900635e-06, + "loss": 0.5274, + "step": 7983 + }, + { + "epoch": 2.1194743130227, + "grad_norm": 0.4200671092614244, + "learning_rate": 3.6317398708112372e-06, + "loss": 0.5733, + "step": 7984 + }, + { + "epoch": 2.1197398114960837, + "grad_norm": 0.4277111804410303, + "learning_rate": 3.6314285546501415e-06, + "loss": 0.569, + "step": 7985 + }, + { + "epoch": 2.120005309969468, + "grad_norm": 0.4328333608833753, + "learning_rate": 3.6311172164234187e-06, + "loss": 0.5748, + "step": 7986 + }, + { + "epoch": 2.1202708084428514, + "grad_norm": 0.4243852529436048, + "learning_rate": 3.6308058561371407e-06, + "loss": 0.5801, + "step": 7987 + }, + { + "epoch": 2.120536306916235, + "grad_norm": 0.4265879943858205, + "learning_rate": 3.6304944737973794e-06, + "loss": 0.571, + "step": 7988 + }, + { + "epoch": 2.120801805389619, + "grad_norm": 0.4181442163237821, + "learning_rate": 3.6301830694102086e-06, + "loss": 0.5641, + "step": 7989 + }, + { + "epoch": 2.121067303863003, + "grad_norm": 0.42227138240633094, + "learning_rate": 3.6298716429817006e-06, + "loss": 0.5782, + "step": 7990 + }, + { + "epoch": 2.1213328023363864, + "grad_norm": 0.4260358429920136, + "learning_rate": 3.6295601945179282e-06, + "loss": 0.5711, + "step": 7991 + }, + { + "epoch": 2.1215983008097705, + "grad_norm": 0.43433164184629003, + "learning_rate": 3.6292487240249676e-06, + "loss": 0.5369, + "step": 7992 + }, + { + "epoch": 2.121863799283154, + "grad_norm": 0.4268611538270948, + "learning_rate": 3.6289372315088916e-06, + "loss": 0.5665, + "step": 7993 + }, + { + "epoch": 2.122129297756538, + "grad_norm": 0.4168088351296817, + "learning_rate": 3.6286257169757756e-06, + "loss": 0.5574, + "step": 7994 + }, + { + "epoch": 2.122394796229922, + "grad_norm": 0.4246541435935857, + "learning_rate": 3.628314180431694e-06, + "loss": 0.5463, + "step": 7995 + }, + { + "epoch": 2.1226602947033055, + "grad_norm": 0.4319435219741565, + "learning_rate": 3.6280026218827247e-06, + "loss": 0.5888, + "step": 7996 + }, + { + "epoch": 2.122925793176689, + "grad_norm": 0.43502368816550957, + "learning_rate": 3.627691041334941e-06, + "loss": 0.5462, + "step": 7997 + }, + { + "epoch": 2.123191291650073, + "grad_norm": 0.43009773525111383, + "learning_rate": 3.6273794387944216e-06, + "loss": 0.5422, + "step": 7998 + }, + { + "epoch": 2.123456790123457, + "grad_norm": 0.441744401430134, + "learning_rate": 3.6270678142672426e-06, + "loss": 0.6043, + "step": 7999 + }, + { + "epoch": 2.1237222885968405, + "grad_norm": 0.4414479905702202, + "learning_rate": 3.626756167759482e-06, + "loss": 0.5723, + "step": 8000 + }, + { + "epoch": 2.123987787070224, + "grad_norm": 0.43047843867502195, + "learning_rate": 3.626444499277217e-06, + "loss": 0.5541, + "step": 8001 + }, + { + "epoch": 2.1242532855436083, + "grad_norm": 0.45411837737981864, + "learning_rate": 3.626132808826527e-06, + "loss": 0.5222, + "step": 8002 + }, + { + "epoch": 2.124518784016992, + "grad_norm": 0.43146575662073927, + "learning_rate": 3.625821096413489e-06, + "loss": 0.5749, + "step": 8003 + }, + { + "epoch": 2.1247842824903755, + "grad_norm": 0.4304879493679724, + "learning_rate": 3.6255093620441835e-06, + "loss": 0.5564, + "step": 8004 + }, + { + "epoch": 2.1250497809637596, + "grad_norm": 0.4293964986404451, + "learning_rate": 3.62519760572469e-06, + "loss": 0.5852, + "step": 8005 + }, + { + "epoch": 2.1253152794371433, + "grad_norm": 0.4348393127234673, + "learning_rate": 3.624885827461088e-06, + "loss": 0.5763, + "step": 8006 + }, + { + "epoch": 2.125580777910527, + "grad_norm": 0.42904264331724723, + "learning_rate": 3.6245740272594584e-06, + "loss": 0.5952, + "step": 8007 + }, + { + "epoch": 2.125846276383911, + "grad_norm": 0.42892641580927876, + "learning_rate": 3.624262205125881e-06, + "loss": 0.5453, + "step": 8008 + }, + { + "epoch": 2.1261117748572946, + "grad_norm": 0.4337721649196399, + "learning_rate": 3.623950361066439e-06, + "loss": 0.5639, + "step": 8009 + }, + { + "epoch": 2.1263772733306783, + "grad_norm": 0.4379513763977376, + "learning_rate": 3.623638495087212e-06, + "loss": 0.5963, + "step": 8010 + }, + { + "epoch": 2.126642771804062, + "grad_norm": 0.4283183752062311, + "learning_rate": 3.6233266071942845e-06, + "loss": 0.5912, + "step": 8011 + }, + { + "epoch": 2.126908270277446, + "grad_norm": 0.42475082307663986, + "learning_rate": 3.623014697393737e-06, + "loss": 0.5317, + "step": 8012 + }, + { + "epoch": 2.1271737687508296, + "grad_norm": 0.4312996694785182, + "learning_rate": 3.622702765691653e-06, + "loss": 0.5558, + "step": 8013 + }, + { + "epoch": 2.1274392672242133, + "grad_norm": 0.4429595843632247, + "learning_rate": 3.622390812094117e-06, + "loss": 0.5431, + "step": 8014 + }, + { + "epoch": 2.1277047656975974, + "grad_norm": 0.418639786992392, + "learning_rate": 3.622078836607212e-06, + "loss": 0.5398, + "step": 8015 + }, + { + "epoch": 2.127970264170981, + "grad_norm": 0.43397650063216997, + "learning_rate": 3.621766839237022e-06, + "loss": 0.568, + "step": 8016 + }, + { + "epoch": 2.1282357626443646, + "grad_norm": 0.4283674590653182, + "learning_rate": 3.6214548199896315e-06, + "loss": 0.5535, + "step": 8017 + }, + { + "epoch": 2.1285012611177487, + "grad_norm": 0.4208320807455846, + "learning_rate": 3.621142778871127e-06, + "loss": 0.5756, + "step": 8018 + }, + { + "epoch": 2.1287667595911324, + "grad_norm": 0.4120720051953142, + "learning_rate": 3.6208307158875934e-06, + "loss": 0.5948, + "step": 8019 + }, + { + "epoch": 2.129032258064516, + "grad_norm": 0.44451895183248785, + "learning_rate": 3.620518631045116e-06, + "loss": 0.5772, + "step": 8020 + }, + { + "epoch": 2.1292977565379, + "grad_norm": 0.4189982601159866, + "learning_rate": 3.620206524349782e-06, + "loss": 0.5257, + "step": 8021 + }, + { + "epoch": 2.1295632550112837, + "grad_norm": 0.4153254071728542, + "learning_rate": 3.619894395807679e-06, + "loss": 0.5639, + "step": 8022 + }, + { + "epoch": 2.1298287534846674, + "grad_norm": 0.41547191334593847, + "learning_rate": 3.619582245424892e-06, + "loss": 0.5525, + "step": 8023 + }, + { + "epoch": 2.1300942519580515, + "grad_norm": 0.42484213287825123, + "learning_rate": 3.6192700732075103e-06, + "loss": 0.5077, + "step": 8024 + }, + { + "epoch": 2.130359750431435, + "grad_norm": 0.43510874529626953, + "learning_rate": 3.618957879161622e-06, + "loss": 0.5738, + "step": 8025 + }, + { + "epoch": 2.1306252489048187, + "grad_norm": 0.4335061024685529, + "learning_rate": 3.618645663293315e-06, + "loss": 0.6059, + "step": 8026 + }, + { + "epoch": 2.1308907473782024, + "grad_norm": 0.4303836677947689, + "learning_rate": 3.618333425608679e-06, + "loss": 0.6138, + "step": 8027 + }, + { + "epoch": 2.1311562458515865, + "grad_norm": 0.42186408094798866, + "learning_rate": 3.6180211661138034e-06, + "loss": 0.5721, + "step": 8028 + }, + { + "epoch": 2.13142174432497, + "grad_norm": 0.43452533559782536, + "learning_rate": 3.617708884814777e-06, + "loss": 0.5752, + "step": 8029 + }, + { + "epoch": 2.1316872427983538, + "grad_norm": 0.4411459749259788, + "learning_rate": 3.6173965817176902e-06, + "loss": 0.5692, + "step": 8030 + }, + { + "epoch": 2.131952741271738, + "grad_norm": 0.42028956939591017, + "learning_rate": 3.617084256828636e-06, + "loss": 0.5617, + "step": 8031 + }, + { + "epoch": 2.1322182397451215, + "grad_norm": 0.427974439534533, + "learning_rate": 3.6167719101537024e-06, + "loss": 0.5914, + "step": 8032 + }, + { + "epoch": 2.132483738218505, + "grad_norm": 0.4226976099060145, + "learning_rate": 3.6164595416989823e-06, + "loss": 0.5614, + "step": 8033 + }, + { + "epoch": 2.132749236691889, + "grad_norm": 0.4340575600979119, + "learning_rate": 3.6161471514705683e-06, + "loss": 0.5873, + "step": 8034 + }, + { + "epoch": 2.133014735165273, + "grad_norm": 0.44571354721405104, + "learning_rate": 3.6158347394745507e-06, + "loss": 0.5968, + "step": 8035 + }, + { + "epoch": 2.1332802336386565, + "grad_norm": 0.4340542730915543, + "learning_rate": 3.6155223057170247e-06, + "loss": 0.5347, + "step": 8036 + }, + { + "epoch": 2.13354573211204, + "grad_norm": 0.42381527257855994, + "learning_rate": 3.615209850204082e-06, + "loss": 0.5525, + "step": 8037 + }, + { + "epoch": 2.133811230585424, + "grad_norm": 0.41780539211667134, + "learning_rate": 3.614897372941817e-06, + "loss": 0.5564, + "step": 8038 + }, + { + "epoch": 2.134076729058808, + "grad_norm": 0.42993891244135257, + "learning_rate": 3.614584873936323e-06, + "loss": 0.5646, + "step": 8039 + }, + { + "epoch": 2.1343422275321915, + "grad_norm": 0.43428479653921503, + "learning_rate": 3.614272353193696e-06, + "loss": 0.5477, + "step": 8040 + }, + { + "epoch": 2.1346077260055756, + "grad_norm": 0.42400207189222794, + "learning_rate": 3.613959810720029e-06, + "loss": 0.5746, + "step": 8041 + }, + { + "epoch": 2.134873224478959, + "grad_norm": 0.42962978979385796, + "learning_rate": 3.613647246521419e-06, + "loss": 0.5555, + "step": 8042 + }, + { + "epoch": 2.135138722952343, + "grad_norm": 0.4480349561269434, + "learning_rate": 3.6133346606039605e-06, + "loss": 0.5532, + "step": 8043 + }, + { + "epoch": 2.135404221425727, + "grad_norm": 0.42689626530334523, + "learning_rate": 3.61302205297375e-06, + "loss": 0.5883, + "step": 8044 + }, + { + "epoch": 2.1356697198991106, + "grad_norm": 0.41723989131049216, + "learning_rate": 3.6127094236368842e-06, + "loss": 0.4984, + "step": 8045 + }, + { + "epoch": 2.1359352183724942, + "grad_norm": 0.430163185314334, + "learning_rate": 3.612396772599461e-06, + "loss": 0.5698, + "step": 8046 + }, + { + "epoch": 2.1362007168458783, + "grad_norm": 0.4322140002268539, + "learning_rate": 3.6120840998675766e-06, + "loss": 0.5881, + "step": 8047 + }, + { + "epoch": 2.136466215319262, + "grad_norm": 0.42714408247846825, + "learning_rate": 3.6117714054473298e-06, + "loss": 0.5855, + "step": 8048 + }, + { + "epoch": 2.1367317137926456, + "grad_norm": 0.416319829667135, + "learning_rate": 3.611458689344818e-06, + "loss": 0.5893, + "step": 8049 + }, + { + "epoch": 2.1369972122660297, + "grad_norm": 0.4357965771127121, + "learning_rate": 3.6111459515661407e-06, + "loss": 0.602, + "step": 8050 + }, + { + "epoch": 2.1372627107394133, + "grad_norm": 0.424766280488201, + "learning_rate": 3.6108331921173966e-06, + "loss": 0.5722, + "step": 8051 + }, + { + "epoch": 2.137528209212797, + "grad_norm": 0.42527505481123656, + "learning_rate": 3.610520411004686e-06, + "loss": 0.5733, + "step": 8052 + }, + { + "epoch": 2.1377937076861806, + "grad_norm": 0.4208184276162359, + "learning_rate": 3.6102076082341075e-06, + "loss": 0.5832, + "step": 8053 + }, + { + "epoch": 2.1380592061595647, + "grad_norm": 0.4531490526184039, + "learning_rate": 3.609894783811763e-06, + "loss": 0.5647, + "step": 8054 + }, + { + "epoch": 2.1383247046329483, + "grad_norm": 0.43306972875768146, + "learning_rate": 3.6095819377437523e-06, + "loss": 0.6015, + "step": 8055 + }, + { + "epoch": 2.138590203106332, + "grad_norm": 0.4248125265115133, + "learning_rate": 3.6092690700361778e-06, + "loss": 0.556, + "step": 8056 + }, + { + "epoch": 2.138855701579716, + "grad_norm": 0.4370564627532615, + "learning_rate": 3.60895618069514e-06, + "loss": 0.5919, + "step": 8057 + }, + { + "epoch": 2.1391212000530997, + "grad_norm": 0.41607888538439014, + "learning_rate": 3.6086432697267414e-06, + "loss": 0.5885, + "step": 8058 + }, + { + "epoch": 2.1393866985264833, + "grad_norm": 0.42706851500603765, + "learning_rate": 3.6083303371370847e-06, + "loss": 0.5934, + "step": 8059 + }, + { + "epoch": 2.1396521969998674, + "grad_norm": 0.44096229557413885, + "learning_rate": 3.6080173829322734e-06, + "loss": 0.5688, + "step": 8060 + }, + { + "epoch": 2.139917695473251, + "grad_norm": 0.4363152436603377, + "learning_rate": 3.60770440711841e-06, + "loss": 0.5972, + "step": 8061 + }, + { + "epoch": 2.1401831939466347, + "grad_norm": 0.4277806010825455, + "learning_rate": 3.607391409701598e-06, + "loss": 0.5646, + "step": 8062 + }, + { + "epoch": 2.140448692420019, + "grad_norm": 0.408768985475413, + "learning_rate": 3.607078390687942e-06, + "loss": 0.5734, + "step": 8063 + }, + { + "epoch": 2.1407141908934024, + "grad_norm": 0.4367264131501375, + "learning_rate": 3.606765350083547e-06, + "loss": 0.5662, + "step": 8064 + }, + { + "epoch": 2.140979689366786, + "grad_norm": 0.4152403541160022, + "learning_rate": 3.6064522878945184e-06, + "loss": 0.5351, + "step": 8065 + }, + { + "epoch": 2.1412451878401697, + "grad_norm": 0.42026768087018546, + "learning_rate": 3.6061392041269605e-06, + "loss": 0.5614, + "step": 8066 + }, + { + "epoch": 2.141510686313554, + "grad_norm": 0.43103548155596705, + "learning_rate": 3.60582609878698e-06, + "loss": 0.5663, + "step": 8067 + }, + { + "epoch": 2.1417761847869374, + "grad_norm": 0.41058773194358855, + "learning_rate": 3.6055129718806836e-06, + "loss": 0.5411, + "step": 8068 + }, + { + "epoch": 2.142041683260321, + "grad_norm": 0.4326831638926904, + "learning_rate": 3.6051998234141768e-06, + "loss": 0.6307, + "step": 8069 + }, + { + "epoch": 2.142307181733705, + "grad_norm": 0.42999013981109785, + "learning_rate": 3.6048866533935673e-06, + "loss": 0.5793, + "step": 8070 + }, + { + "epoch": 2.142572680207089, + "grad_norm": 0.44126473351861395, + "learning_rate": 3.604573461824964e-06, + "loss": 0.552, + "step": 8071 + }, + { + "epoch": 2.1428381786804724, + "grad_norm": 0.4611303067588401, + "learning_rate": 3.6042602487144716e-06, + "loss": 0.5756, + "step": 8072 + }, + { + "epoch": 2.1431036771538565, + "grad_norm": 0.44271024251920693, + "learning_rate": 3.603947014068202e-06, + "loss": 0.556, + "step": 8073 + }, + { + "epoch": 2.14336917562724, + "grad_norm": 0.43259245576915406, + "learning_rate": 3.6036337578922633e-06, + "loss": 0.5881, + "step": 8074 + }, + { + "epoch": 2.143634674100624, + "grad_norm": 0.4319794228696632, + "learning_rate": 3.6033204801927636e-06, + "loss": 0.5588, + "step": 8075 + }, + { + "epoch": 2.143900172574008, + "grad_norm": 0.4323547684393841, + "learning_rate": 3.6030071809758127e-06, + "loss": 0.5472, + "step": 8076 + }, + { + "epoch": 2.1441656710473915, + "grad_norm": 0.42154147532371194, + "learning_rate": 3.6026938602475212e-06, + "loss": 0.5672, + "step": 8077 + }, + { + "epoch": 2.144431169520775, + "grad_norm": 0.43110878047796575, + "learning_rate": 3.6023805180140005e-06, + "loss": 0.5643, + "step": 8078 + }, + { + "epoch": 2.1446966679941593, + "grad_norm": 0.4430635262160183, + "learning_rate": 3.6020671542813596e-06, + "loss": 0.5922, + "step": 8079 + }, + { + "epoch": 2.144962166467543, + "grad_norm": 0.42957359014746876, + "learning_rate": 3.6017537690557114e-06, + "loss": 0.5824, + "step": 8080 + }, + { + "epoch": 2.1452276649409265, + "grad_norm": 0.42740105360366026, + "learning_rate": 3.601440362343167e-06, + "loss": 0.5611, + "step": 8081 + }, + { + "epoch": 2.14549316341431, + "grad_norm": 0.46092266723123154, + "learning_rate": 3.601126934149838e-06, + "loss": 0.5469, + "step": 8082 + }, + { + "epoch": 2.1457586618876943, + "grad_norm": 0.42044305592506603, + "learning_rate": 3.6008134844818386e-06, + "loss": 0.5478, + "step": 8083 + }, + { + "epoch": 2.146024160361078, + "grad_norm": 0.41919413812783474, + "learning_rate": 3.600500013345281e-06, + "loss": 0.5518, + "step": 8084 + }, + { + "epoch": 2.1462896588344615, + "grad_norm": 0.45848238302689487, + "learning_rate": 3.6001865207462785e-06, + "loss": 0.5655, + "step": 8085 + }, + { + "epoch": 2.1465551573078456, + "grad_norm": 0.43841601883293296, + "learning_rate": 3.5998730066909453e-06, + "loss": 0.5497, + "step": 8086 + }, + { + "epoch": 2.1468206557812293, + "grad_norm": 0.438067347636522, + "learning_rate": 3.5995594711853955e-06, + "loss": 0.5965, + "step": 8087 + }, + { + "epoch": 2.147086154254613, + "grad_norm": 0.43603604180414585, + "learning_rate": 3.5992459142357438e-06, + "loss": 0.5841, + "step": 8088 + }, + { + "epoch": 2.147351652727997, + "grad_norm": 0.4308631569429699, + "learning_rate": 3.5989323358481053e-06, + "loss": 0.5906, + "step": 8089 + }, + { + "epoch": 2.1476171512013806, + "grad_norm": 0.4829050416934733, + "learning_rate": 3.598618736028595e-06, + "loss": 0.5717, + "step": 8090 + }, + { + "epoch": 2.1478826496747643, + "grad_norm": 0.4534862302323038, + "learning_rate": 3.59830511478333e-06, + "loss": 0.5893, + "step": 8091 + }, + { + "epoch": 2.148148148148148, + "grad_norm": 0.41791927793458056, + "learning_rate": 3.5979914721184263e-06, + "loss": 0.5859, + "step": 8092 + }, + { + "epoch": 2.148413646621532, + "grad_norm": 0.4537914041379908, + "learning_rate": 3.5976778080399997e-06, + "loss": 0.5884, + "step": 8093 + }, + { + "epoch": 2.1486791450949156, + "grad_norm": 0.4388072513963964, + "learning_rate": 3.59736412255417e-06, + "loss": 0.5781, + "step": 8094 + }, + { + "epoch": 2.1489446435682993, + "grad_norm": 0.431744575916571, + "learning_rate": 3.5970504156670516e-06, + "loss": 0.6, + "step": 8095 + }, + { + "epoch": 2.1492101420416834, + "grad_norm": 0.42671965013016383, + "learning_rate": 3.5967366873847643e-06, + "loss": 0.6091, + "step": 8096 + }, + { + "epoch": 2.149475640515067, + "grad_norm": 0.4281110184081167, + "learning_rate": 3.5964229377134276e-06, + "loss": 0.5553, + "step": 8097 + }, + { + "epoch": 2.1497411389884507, + "grad_norm": 0.4360037237708422, + "learning_rate": 3.5961091666591574e-06, + "loss": 0.5869, + "step": 8098 + }, + { + "epoch": 2.1500066374618347, + "grad_norm": 0.4287728608608719, + "learning_rate": 3.5957953742280754e-06, + "loss": 0.6016, + "step": 8099 + }, + { + "epoch": 2.1502721359352184, + "grad_norm": 0.4220330290905117, + "learning_rate": 3.5954815604263004e-06, + "loss": 0.553, + "step": 8100 + }, + { + "epoch": 2.150537634408602, + "grad_norm": 0.432355940946539, + "learning_rate": 3.5951677252599527e-06, + "loss": 0.5819, + "step": 8101 + }, + { + "epoch": 2.150803132881986, + "grad_norm": 0.4385362896641794, + "learning_rate": 3.594853868735153e-06, + "loss": 0.5809, + "step": 8102 + }, + { + "epoch": 2.1510686313553697, + "grad_norm": 0.43228388668039286, + "learning_rate": 3.5945399908580233e-06, + "loss": 0.5941, + "step": 8103 + }, + { + "epoch": 2.1513341298287534, + "grad_norm": 0.4224342324211673, + "learning_rate": 3.5942260916346828e-06, + "loss": 0.5754, + "step": 8104 + }, + { + "epoch": 2.1515996283021375, + "grad_norm": 0.41315324887917493, + "learning_rate": 3.5939121710712543e-06, + "loss": 0.5743, + "step": 8105 + }, + { + "epoch": 2.151865126775521, + "grad_norm": 0.4297833452425808, + "learning_rate": 3.5935982291738603e-06, + "loss": 0.5863, + "step": 8106 + }, + { + "epoch": 2.1521306252489047, + "grad_norm": 0.4358073161958455, + "learning_rate": 3.5932842659486235e-06, + "loss": 0.5354, + "step": 8107 + }, + { + "epoch": 2.1523961237222884, + "grad_norm": 0.42483957253791854, + "learning_rate": 3.5929702814016667e-06, + "loss": 0.5311, + "step": 8108 + }, + { + "epoch": 2.1526616221956725, + "grad_norm": 0.423219127056296, + "learning_rate": 3.5926562755391126e-06, + "loss": 0.5983, + "step": 8109 + }, + { + "epoch": 2.152927120669056, + "grad_norm": 0.422511706840988, + "learning_rate": 3.592342248367086e-06, + "loss": 0.5679, + "step": 8110 + }, + { + "epoch": 2.1531926191424398, + "grad_norm": 0.4377553129409497, + "learning_rate": 3.592028199891712e-06, + "loss": 0.5605, + "step": 8111 + }, + { + "epoch": 2.153458117615824, + "grad_norm": 0.43475481025721574, + "learning_rate": 3.5917141301191137e-06, + "loss": 0.6075, + "step": 8112 + }, + { + "epoch": 2.1537236160892075, + "grad_norm": 0.4237312870259854, + "learning_rate": 3.591400039055417e-06, + "loss": 0.603, + "step": 8113 + }, + { + "epoch": 2.153989114562591, + "grad_norm": 0.41950635868227365, + "learning_rate": 3.5910859267067476e-06, + "loss": 0.5634, + "step": 8114 + }, + { + "epoch": 2.154254613035975, + "grad_norm": 0.4519774747924744, + "learning_rate": 3.5907717930792307e-06, + "loss": 0.5785, + "step": 8115 + }, + { + "epoch": 2.154520111509359, + "grad_norm": 0.4291079808804259, + "learning_rate": 3.5904576381789933e-06, + "loss": 0.5923, + "step": 8116 + }, + { + "epoch": 2.1547856099827425, + "grad_norm": 0.4265704040921036, + "learning_rate": 3.590143462012162e-06, + "loss": 0.5782, + "step": 8117 + }, + { + "epoch": 2.1550511084561266, + "grad_norm": 0.4334135139626096, + "learning_rate": 3.589829264584864e-06, + "loss": 0.5823, + "step": 8118 + }, + { + "epoch": 2.15531660692951, + "grad_norm": 0.43512986410951887, + "learning_rate": 3.5895150459032268e-06, + "loss": 0.5774, + "step": 8119 + }, + { + "epoch": 2.155582105402894, + "grad_norm": 0.452573659656349, + "learning_rate": 3.5892008059733795e-06, + "loss": 0.5534, + "step": 8120 + }, + { + "epoch": 2.1558476038762775, + "grad_norm": 0.41307716634649994, + "learning_rate": 3.5888865448014486e-06, + "loss": 0.5509, + "step": 8121 + }, + { + "epoch": 2.1561131023496616, + "grad_norm": 0.44650225414869477, + "learning_rate": 3.5885722623935648e-06, + "loss": 0.5652, + "step": 8122 + }, + { + "epoch": 2.156378600823045, + "grad_norm": 0.4259864728899922, + "learning_rate": 3.5882579587558564e-06, + "loss": 0.5575, + "step": 8123 + }, + { + "epoch": 2.156644099296429, + "grad_norm": 0.444593125610364, + "learning_rate": 3.5879436338944534e-06, + "loss": 0.5622, + "step": 8124 + }, + { + "epoch": 2.156909597769813, + "grad_norm": 0.4299837604372277, + "learning_rate": 3.5876292878154854e-06, + "loss": 0.535, + "step": 8125 + }, + { + "epoch": 2.1571750962431966, + "grad_norm": 0.42588043999293956, + "learning_rate": 3.587314920525084e-06, + "loss": 0.5742, + "step": 8126 + }, + { + "epoch": 2.1574405947165802, + "grad_norm": 0.4435783841822148, + "learning_rate": 3.587000532029379e-06, + "loss": 0.548, + "step": 8127 + }, + { + "epoch": 2.1577060931899643, + "grad_norm": 0.4402130338499585, + "learning_rate": 3.586686122334502e-06, + "loss": 0.5835, + "step": 8128 + }, + { + "epoch": 2.157971591663348, + "grad_norm": 0.45109904864501765, + "learning_rate": 3.586371691446585e-06, + "loss": 0.5684, + "step": 8129 + }, + { + "epoch": 2.1582370901367316, + "grad_norm": 0.4403816607230921, + "learning_rate": 3.58605723937176e-06, + "loss": 0.5946, + "step": 8130 + }, + { + "epoch": 2.1585025886101157, + "grad_norm": 0.4402194753650105, + "learning_rate": 3.5857427661161597e-06, + "loss": 0.5795, + "step": 8131 + }, + { + "epoch": 2.1587680870834993, + "grad_norm": 0.45238727725993394, + "learning_rate": 3.5854282716859177e-06, + "loss": 0.5631, + "step": 8132 + }, + { + "epoch": 2.159033585556883, + "grad_norm": 0.4550158011099502, + "learning_rate": 3.585113756087166e-06, + "loss": 0.5771, + "step": 8133 + }, + { + "epoch": 2.159299084030267, + "grad_norm": 0.44036667215678515, + "learning_rate": 3.58479921932604e-06, + "loss": 0.5737, + "step": 8134 + }, + { + "epoch": 2.1595645825036507, + "grad_norm": 0.4271960324273925, + "learning_rate": 3.584484661408674e-06, + "loss": 0.5727, + "step": 8135 + }, + { + "epoch": 2.1598300809770343, + "grad_norm": 0.4402278424947391, + "learning_rate": 3.5841700823412e-06, + "loss": 0.5649, + "step": 8136 + }, + { + "epoch": 2.160095579450418, + "grad_norm": 0.4508434051235174, + "learning_rate": 3.5838554821297555e-06, + "loss": 0.6116, + "step": 8137 + }, + { + "epoch": 2.160361077923802, + "grad_norm": 0.42963368327466434, + "learning_rate": 3.583540860780475e-06, + "loss": 0.5767, + "step": 8138 + }, + { + "epoch": 2.1606265763971857, + "grad_norm": 0.44143819305822724, + "learning_rate": 3.5832262182994955e-06, + "loss": 0.6067, + "step": 8139 + }, + { + "epoch": 2.1608920748705693, + "grad_norm": 0.42630891603874577, + "learning_rate": 3.582911554692953e-06, + "loss": 0.565, + "step": 8140 + }, + { + "epoch": 2.1611575733439534, + "grad_norm": 0.44728169741793067, + "learning_rate": 3.5825968699669823e-06, + "loss": 0.5853, + "step": 8141 + }, + { + "epoch": 2.161423071817337, + "grad_norm": 0.42220483558386435, + "learning_rate": 3.582282164127723e-06, + "loss": 0.5943, + "step": 8142 + }, + { + "epoch": 2.1616885702907207, + "grad_norm": 0.4239303312228211, + "learning_rate": 3.5819674371813113e-06, + "loss": 0.5686, + "step": 8143 + }, + { + "epoch": 2.161954068764105, + "grad_norm": 0.42560296859199037, + "learning_rate": 3.5816526891338856e-06, + "loss": 0.6004, + "step": 8144 + }, + { + "epoch": 2.1622195672374884, + "grad_norm": 0.44369941551844255, + "learning_rate": 3.5813379199915837e-06, + "loss": 0.6044, + "step": 8145 + }, + { + "epoch": 2.162485065710872, + "grad_norm": 0.4313591694719809, + "learning_rate": 3.5810231297605446e-06, + "loss": 0.5853, + "step": 8146 + }, + { + "epoch": 2.1627505641842557, + "grad_norm": 0.4124783745105141, + "learning_rate": 3.5807083184469083e-06, + "loss": 0.5293, + "step": 8147 + }, + { + "epoch": 2.16301606265764, + "grad_norm": 0.4181751115950785, + "learning_rate": 3.5803934860568134e-06, + "loss": 0.5376, + "step": 8148 + }, + { + "epoch": 2.1632815611310234, + "grad_norm": 0.43002984512747994, + "learning_rate": 3.580078632596401e-06, + "loss": 0.5607, + "step": 8149 + }, + { + "epoch": 2.163547059604407, + "grad_norm": 0.42232055231610094, + "learning_rate": 3.5797637580718102e-06, + "loss": 0.5597, + "step": 8150 + }, + { + "epoch": 2.163812558077791, + "grad_norm": 0.41709659332862226, + "learning_rate": 3.5794488624891823e-06, + "loss": 0.5562, + "step": 8151 + }, + { + "epoch": 2.164078056551175, + "grad_norm": 0.43682527666604676, + "learning_rate": 3.579133945854659e-06, + "loss": 0.5874, + "step": 8152 + }, + { + "epoch": 2.1643435550245584, + "grad_norm": 0.4286003119540955, + "learning_rate": 3.578819008174381e-06, + "loss": 0.6286, + "step": 8153 + }, + { + "epoch": 2.1646090534979425, + "grad_norm": 0.4156870132736457, + "learning_rate": 3.5785040494544915e-06, + "loss": 0.5182, + "step": 8154 + }, + { + "epoch": 2.164874551971326, + "grad_norm": 0.4285602721767473, + "learning_rate": 3.5781890697011323e-06, + "loss": 0.5518, + "step": 8155 + }, + { + "epoch": 2.16514005044471, + "grad_norm": 0.4245474171901688, + "learning_rate": 3.577874068920446e-06, + "loss": 0.5514, + "step": 8156 + }, + { + "epoch": 2.165405548918094, + "grad_norm": 0.43446645739812156, + "learning_rate": 3.577559047118576e-06, + "loss": 0.5768, + "step": 8157 + }, + { + "epoch": 2.1656710473914775, + "grad_norm": 0.4293236783877426, + "learning_rate": 3.5772440043016676e-06, + "loss": 0.5603, + "step": 8158 + }, + { + "epoch": 2.165936545864861, + "grad_norm": 0.42444648015691877, + "learning_rate": 3.5769289404758622e-06, + "loss": 0.5458, + "step": 8159 + }, + { + "epoch": 2.1662020443382453, + "grad_norm": 0.43845915890762643, + "learning_rate": 3.576613855647306e-06, + "loss": 0.5283, + "step": 8160 + }, + { + "epoch": 2.166467542811629, + "grad_norm": 0.44043928057689624, + "learning_rate": 3.5762987498221446e-06, + "loss": 0.5834, + "step": 8161 + }, + { + "epoch": 2.1667330412850125, + "grad_norm": 0.4163077062318956, + "learning_rate": 3.5759836230065214e-06, + "loss": 0.6, + "step": 8162 + }, + { + "epoch": 2.1669985397583966, + "grad_norm": 0.43435588587831675, + "learning_rate": 3.5756684752065833e-06, + "loss": 0.5668, + "step": 8163 + }, + { + "epoch": 2.1672640382317803, + "grad_norm": 0.43363716148412906, + "learning_rate": 3.575353306428476e-06, + "loss": 0.5596, + "step": 8164 + }, + { + "epoch": 2.167529536705164, + "grad_norm": 0.4257634052135782, + "learning_rate": 3.5750381166783466e-06, + "loss": 0.561, + "step": 8165 + }, + { + "epoch": 2.1677950351785475, + "grad_norm": 0.4350425327264742, + "learning_rate": 3.5747229059623423e-06, + "loss": 0.6215, + "step": 8166 + }, + { + "epoch": 2.1680605336519316, + "grad_norm": 0.44343055431724576, + "learning_rate": 3.574407674286609e-06, + "loss": 0.5518, + "step": 8167 + }, + { + "epoch": 2.1683260321253153, + "grad_norm": 0.4132255299484832, + "learning_rate": 3.574092421657296e-06, + "loss": 0.5256, + "step": 8168 + }, + { + "epoch": 2.168591530598699, + "grad_norm": 0.4185672844410977, + "learning_rate": 3.5737771480805512e-06, + "loss": 0.546, + "step": 8169 + }, + { + "epoch": 2.168857029072083, + "grad_norm": 0.4401080218773448, + "learning_rate": 3.573461853562522e-06, + "loss": 0.5384, + "step": 8170 + }, + { + "epoch": 2.1691225275454666, + "grad_norm": 0.41538005645587806, + "learning_rate": 3.573146538109359e-06, + "loss": 0.574, + "step": 8171 + }, + { + "epoch": 2.1693880260188503, + "grad_norm": 0.42820904736525567, + "learning_rate": 3.5728312017272116e-06, + "loss": 0.5972, + "step": 8172 + }, + { + "epoch": 2.1696535244922344, + "grad_norm": 0.43682301655958694, + "learning_rate": 3.572515844422228e-06, + "loss": 0.5796, + "step": 8173 + }, + { + "epoch": 2.169919022965618, + "grad_norm": 0.4404141268523503, + "learning_rate": 3.572200466200559e-06, + "loss": 0.6077, + "step": 8174 + }, + { + "epoch": 2.1701845214390016, + "grad_norm": 0.4209395464405756, + "learning_rate": 3.571885067068357e-06, + "loss": 0.5504, + "step": 8175 + }, + { + "epoch": 2.1704500199123853, + "grad_norm": 0.43985891046831765, + "learning_rate": 3.571569647031771e-06, + "loss": 0.557, + "step": 8176 + }, + { + "epoch": 2.1707155183857694, + "grad_norm": 0.4337082913883347, + "learning_rate": 3.5712542060969536e-06, + "loss": 0.6126, + "step": 8177 + }, + { + "epoch": 2.170981016859153, + "grad_norm": 0.4134877894423131, + "learning_rate": 3.570938744270056e-06, + "loss": 0.5112, + "step": 8178 + }, + { + "epoch": 2.1712465153325367, + "grad_norm": 0.4304441292411016, + "learning_rate": 3.570623261557231e-06, + "loss": 0.5728, + "step": 8179 + }, + { + "epoch": 2.1715120138059207, + "grad_norm": 0.43413905956266996, + "learning_rate": 3.570307757964631e-06, + "loss": 0.5962, + "step": 8180 + }, + { + "epoch": 2.1717775122793044, + "grad_norm": 0.4318863366540812, + "learning_rate": 3.5699922334984092e-06, + "loss": 0.584, + "step": 8181 + }, + { + "epoch": 2.172043010752688, + "grad_norm": 0.42977470607838475, + "learning_rate": 3.5696766881647196e-06, + "loss": 0.5461, + "step": 8182 + }, + { + "epoch": 2.172308509226072, + "grad_norm": 0.42740902399867675, + "learning_rate": 3.569361121969714e-06, + "loss": 0.5473, + "step": 8183 + }, + { + "epoch": 2.1725740076994557, + "grad_norm": 0.4201858059150775, + "learning_rate": 3.56904553491955e-06, + "loss": 0.6002, + "step": 8184 + }, + { + "epoch": 2.1728395061728394, + "grad_norm": 0.4214972963254856, + "learning_rate": 3.5687299270203797e-06, + "loss": 0.5679, + "step": 8185 + }, + { + "epoch": 2.1731050046462235, + "grad_norm": 0.4357809484239828, + "learning_rate": 3.568414298278359e-06, + "loss": 0.5326, + "step": 8186 + }, + { + "epoch": 2.173370503119607, + "grad_norm": 0.4182997779840323, + "learning_rate": 3.5680986486996437e-06, + "loss": 0.537, + "step": 8187 + }, + { + "epoch": 2.1736360015929908, + "grad_norm": 0.4265751365852189, + "learning_rate": 3.56778297829039e-06, + "loss": 0.5582, + "step": 8188 + }, + { + "epoch": 2.173901500066375, + "grad_norm": 0.4268933036491036, + "learning_rate": 3.567467287056754e-06, + "loss": 0.5654, + "step": 8189 + }, + { + "epoch": 2.1741669985397585, + "grad_norm": 0.4245872754260466, + "learning_rate": 3.5671515750048913e-06, + "loss": 0.5678, + "step": 8190 + }, + { + "epoch": 2.174432497013142, + "grad_norm": 0.4350204810888233, + "learning_rate": 3.566835842140961e-06, + "loss": 0.5996, + "step": 8191 + }, + { + "epoch": 2.1746979954865258, + "grad_norm": 0.4327623634826044, + "learning_rate": 3.5665200884711194e-06, + "loss": 0.5766, + "step": 8192 + }, + { + "epoch": 2.17496349395991, + "grad_norm": 0.4400935898065364, + "learning_rate": 3.566204314001524e-06, + "loss": 0.6037, + "step": 8193 + }, + { + "epoch": 2.1752289924332935, + "grad_norm": 0.4429245613639675, + "learning_rate": 3.5658885187383346e-06, + "loss": 0.5913, + "step": 8194 + }, + { + "epoch": 2.175494490906677, + "grad_norm": 0.42846851720164053, + "learning_rate": 3.5655727026877095e-06, + "loss": 0.5797, + "step": 8195 + }, + { + "epoch": 2.175759989380061, + "grad_norm": 0.4362558953744138, + "learning_rate": 3.565256865855808e-06, + "loss": 0.5229, + "step": 8196 + }, + { + "epoch": 2.176025487853445, + "grad_norm": 0.4290593696978451, + "learning_rate": 3.564941008248789e-06, + "loss": 0.5165, + "step": 8197 + }, + { + "epoch": 2.1762909863268285, + "grad_norm": 0.42812846591824266, + "learning_rate": 3.564625129872813e-06, + "loss": 0.5968, + "step": 8198 + }, + { + "epoch": 2.1765564848002126, + "grad_norm": 0.4253343475371097, + "learning_rate": 3.5643092307340406e-06, + "loss": 0.578, + "step": 8199 + }, + { + "epoch": 2.176821983273596, + "grad_norm": 0.44309630840798947, + "learning_rate": 3.5639933108386314e-06, + "loss": 0.5547, + "step": 8200 + }, + { + "epoch": 2.17708748174698, + "grad_norm": 0.44839504606183656, + "learning_rate": 3.5636773701927484e-06, + "loss": 0.5828, + "step": 8201 + }, + { + "epoch": 2.1773529802203635, + "grad_norm": 0.42413245538646405, + "learning_rate": 3.5633614088025513e-06, + "loss": 0.5709, + "step": 8202 + }, + { + "epoch": 2.1776184786937476, + "grad_norm": 0.4345874024185505, + "learning_rate": 3.563045426674204e-06, + "loss": 0.5607, + "step": 8203 + }, + { + "epoch": 2.1778839771671312, + "grad_norm": 0.4382591407266538, + "learning_rate": 3.562729423813869e-06, + "loss": 0.5688, + "step": 8204 + }, + { + "epoch": 2.178149475640515, + "grad_norm": 0.4508822147921061, + "learning_rate": 3.5624134002277065e-06, + "loss": 0.552, + "step": 8205 + }, + { + "epoch": 2.178414974113899, + "grad_norm": 0.43771362012163545, + "learning_rate": 3.5620973559218823e-06, + "loss": 0.5623, + "step": 8206 + }, + { + "epoch": 2.1786804725872826, + "grad_norm": 0.4349094077445569, + "learning_rate": 3.5617812909025594e-06, + "loss": 0.5655, + "step": 8207 + }, + { + "epoch": 2.1789459710606662, + "grad_norm": 0.419731233863387, + "learning_rate": 3.5614652051759012e-06, + "loss": 0.5582, + "step": 8208 + }, + { + "epoch": 2.1792114695340503, + "grad_norm": 0.42420289270805106, + "learning_rate": 3.561149098748073e-06, + "loss": 0.5656, + "step": 8209 + }, + { + "epoch": 2.179476968007434, + "grad_norm": 0.44416463287902547, + "learning_rate": 3.5608329716252388e-06, + "loss": 0.5858, + "step": 8210 + }, + { + "epoch": 2.1797424664808176, + "grad_norm": 0.4328970772730496, + "learning_rate": 3.5605168238135647e-06, + "loss": 0.5897, + "step": 8211 + }, + { + "epoch": 2.1800079649542017, + "grad_norm": 0.485429221181809, + "learning_rate": 3.560200655319216e-06, + "loss": 0.5677, + "step": 8212 + }, + { + "epoch": 2.1802734634275853, + "grad_norm": 0.42794070030007736, + "learning_rate": 3.559884466148358e-06, + "loss": 0.5977, + "step": 8213 + }, + { + "epoch": 2.180538961900969, + "grad_norm": 0.4303144894484324, + "learning_rate": 3.559568256307158e-06, + "loss": 0.5959, + "step": 8214 + }, + { + "epoch": 2.180804460374353, + "grad_norm": 0.420496518331875, + "learning_rate": 3.559252025801784e-06, + "loss": 0.5758, + "step": 8215 + }, + { + "epoch": 2.1810699588477367, + "grad_norm": 0.45985407294003083, + "learning_rate": 3.5589357746384e-06, + "loss": 0.569, + "step": 8216 + }, + { + "epoch": 2.1813354573211203, + "grad_norm": 0.44820774763996896, + "learning_rate": 3.5586195028231766e-06, + "loss": 0.6039, + "step": 8217 + }, + { + "epoch": 2.1816009557945044, + "grad_norm": 0.44703879405779456, + "learning_rate": 3.558303210362282e-06, + "loss": 0.5783, + "step": 8218 + }, + { + "epoch": 2.181866454267888, + "grad_norm": 0.42251292720987427, + "learning_rate": 3.557986897261882e-06, + "loss": 0.5745, + "step": 8219 + }, + { + "epoch": 2.1821319527412717, + "grad_norm": 0.4214502888738516, + "learning_rate": 3.5576705635281474e-06, + "loss": 0.5952, + "step": 8220 + }, + { + "epoch": 2.1823974512146553, + "grad_norm": 0.4299714354129137, + "learning_rate": 3.557354209167248e-06, + "loss": 0.5466, + "step": 8221 + }, + { + "epoch": 2.1826629496880394, + "grad_norm": 0.42249806672703577, + "learning_rate": 3.557037834185352e-06, + "loss": 0.5809, + "step": 8222 + }, + { + "epoch": 2.182928448161423, + "grad_norm": 0.4175285126918826, + "learning_rate": 3.5567214385886305e-06, + "loss": 0.5162, + "step": 8223 + }, + { + "epoch": 2.1831939466348067, + "grad_norm": 0.4417292348315966, + "learning_rate": 3.5564050223832538e-06, + "loss": 0.5713, + "step": 8224 + }, + { + "epoch": 2.183459445108191, + "grad_norm": 0.42572683804037714, + "learning_rate": 3.5560885855753926e-06, + "loss": 0.5477, + "step": 8225 + }, + { + "epoch": 2.1837249435815744, + "grad_norm": 0.42012695262841354, + "learning_rate": 3.5557721281712176e-06, + "loss": 0.5495, + "step": 8226 + }, + { + "epoch": 2.183990442054958, + "grad_norm": 0.43424284486386594, + "learning_rate": 3.5554556501769015e-06, + "loss": 0.5477, + "step": 8227 + }, + { + "epoch": 2.184255940528342, + "grad_norm": 0.43081846640872273, + "learning_rate": 3.5551391515986163e-06, + "loss": 0.5655, + "step": 8228 + }, + { + "epoch": 2.184521439001726, + "grad_norm": 0.43668195409678656, + "learning_rate": 3.554822632442534e-06, + "loss": 0.5812, + "step": 8229 + }, + { + "epoch": 2.1847869374751094, + "grad_norm": 0.4667848112936377, + "learning_rate": 3.5545060927148273e-06, + "loss": 0.5317, + "step": 8230 + }, + { + "epoch": 2.185052435948493, + "grad_norm": 0.4266263563969217, + "learning_rate": 3.5541895324216703e-06, + "loss": 0.5307, + "step": 8231 + }, + { + "epoch": 2.185317934421877, + "grad_norm": 0.43559739727073227, + "learning_rate": 3.553872951569236e-06, + "loss": 0.5541, + "step": 8232 + }, + { + "epoch": 2.185583432895261, + "grad_norm": 0.43064820473070414, + "learning_rate": 3.5535563501636993e-06, + "loss": 0.5627, + "step": 8233 + }, + { + "epoch": 2.1858489313686444, + "grad_norm": 0.44219672662368725, + "learning_rate": 3.553239728211234e-06, + "loss": 0.5732, + "step": 8234 + }, + { + "epoch": 2.1861144298420285, + "grad_norm": 0.42773234888704736, + "learning_rate": 3.552923085718016e-06, + "loss": 0.5234, + "step": 8235 + }, + { + "epoch": 2.186379928315412, + "grad_norm": 0.4279662449696274, + "learning_rate": 3.552606422690219e-06, + "loss": 0.5576, + "step": 8236 + }, + { + "epoch": 2.186645426788796, + "grad_norm": 0.4114495811837613, + "learning_rate": 3.5522897391340193e-06, + "loss": 0.5448, + "step": 8237 + }, + { + "epoch": 2.18691092526218, + "grad_norm": 0.43408766284319705, + "learning_rate": 3.5519730350555935e-06, + "loss": 0.5925, + "step": 8238 + }, + { + "epoch": 2.1871764237355635, + "grad_norm": 0.41787049555493355, + "learning_rate": 3.551656310461118e-06, + "loss": 0.5787, + "step": 8239 + }, + { + "epoch": 2.187441922208947, + "grad_norm": 0.42038280658004606, + "learning_rate": 3.551339565356769e-06, + "loss": 0.5781, + "step": 8240 + }, + { + "epoch": 2.1877074206823313, + "grad_norm": 0.426015512443387, + "learning_rate": 3.5510227997487252e-06, + "loss": 0.5385, + "step": 8241 + }, + { + "epoch": 2.187972919155715, + "grad_norm": 0.4208504215952971, + "learning_rate": 3.550706013643163e-06, + "loss": 0.5574, + "step": 8242 + }, + { + "epoch": 2.1882384176290985, + "grad_norm": 0.43581887705368705, + "learning_rate": 3.550389207046261e-06, + "loss": 0.5608, + "step": 8243 + }, + { + "epoch": 2.1885039161024826, + "grad_norm": 0.42153121829924495, + "learning_rate": 3.550072379964198e-06, + "loss": 0.5397, + "step": 8244 + }, + { + "epoch": 2.1887694145758663, + "grad_norm": 0.4322108869054508, + "learning_rate": 3.549755532403152e-06, + "loss": 0.5702, + "step": 8245 + }, + { + "epoch": 2.18903491304925, + "grad_norm": 0.43294191445238983, + "learning_rate": 3.549438664369303e-06, + "loss": 0.5758, + "step": 8246 + }, + { + "epoch": 2.1893004115226335, + "grad_norm": 0.4395060571566151, + "learning_rate": 3.549121775868831e-06, + "loss": 0.5859, + "step": 8247 + }, + { + "epoch": 2.1895659099960176, + "grad_norm": 0.43256943379293084, + "learning_rate": 3.548804866907915e-06, + "loss": 0.5604, + "step": 8248 + }, + { + "epoch": 2.1898314084694013, + "grad_norm": 0.46267838789413823, + "learning_rate": 3.548487937492736e-06, + "loss": 0.5406, + "step": 8249 + }, + { + "epoch": 2.190096906942785, + "grad_norm": 0.4371956688996471, + "learning_rate": 3.548170987629477e-06, + "loss": 0.5688, + "step": 8250 + }, + { + "epoch": 2.190362405416169, + "grad_norm": 0.4355967835311301, + "learning_rate": 3.5478540173243153e-06, + "loss": 0.5934, + "step": 8251 + }, + { + "epoch": 2.1906279038895526, + "grad_norm": 0.4358903697195902, + "learning_rate": 3.547537026583435e-06, + "loss": 0.5856, + "step": 8252 + }, + { + "epoch": 2.1908934023629363, + "grad_norm": 0.41563418588273116, + "learning_rate": 3.5472200154130187e-06, + "loss": 0.5655, + "step": 8253 + }, + { + "epoch": 2.1911589008363204, + "grad_norm": 0.42897289332947863, + "learning_rate": 3.546902983819248e-06, + "loss": 0.583, + "step": 8254 + }, + { + "epoch": 2.191424399309704, + "grad_norm": 0.4428316543180506, + "learning_rate": 3.5465859318083053e-06, + "loss": 0.5912, + "step": 8255 + }, + { + "epoch": 2.1916898977830876, + "grad_norm": 0.44889167917542233, + "learning_rate": 3.546268859386374e-06, + "loss": 0.5769, + "step": 8256 + }, + { + "epoch": 2.1919553962564717, + "grad_norm": 0.4196362672219529, + "learning_rate": 3.5459517665596387e-06, + "loss": 0.5845, + "step": 8257 + }, + { + "epoch": 2.1922208947298554, + "grad_norm": 0.41756354057729594, + "learning_rate": 3.545634653334284e-06, + "loss": 0.5346, + "step": 8258 + }, + { + "epoch": 2.192486393203239, + "grad_norm": 0.4314161492977687, + "learning_rate": 3.5453175197164923e-06, + "loss": 0.5666, + "step": 8259 + }, + { + "epoch": 2.1927518916766227, + "grad_norm": 0.4279655761928378, + "learning_rate": 3.54500036571245e-06, + "loss": 0.553, + "step": 8260 + }, + { + "epoch": 2.1930173901500067, + "grad_norm": 0.43685382032894915, + "learning_rate": 3.544683191328342e-06, + "loss": 0.5881, + "step": 8261 + }, + { + "epoch": 2.1932828886233904, + "grad_norm": 0.43426636251180323, + "learning_rate": 3.5443659965703537e-06, + "loss": 0.603, + "step": 8262 + }, + { + "epoch": 2.193548387096774, + "grad_norm": 0.4424976177629162, + "learning_rate": 3.5440487814446712e-06, + "loss": 0.5668, + "step": 8263 + }, + { + "epoch": 2.193813885570158, + "grad_norm": 0.4239060470627123, + "learning_rate": 3.5437315459574823e-06, + "loss": 0.5854, + "step": 8264 + }, + { + "epoch": 2.1940793840435417, + "grad_norm": 0.4184852893279085, + "learning_rate": 3.5434142901149714e-06, + "loss": 0.5392, + "step": 8265 + }, + { + "epoch": 2.1943448825169254, + "grad_norm": 0.4207446567401549, + "learning_rate": 3.5430970139233274e-06, + "loss": 0.5681, + "step": 8266 + }, + { + "epoch": 2.1946103809903095, + "grad_norm": 0.43230939022087506, + "learning_rate": 3.542779717388738e-06, + "loss": 0.5785, + "step": 8267 + }, + { + "epoch": 2.194875879463693, + "grad_norm": 0.44145054946334555, + "learning_rate": 3.5424624005173914e-06, + "loss": 0.5799, + "step": 8268 + }, + { + "epoch": 2.1951413779370768, + "grad_norm": 0.4271199181712962, + "learning_rate": 3.542145063315475e-06, + "loss": 0.5561, + "step": 8269 + }, + { + "epoch": 2.195406876410461, + "grad_norm": 0.4254452778114775, + "learning_rate": 3.541827705789178e-06, + "loss": 0.5742, + "step": 8270 + }, + { + "epoch": 2.1956723748838445, + "grad_norm": 0.42974644438478876, + "learning_rate": 3.5415103279446905e-06, + "loss": 0.5686, + "step": 8271 + }, + { + "epoch": 2.195937873357228, + "grad_norm": 0.41875794267624056, + "learning_rate": 3.5411929297882016e-06, + "loss": 0.5525, + "step": 8272 + }, + { + "epoch": 2.196203371830612, + "grad_norm": 0.41629790580721837, + "learning_rate": 3.5408755113259006e-06, + "loss": 0.5324, + "step": 8273 + }, + { + "epoch": 2.196468870303996, + "grad_norm": 0.4265955826691378, + "learning_rate": 3.5405580725639795e-06, + "loss": 0.5632, + "step": 8274 + }, + { + "epoch": 2.1967343687773795, + "grad_norm": 0.43937639606143214, + "learning_rate": 3.540240613508627e-06, + "loss": 0.573, + "step": 8275 + }, + { + "epoch": 2.196999867250763, + "grad_norm": 0.42558416950544165, + "learning_rate": 3.5399231341660367e-06, + "loss": 0.5586, + "step": 8276 + }, + { + "epoch": 2.197265365724147, + "grad_norm": 0.4502049916517734, + "learning_rate": 3.539605634542399e-06, + "loss": 0.585, + "step": 8277 + }, + { + "epoch": 2.197530864197531, + "grad_norm": 0.43075218583839087, + "learning_rate": 3.539288114643905e-06, + "loss": 0.5879, + "step": 8278 + }, + { + "epoch": 2.1977963626709145, + "grad_norm": 0.4307620764894786, + "learning_rate": 3.53897057447675e-06, + "loss": 0.5757, + "step": 8279 + }, + { + "epoch": 2.1980618611442986, + "grad_norm": 0.4065437801292464, + "learning_rate": 3.5386530140471235e-06, + "loss": 0.5488, + "step": 8280 + }, + { + "epoch": 2.198327359617682, + "grad_norm": 0.42752226628840745, + "learning_rate": 3.5383354333612208e-06, + "loss": 0.584, + "step": 8281 + }, + { + "epoch": 2.198592858091066, + "grad_norm": 0.4233632441946237, + "learning_rate": 3.5380178324252344e-06, + "loss": 0.6003, + "step": 8282 + }, + { + "epoch": 2.19885835656445, + "grad_norm": 0.41544740428974836, + "learning_rate": 3.5377002112453597e-06, + "loss": 0.5864, + "step": 8283 + }, + { + "epoch": 2.1991238550378336, + "grad_norm": 0.43009166407730703, + "learning_rate": 3.537382569827789e-06, + "loss": 0.5552, + "step": 8284 + }, + { + "epoch": 2.1993893535112172, + "grad_norm": 0.41465725294161654, + "learning_rate": 3.537064908178718e-06, + "loss": 0.6049, + "step": 8285 + }, + { + "epoch": 2.199654851984601, + "grad_norm": 0.4342671353261724, + "learning_rate": 3.5367472263043435e-06, + "loss": 0.5894, + "step": 8286 + }, + { + "epoch": 2.199920350457985, + "grad_norm": 0.43241203932344835, + "learning_rate": 3.536429524210859e-06, + "loss": 0.5864, + "step": 8287 + }, + { + "epoch": 2.2001858489313686, + "grad_norm": 0.46321167563830457, + "learning_rate": 3.5361118019044616e-06, + "loss": 0.5681, + "step": 8288 + }, + { + "epoch": 2.2004513474047522, + "grad_norm": 0.4298134654298973, + "learning_rate": 3.5357940593913464e-06, + "loss": 0.5573, + "step": 8289 + }, + { + "epoch": 2.2007168458781363, + "grad_norm": 0.43331419233183927, + "learning_rate": 3.5354762966777118e-06, + "loss": 0.581, + "step": 8290 + }, + { + "epoch": 2.20098234435152, + "grad_norm": 0.4136023374845611, + "learning_rate": 3.5351585137697537e-06, + "loss": 0.5372, + "step": 8291 + }, + { + "epoch": 2.2012478428249036, + "grad_norm": 0.44456136916550354, + "learning_rate": 3.53484071067367e-06, + "loss": 0.5666, + "step": 8292 + }, + { + "epoch": 2.2015133412982877, + "grad_norm": 0.4431851966579242, + "learning_rate": 3.5345228873956583e-06, + "loss": 0.5442, + "step": 8293 + }, + { + "epoch": 2.2017788397716713, + "grad_norm": 0.42116011574848733, + "learning_rate": 3.5342050439419177e-06, + "loss": 0.5414, + "step": 8294 + }, + { + "epoch": 2.202044338245055, + "grad_norm": 0.43025446794780403, + "learning_rate": 3.533887180318647e-06, + "loss": 0.5592, + "step": 8295 + }, + { + "epoch": 2.202309836718439, + "grad_norm": 0.4120201442272711, + "learning_rate": 3.533569296532045e-06, + "loss": 0.5608, + "step": 8296 + }, + { + "epoch": 2.2025753351918227, + "grad_norm": 0.4277289989056543, + "learning_rate": 3.5332513925883105e-06, + "loss": 0.5553, + "step": 8297 + }, + { + "epoch": 2.2028408336652063, + "grad_norm": 0.42433709926151014, + "learning_rate": 3.532933468493644e-06, + "loss": 0.5891, + "step": 8298 + }, + { + "epoch": 2.2031063321385904, + "grad_norm": 0.4414095462830388, + "learning_rate": 3.5326155242542465e-06, + "loss": 0.5707, + "step": 8299 + }, + { + "epoch": 2.203371830611974, + "grad_norm": 0.4320114733918895, + "learning_rate": 3.5322975598763176e-06, + "loss": 0.5411, + "step": 8300 + }, + { + "epoch": 2.2036373290853577, + "grad_norm": 0.4367447582100221, + "learning_rate": 3.5319795753660584e-06, + "loss": 0.545, + "step": 8301 + }, + { + "epoch": 2.2039028275587413, + "grad_norm": 0.40914417434468037, + "learning_rate": 3.5316615707296707e-06, + "loss": 0.5422, + "step": 8302 + }, + { + "epoch": 2.2041683260321254, + "grad_norm": 0.4238222869219569, + "learning_rate": 3.5313435459733568e-06, + "loss": 0.576, + "step": 8303 + }, + { + "epoch": 2.204433824505509, + "grad_norm": 0.4284090015978858, + "learning_rate": 3.531025501103319e-06, + "loss": 0.5879, + "step": 8304 + }, + { + "epoch": 2.2046993229788927, + "grad_norm": 0.41743672713568314, + "learning_rate": 3.530707436125759e-06, + "loss": 0.5287, + "step": 8305 + }, + { + "epoch": 2.204964821452277, + "grad_norm": 0.41605541793651385, + "learning_rate": 3.5303893510468804e-06, + "loss": 0.5495, + "step": 8306 + }, + { + "epoch": 2.2052303199256604, + "grad_norm": 0.44833032441811826, + "learning_rate": 3.5300712458728874e-06, + "loss": 0.5551, + "step": 8307 + }, + { + "epoch": 2.205495818399044, + "grad_norm": 0.43177408499726616, + "learning_rate": 3.529753120609982e-06, + "loss": 0.5378, + "step": 8308 + }, + { + "epoch": 2.205761316872428, + "grad_norm": 0.4299711299636745, + "learning_rate": 3.5294349752643695e-06, + "loss": 0.5416, + "step": 8309 + }, + { + "epoch": 2.206026815345812, + "grad_norm": 0.4304833589950197, + "learning_rate": 3.529116809842255e-06, + "loss": 0.5527, + "step": 8310 + }, + { + "epoch": 2.2062923138191954, + "grad_norm": 0.42530363028349183, + "learning_rate": 3.528798624349843e-06, + "loss": 0.5733, + "step": 8311 + }, + { + "epoch": 2.2065578122925795, + "grad_norm": 0.4565838053712839, + "learning_rate": 3.5284804187933388e-06, + "loss": 0.5615, + "step": 8312 + }, + { + "epoch": 2.206823310765963, + "grad_norm": 0.42473966476144603, + "learning_rate": 3.5281621931789483e-06, + "loss": 0.5518, + "step": 8313 + }, + { + "epoch": 2.207088809239347, + "grad_norm": 0.43216909661291353, + "learning_rate": 3.527843947512878e-06, + "loss": 0.5538, + "step": 8314 + }, + { + "epoch": 2.2073543077127304, + "grad_norm": 0.43320833386447716, + "learning_rate": 3.527525681801333e-06, + "loss": 0.5575, + "step": 8315 + }, + { + "epoch": 2.2076198061861145, + "grad_norm": 0.42953050913820645, + "learning_rate": 3.5272073960505226e-06, + "loss": 0.5488, + "step": 8316 + }, + { + "epoch": 2.207885304659498, + "grad_norm": 0.42303394981790604, + "learning_rate": 3.5268890902666524e-06, + "loss": 0.5829, + "step": 8317 + }, + { + "epoch": 2.208150803132882, + "grad_norm": 0.4349062050497459, + "learning_rate": 3.5265707644559312e-06, + "loss": 0.5888, + "step": 8318 + }, + { + "epoch": 2.208416301606266, + "grad_norm": 0.43152811257234547, + "learning_rate": 3.526252418624566e-06, + "loss": 0.5514, + "step": 8319 + }, + { + "epoch": 2.2086818000796495, + "grad_norm": 0.4333169027420834, + "learning_rate": 3.525934052778766e-06, + "loss": 0.5718, + "step": 8320 + }, + { + "epoch": 2.208947298553033, + "grad_norm": 0.416132330653859, + "learning_rate": 3.52561566692474e-06, + "loss": 0.5699, + "step": 8321 + }, + { + "epoch": 2.2092127970264173, + "grad_norm": 0.43472154429191506, + "learning_rate": 3.5252972610686974e-06, + "loss": 0.5688, + "step": 8322 + }, + { + "epoch": 2.209478295499801, + "grad_norm": 0.4429422051314671, + "learning_rate": 3.5249788352168478e-06, + "loss": 0.5708, + "step": 8323 + }, + { + "epoch": 2.2097437939731845, + "grad_norm": 0.4215939656471332, + "learning_rate": 3.5246603893754017e-06, + "loss": 0.5994, + "step": 8324 + }, + { + "epoch": 2.2100092924465686, + "grad_norm": 0.43953672985193265, + "learning_rate": 3.5243419235505693e-06, + "loss": 0.5646, + "step": 8325 + }, + { + "epoch": 2.2102747909199523, + "grad_norm": 0.441010459267688, + "learning_rate": 3.5240234377485605e-06, + "loss": 0.5675, + "step": 8326 + }, + { + "epoch": 2.210540289393336, + "grad_norm": 0.4297897679398152, + "learning_rate": 3.5237049319755884e-06, + "loss": 0.5809, + "step": 8327 + }, + { + "epoch": 2.21080578786672, + "grad_norm": 0.4334697137103331, + "learning_rate": 3.523386406237863e-06, + "loss": 0.558, + "step": 8328 + }, + { + "epoch": 2.2110712863401036, + "grad_norm": 0.4251408146269121, + "learning_rate": 3.5230678605415976e-06, + "loss": 0.5963, + "step": 8329 + }, + { + "epoch": 2.2113367848134873, + "grad_norm": 0.4322699901421514, + "learning_rate": 3.5227492948930036e-06, + "loss": 0.5973, + "step": 8330 + }, + { + "epoch": 2.211602283286871, + "grad_norm": 0.4332140386304007, + "learning_rate": 3.522430709298294e-06, + "loss": 0.5691, + "step": 8331 + }, + { + "epoch": 2.211867781760255, + "grad_norm": 0.4216068715173031, + "learning_rate": 3.5221121037636825e-06, + "loss": 0.5386, + "step": 8332 + }, + { + "epoch": 2.2121332802336386, + "grad_norm": 0.4385664815616991, + "learning_rate": 3.5217934782953826e-06, + "loss": 0.567, + "step": 8333 + }, + { + "epoch": 2.2123987787070223, + "grad_norm": 0.4121095691103931, + "learning_rate": 3.5214748328996077e-06, + "loss": 0.5484, + "step": 8334 + }, + { + "epoch": 2.2126642771804064, + "grad_norm": 0.4266373223965364, + "learning_rate": 3.521156167582573e-06, + "loss": 0.5729, + "step": 8335 + }, + { + "epoch": 2.21292977565379, + "grad_norm": 0.43879405509868213, + "learning_rate": 3.520837482350493e-06, + "loss": 0.5173, + "step": 8336 + }, + { + "epoch": 2.2131952741271737, + "grad_norm": 0.4481433957343789, + "learning_rate": 3.5205187772095816e-06, + "loss": 0.5656, + "step": 8337 + }, + { + "epoch": 2.2134607726005577, + "grad_norm": 0.44229196800829995, + "learning_rate": 3.520200052166056e-06, + "loss": 0.5688, + "step": 8338 + }, + { + "epoch": 2.2137262710739414, + "grad_norm": 0.4302503723254411, + "learning_rate": 3.5198813072261316e-06, + "loss": 0.5484, + "step": 8339 + }, + { + "epoch": 2.213991769547325, + "grad_norm": 0.4615235568985375, + "learning_rate": 3.5195625423960243e-06, + "loss": 0.5363, + "step": 8340 + }, + { + "epoch": 2.2142572680207087, + "grad_norm": 0.4576982532899516, + "learning_rate": 3.5192437576819517e-06, + "loss": 0.5272, + "step": 8341 + }, + { + "epoch": 2.2145227664940927, + "grad_norm": 0.4292199220014541, + "learning_rate": 3.5189249530901303e-06, + "loss": 0.5462, + "step": 8342 + }, + { + "epoch": 2.2147882649674764, + "grad_norm": 0.43028663695064484, + "learning_rate": 3.518606128626777e-06, + "loss": 0.542, + "step": 8343 + }, + { + "epoch": 2.21505376344086, + "grad_norm": 0.4417543912262415, + "learning_rate": 3.51828728429811e-06, + "loss": 0.5651, + "step": 8344 + }, + { + "epoch": 2.215319261914244, + "grad_norm": 0.44432485196135546, + "learning_rate": 3.5179684201103487e-06, + "loss": 0.5935, + "step": 8345 + }, + { + "epoch": 2.2155847603876277, + "grad_norm": 0.4388244346598191, + "learning_rate": 3.51764953606971e-06, + "loss": 0.5503, + "step": 8346 + }, + { + "epoch": 2.2158502588610114, + "grad_norm": 0.4292639269340591, + "learning_rate": 3.517330632182414e-06, + "loss": 0.5643, + "step": 8347 + }, + { + "epoch": 2.2161157573343955, + "grad_norm": 0.43544551354561456, + "learning_rate": 3.5170117084546797e-06, + "loss": 0.5493, + "step": 8348 + }, + { + "epoch": 2.216381255807779, + "grad_norm": 0.4626442194391668, + "learning_rate": 3.5166927648927267e-06, + "loss": 0.5628, + "step": 8349 + }, + { + "epoch": 2.2166467542811628, + "grad_norm": 0.44386423900366967, + "learning_rate": 3.516373801502776e-06, + "loss": 0.576, + "step": 8350 + }, + { + "epoch": 2.216912252754547, + "grad_norm": 0.4262693813730813, + "learning_rate": 3.516054818291048e-06, + "loss": 0.5561, + "step": 8351 + }, + { + "epoch": 2.2171777512279305, + "grad_norm": 0.42048196950630246, + "learning_rate": 3.5157358152637623e-06, + "loss": 0.5693, + "step": 8352 + }, + { + "epoch": 2.217443249701314, + "grad_norm": 0.4368012279693426, + "learning_rate": 3.515416792427142e-06, + "loss": 0.5329, + "step": 8353 + }, + { + "epoch": 2.217708748174698, + "grad_norm": 0.42705886435995977, + "learning_rate": 3.5150977497874084e-06, + "loss": 0.5439, + "step": 8354 + }, + { + "epoch": 2.217974246648082, + "grad_norm": 0.43666914858250333, + "learning_rate": 3.5147786873507823e-06, + "loss": 0.5806, + "step": 8355 + }, + { + "epoch": 2.2182397451214655, + "grad_norm": 0.42809537616294396, + "learning_rate": 3.5144596051234875e-06, + "loss": 0.5895, + "step": 8356 + }, + { + "epoch": 2.218505243594849, + "grad_norm": 0.42874888281055884, + "learning_rate": 3.514140503111747e-06, + "loss": 0.5556, + "step": 8357 + }, + { + "epoch": 2.218770742068233, + "grad_norm": 0.4456059422685089, + "learning_rate": 3.5138213813217832e-06, + "loss": 0.508, + "step": 8358 + }, + { + "epoch": 2.219036240541617, + "grad_norm": 0.42721250452036336, + "learning_rate": 3.513502239759821e-06, + "loss": 0.5671, + "step": 8359 + }, + { + "epoch": 2.2193017390150005, + "grad_norm": 0.41772426094133414, + "learning_rate": 3.5131830784320824e-06, + "loss": 0.5502, + "step": 8360 + }, + { + "epoch": 2.2195672374883846, + "grad_norm": 0.43000282293241876, + "learning_rate": 3.5128638973447937e-06, + "loss": 0.5762, + "step": 8361 + }, + { + "epoch": 2.219832735961768, + "grad_norm": 0.4356162598757294, + "learning_rate": 3.512544696504179e-06, + "loss": 0.5539, + "step": 8362 + }, + { + "epoch": 2.220098234435152, + "grad_norm": 0.4340345631306146, + "learning_rate": 3.5122254759164644e-06, + "loss": 0.5562, + "step": 8363 + }, + { + "epoch": 2.220363732908536, + "grad_norm": 0.44443487464485515, + "learning_rate": 3.511906235587873e-06, + "loss": 0.5678, + "step": 8364 + }, + { + "epoch": 2.2206292313819196, + "grad_norm": 0.428294238127621, + "learning_rate": 3.511586975524634e-06, + "loss": 0.5414, + "step": 8365 + }, + { + "epoch": 2.2208947298553032, + "grad_norm": 0.43479708468158773, + "learning_rate": 3.511267695732971e-06, + "loss": 0.5721, + "step": 8366 + }, + { + "epoch": 2.2211602283286873, + "grad_norm": 0.4546812245249009, + "learning_rate": 3.5109483962191125e-06, + "loss": 0.5738, + "step": 8367 + }, + { + "epoch": 2.221425726802071, + "grad_norm": 0.4437203104892749, + "learning_rate": 3.5106290769892853e-06, + "loss": 0.5647, + "step": 8368 + }, + { + "epoch": 2.2216912252754546, + "grad_norm": 0.4295385350730987, + "learning_rate": 3.5103097380497162e-06, + "loss": 0.5914, + "step": 8369 + }, + { + "epoch": 2.2219567237488382, + "grad_norm": 0.4074080012729898, + "learning_rate": 3.5099903794066336e-06, + "loss": 0.5078, + "step": 8370 + }, + { + "epoch": 2.2222222222222223, + "grad_norm": 0.4344864084178576, + "learning_rate": 3.509671001066266e-06, + "loss": 0.5773, + "step": 8371 + }, + { + "epoch": 2.222487720695606, + "grad_norm": 0.4316399878979075, + "learning_rate": 3.509351603034842e-06, + "loss": 0.5633, + "step": 8372 + }, + { + "epoch": 2.2227532191689896, + "grad_norm": 0.44313824441204963, + "learning_rate": 3.5090321853185894e-06, + "loss": 0.5839, + "step": 8373 + }, + { + "epoch": 2.2230187176423737, + "grad_norm": 0.4293862377099834, + "learning_rate": 3.50871274792374e-06, + "loss": 0.5664, + "step": 8374 + }, + { + "epoch": 2.2232842161157573, + "grad_norm": 0.4379194728789382, + "learning_rate": 3.508393290856521e-06, + "loss": 0.5422, + "step": 8375 + }, + { + "epoch": 2.223549714589141, + "grad_norm": 0.42577918884594324, + "learning_rate": 3.5080738141231645e-06, + "loss": 0.5579, + "step": 8376 + }, + { + "epoch": 2.223815213062525, + "grad_norm": 0.44540219764276706, + "learning_rate": 3.5077543177299e-06, + "loss": 0.5511, + "step": 8377 + }, + { + "epoch": 2.2240807115359087, + "grad_norm": 0.4264588727183936, + "learning_rate": 3.5074348016829597e-06, + "loss": 0.565, + "step": 8378 + }, + { + "epoch": 2.2243462100092923, + "grad_norm": 0.4316595244470432, + "learning_rate": 3.507115265988574e-06, + "loss": 0.5746, + "step": 8379 + }, + { + "epoch": 2.2246117084826764, + "grad_norm": 0.4235960961538575, + "learning_rate": 3.506795710652974e-06, + "loss": 0.5833, + "step": 8380 + }, + { + "epoch": 2.22487720695606, + "grad_norm": 0.44094632257166927, + "learning_rate": 3.5064761356823928e-06, + "loss": 0.6209, + "step": 8381 + }, + { + "epoch": 2.2251427054294437, + "grad_norm": 0.43893112092169484, + "learning_rate": 3.5061565410830632e-06, + "loss": 0.5618, + "step": 8382 + }, + { + "epoch": 2.225408203902828, + "grad_norm": 0.43910983505882245, + "learning_rate": 3.5058369268612177e-06, + "loss": 0.5783, + "step": 8383 + }, + { + "epoch": 2.2256737023762114, + "grad_norm": 0.40921972468896206, + "learning_rate": 3.5055172930230885e-06, + "loss": 0.5603, + "step": 8384 + }, + { + "epoch": 2.225939200849595, + "grad_norm": 0.41199965500912983, + "learning_rate": 3.5051976395749104e-06, + "loss": 0.5544, + "step": 8385 + }, + { + "epoch": 2.2262046993229787, + "grad_norm": 0.4363581060915745, + "learning_rate": 3.504877966522917e-06, + "loss": 0.5532, + "step": 8386 + }, + { + "epoch": 2.226470197796363, + "grad_norm": 0.4241502466672741, + "learning_rate": 3.5045582738733435e-06, + "loss": 0.5946, + "step": 8387 + }, + { + "epoch": 2.2267356962697464, + "grad_norm": 0.42268410126011297, + "learning_rate": 3.5042385616324243e-06, + "loss": 0.5882, + "step": 8388 + }, + { + "epoch": 2.22700119474313, + "grad_norm": 0.41587079027519275, + "learning_rate": 3.5039188298063936e-06, + "loss": 0.5764, + "step": 8389 + }, + { + "epoch": 2.227266693216514, + "grad_norm": 0.43309899004353225, + "learning_rate": 3.5035990784014877e-06, + "loss": 0.5546, + "step": 8390 + }, + { + "epoch": 2.227532191689898, + "grad_norm": 0.43092205022114005, + "learning_rate": 3.5032793074239434e-06, + "loss": 0.5938, + "step": 8391 + }, + { + "epoch": 2.2277976901632814, + "grad_norm": 0.4399142930259134, + "learning_rate": 3.5029595168799955e-06, + "loss": 0.6215, + "step": 8392 + }, + { + "epoch": 2.2280631886366655, + "grad_norm": 0.42700584132387315, + "learning_rate": 3.502639706775881e-06, + "loss": 0.5604, + "step": 8393 + }, + { + "epoch": 2.228328687110049, + "grad_norm": 0.4218062317008268, + "learning_rate": 3.502319877117838e-06, + "loss": 0.5358, + "step": 8394 + }, + { + "epoch": 2.228594185583433, + "grad_norm": 0.4020563061483536, + "learning_rate": 3.502000027912103e-06, + "loss": 0.5333, + "step": 8395 + }, + { + "epoch": 2.2288596840568164, + "grad_norm": 0.4173276985188629, + "learning_rate": 3.501680159164914e-06, + "loss": 0.5708, + "step": 8396 + }, + { + "epoch": 2.2291251825302005, + "grad_norm": 0.4260525630596005, + "learning_rate": 3.5013602708825102e-06, + "loss": 0.5484, + "step": 8397 + }, + { + "epoch": 2.229390681003584, + "grad_norm": 0.4279832470834054, + "learning_rate": 3.5010403630711288e-06, + "loss": 0.5622, + "step": 8398 + }, + { + "epoch": 2.229656179476968, + "grad_norm": 0.4326158327036861, + "learning_rate": 3.5007204357370088e-06, + "loss": 0.5992, + "step": 8399 + }, + { + "epoch": 2.229921677950352, + "grad_norm": 0.44446908935812274, + "learning_rate": 3.500400488886391e-06, + "loss": 0.5598, + "step": 8400 + }, + { + "epoch": 2.2301871764237355, + "grad_norm": 0.44431110876211555, + "learning_rate": 3.5000805225255137e-06, + "loss": 0.5503, + "step": 8401 + }, + { + "epoch": 2.230452674897119, + "grad_norm": 0.4267874879091476, + "learning_rate": 3.499760536660618e-06, + "loss": 0.5321, + "step": 8402 + }, + { + "epoch": 2.2307181733705033, + "grad_norm": 0.43779091362781397, + "learning_rate": 3.4994405312979435e-06, + "loss": 0.5884, + "step": 8403 + }, + { + "epoch": 2.230983671843887, + "grad_norm": 0.43100017949898217, + "learning_rate": 3.4991205064437315e-06, + "loss": 0.5664, + "step": 8404 + }, + { + "epoch": 2.2312491703172705, + "grad_norm": 0.43140197524130475, + "learning_rate": 3.4988004621042245e-06, + "loss": 0.5873, + "step": 8405 + }, + { + "epoch": 2.2315146687906546, + "grad_norm": 0.4383920729388004, + "learning_rate": 3.4984803982856617e-06, + "loss": 0.5821, + "step": 8406 + }, + { + "epoch": 2.2317801672640383, + "grad_norm": 0.43647495617701904, + "learning_rate": 3.4981603149942867e-06, + "loss": 0.5725, + "step": 8407 + }, + { + "epoch": 2.232045665737422, + "grad_norm": 0.42724851875804976, + "learning_rate": 3.4978402122363425e-06, + "loss": 0.5707, + "step": 8408 + }, + { + "epoch": 2.232311164210806, + "grad_norm": 0.4366552943288066, + "learning_rate": 3.49752009001807e-06, + "loss": 0.5336, + "step": 8409 + }, + { + "epoch": 2.2325766626841896, + "grad_norm": 0.42268126300556375, + "learning_rate": 3.4971999483457144e-06, + "loss": 0.5606, + "step": 8410 + }, + { + "epoch": 2.2328421611575733, + "grad_norm": 0.4260872511726609, + "learning_rate": 3.496879787225518e-06, + "loss": 0.6096, + "step": 8411 + }, + { + "epoch": 2.233107659630957, + "grad_norm": 0.4169713035669326, + "learning_rate": 3.4965596066637235e-06, + "loss": 0.6025, + "step": 8412 + }, + { + "epoch": 2.233373158104341, + "grad_norm": 0.43080271560299715, + "learning_rate": 3.4962394066665778e-06, + "loss": 0.5622, + "step": 8413 + }, + { + "epoch": 2.2336386565777246, + "grad_norm": 0.4316169795396076, + "learning_rate": 3.495919187240325e-06, + "loss": 0.5627, + "step": 8414 + }, + { + "epoch": 2.2339041550511083, + "grad_norm": 0.43616716114427484, + "learning_rate": 3.4955989483912086e-06, + "loss": 0.5786, + "step": 8415 + }, + { + "epoch": 2.2341696535244924, + "grad_norm": 0.4428562149461648, + "learning_rate": 3.4952786901254754e-06, + "loss": 0.5784, + "step": 8416 + }, + { + "epoch": 2.234435151997876, + "grad_norm": 0.4447422787496083, + "learning_rate": 3.4949584124493712e-06, + "loss": 0.5511, + "step": 8417 + }, + { + "epoch": 2.2347006504712597, + "grad_norm": 0.4202045889292342, + "learning_rate": 3.4946381153691415e-06, + "loss": 0.5561, + "step": 8418 + }, + { + "epoch": 2.2349661489446437, + "grad_norm": 0.44377552102135, + "learning_rate": 3.494317798891034e-06, + "loss": 0.5705, + "step": 8419 + }, + { + "epoch": 2.2352316474180274, + "grad_norm": 0.4368560959490654, + "learning_rate": 3.493997463021294e-06, + "loss": 0.5848, + "step": 8420 + }, + { + "epoch": 2.235497145891411, + "grad_norm": 0.44062910768878577, + "learning_rate": 3.4936771077661703e-06, + "loss": 0.602, + "step": 8421 + }, + { + "epoch": 2.235762644364795, + "grad_norm": 0.4334399380264875, + "learning_rate": 3.4933567331319092e-06, + "loss": 0.5582, + "step": 8422 + }, + { + "epoch": 2.2360281428381787, + "grad_norm": 0.43395549333355654, + "learning_rate": 3.493036339124761e-06, + "loss": 0.5852, + "step": 8423 + }, + { + "epoch": 2.2362936413115624, + "grad_norm": 0.4236788773462707, + "learning_rate": 3.4927159257509715e-06, + "loss": 0.5992, + "step": 8424 + }, + { + "epoch": 2.236559139784946, + "grad_norm": 0.42910042262345455, + "learning_rate": 3.4923954930167914e-06, + "loss": 0.5663, + "step": 8425 + }, + { + "epoch": 2.23682463825833, + "grad_norm": 0.43484517357838604, + "learning_rate": 3.4920750409284697e-06, + "loss": 0.5856, + "step": 8426 + }, + { + "epoch": 2.2370901367317138, + "grad_norm": 0.4392414055927907, + "learning_rate": 3.4917545694922558e-06, + "loss": 0.6267, + "step": 8427 + }, + { + "epoch": 2.2373556352050974, + "grad_norm": 0.44459950273924503, + "learning_rate": 3.491434078714399e-06, + "loss": 0.5432, + "step": 8428 + }, + { + "epoch": 2.2376211336784815, + "grad_norm": 0.4510047451879924, + "learning_rate": 3.4911135686011503e-06, + "loss": 0.5169, + "step": 8429 + }, + { + "epoch": 2.237886632151865, + "grad_norm": 0.42857302178086804, + "learning_rate": 3.4907930391587606e-06, + "loss": 0.5865, + "step": 8430 + }, + { + "epoch": 2.2381521306252488, + "grad_norm": 0.4354733485790215, + "learning_rate": 3.4904724903934805e-06, + "loss": 0.5992, + "step": 8431 + }, + { + "epoch": 2.238417629098633, + "grad_norm": 0.4808390444400206, + "learning_rate": 3.4901519223115615e-06, + "loss": 0.552, + "step": 8432 + }, + { + "epoch": 2.2386831275720165, + "grad_norm": 0.487107469131134, + "learning_rate": 3.489831334919256e-06, + "loss": 0.5911, + "step": 8433 + }, + { + "epoch": 2.2389486260454, + "grad_norm": 0.42860446638123045, + "learning_rate": 3.4895107282228164e-06, + "loss": 0.5573, + "step": 8434 + }, + { + "epoch": 2.239214124518784, + "grad_norm": 0.43268622255302314, + "learning_rate": 3.4891901022284946e-06, + "loss": 0.5883, + "step": 8435 + }, + { + "epoch": 2.239479622992168, + "grad_norm": 0.43744792851396774, + "learning_rate": 3.4888694569425434e-06, + "loss": 0.5458, + "step": 8436 + }, + { + "epoch": 2.2397451214655515, + "grad_norm": 0.4313242608517905, + "learning_rate": 3.4885487923712174e-06, + "loss": 0.5748, + "step": 8437 + }, + { + "epoch": 2.2400106199389356, + "grad_norm": 0.4447843445986176, + "learning_rate": 3.488228108520769e-06, + "loss": 0.5502, + "step": 8438 + }, + { + "epoch": 2.240276118412319, + "grad_norm": 0.4386784978531559, + "learning_rate": 3.4879074053974526e-06, + "loss": 0.5464, + "step": 8439 + }, + { + "epoch": 2.240541616885703, + "grad_norm": 0.42300779698232555, + "learning_rate": 3.4875866830075233e-06, + "loss": 0.5665, + "step": 8440 + }, + { + "epoch": 2.2408071153590865, + "grad_norm": 0.43618717888098046, + "learning_rate": 3.4872659413572358e-06, + "loss": 0.552, + "step": 8441 + }, + { + "epoch": 2.2410726138324706, + "grad_norm": 0.4421362805617268, + "learning_rate": 3.486945180452845e-06, + "loss": 0.5928, + "step": 8442 + }, + { + "epoch": 2.2413381123058542, + "grad_norm": 0.4360589976311463, + "learning_rate": 3.486624400300607e-06, + "loss": 0.5553, + "step": 8443 + }, + { + "epoch": 2.241603610779238, + "grad_norm": 0.4283297760567781, + "learning_rate": 3.4863036009067775e-06, + "loss": 0.577, + "step": 8444 + }, + { + "epoch": 2.241869109252622, + "grad_norm": 0.4304774597555228, + "learning_rate": 3.4859827822776127e-06, + "loss": 0.5724, + "step": 8445 + }, + { + "epoch": 2.2421346077260056, + "grad_norm": 0.41816395848491394, + "learning_rate": 3.4856619444193694e-06, + "loss": 0.5676, + "step": 8446 + }, + { + "epoch": 2.2424001061993892, + "grad_norm": 0.42472783781238577, + "learning_rate": 3.4853410873383054e-06, + "loss": 0.5883, + "step": 8447 + }, + { + "epoch": 2.2426656046727733, + "grad_norm": 0.4252652818311552, + "learning_rate": 3.4850202110406773e-06, + "loss": 0.5832, + "step": 8448 + }, + { + "epoch": 2.242931103146157, + "grad_norm": 0.4270459141998556, + "learning_rate": 3.484699315532743e-06, + "loss": 0.5913, + "step": 8449 + }, + { + "epoch": 2.2431966016195406, + "grad_norm": 0.4189839876235439, + "learning_rate": 3.4843784008207614e-06, + "loss": 0.5721, + "step": 8450 + }, + { + "epoch": 2.2434621000929242, + "grad_norm": 0.43433407007206165, + "learning_rate": 3.4840574669109906e-06, + "loss": 0.5918, + "step": 8451 + }, + { + "epoch": 2.2437275985663083, + "grad_norm": 0.43364892076480527, + "learning_rate": 3.4837365138096906e-06, + "loss": 0.4985, + "step": 8452 + }, + { + "epoch": 2.243993097039692, + "grad_norm": 0.43427856578928115, + "learning_rate": 3.48341554152312e-06, + "loss": 0.5889, + "step": 8453 + }, + { + "epoch": 2.2442585955130756, + "grad_norm": 0.4432576798781629, + "learning_rate": 3.4830945500575373e-06, + "loss": 0.5798, + "step": 8454 + }, + { + "epoch": 2.2445240939864597, + "grad_norm": 0.4343245018863736, + "learning_rate": 3.482773539419205e-06, + "loss": 0.5774, + "step": 8455 + }, + { + "epoch": 2.2447895924598433, + "grad_norm": 0.43857212956487013, + "learning_rate": 3.4824525096143815e-06, + "loss": 0.5666, + "step": 8456 + }, + { + "epoch": 2.245055090933227, + "grad_norm": 0.4307807419805649, + "learning_rate": 3.4821314606493294e-06, + "loss": 0.5721, + "step": 8457 + }, + { + "epoch": 2.245320589406611, + "grad_norm": 0.4305290261024878, + "learning_rate": 3.481810392530308e-06, + "loss": 0.5545, + "step": 8458 + }, + { + "epoch": 2.2455860878799947, + "grad_norm": 0.4095779568327294, + "learning_rate": 3.481489305263581e-06, + "loss": 0.5491, + "step": 8459 + }, + { + "epoch": 2.2458515863533783, + "grad_norm": 0.4282508920274349, + "learning_rate": 3.4811681988554095e-06, + "loss": 0.562, + "step": 8460 + }, + { + "epoch": 2.2461170848267624, + "grad_norm": 0.42366582840713984, + "learning_rate": 3.480847073312056e-06, + "loss": 0.567, + "step": 8461 + }, + { + "epoch": 2.246382583300146, + "grad_norm": 0.4218657841167782, + "learning_rate": 3.4805259286397826e-06, + "loss": 0.5412, + "step": 8462 + }, + { + "epoch": 2.2466480817735297, + "grad_norm": 0.44488732581491075, + "learning_rate": 3.480204764844853e-06, + "loss": 0.5924, + "step": 8463 + }, + { + "epoch": 2.246913580246914, + "grad_norm": 0.4207054773093573, + "learning_rate": 3.4798835819335302e-06, + "loss": 0.5459, + "step": 8464 + }, + { + "epoch": 2.2471790787202974, + "grad_norm": 0.43785040096124667, + "learning_rate": 3.479562379912079e-06, + "loss": 0.5719, + "step": 8465 + }, + { + "epoch": 2.247444577193681, + "grad_norm": 0.42374473645492894, + "learning_rate": 3.4792411587867624e-06, + "loss": 0.5255, + "step": 8466 + }, + { + "epoch": 2.2477100756670647, + "grad_norm": 0.42372921167139677, + "learning_rate": 3.478919918563846e-06, + "loss": 0.5785, + "step": 8467 + }, + { + "epoch": 2.247975574140449, + "grad_norm": 0.4335511307988954, + "learning_rate": 3.4785986592495934e-06, + "loss": 0.5913, + "step": 8468 + }, + { + "epoch": 2.2482410726138324, + "grad_norm": 0.4164595517929875, + "learning_rate": 3.4782773808502723e-06, + "loss": 0.541, + "step": 8469 + }, + { + "epoch": 2.248506571087216, + "grad_norm": 0.42150552189659524, + "learning_rate": 3.4779560833721465e-06, + "loss": 0.5736, + "step": 8470 + }, + { + "epoch": 2.2487720695606, + "grad_norm": 0.42220024200623407, + "learning_rate": 3.4776347668214827e-06, + "loss": 0.5525, + "step": 8471 + }, + { + "epoch": 2.249037568033984, + "grad_norm": 0.4240358801598312, + "learning_rate": 3.4773134312045477e-06, + "loss": 0.5735, + "step": 8472 + }, + { + "epoch": 2.2493030665073674, + "grad_norm": 0.419433254850018, + "learning_rate": 3.4769920765276066e-06, + "loss": 0.5596, + "step": 8473 + }, + { + "epoch": 2.2495685649807515, + "grad_norm": 0.42030087495296353, + "learning_rate": 3.4766707027969287e-06, + "loss": 0.5292, + "step": 8474 + }, + { + "epoch": 2.249834063454135, + "grad_norm": 0.4360122013527655, + "learning_rate": 3.4763493100187813e-06, + "loss": 0.5923, + "step": 8475 + }, + { + "epoch": 2.250099561927519, + "grad_norm": 0.4383475295171756, + "learning_rate": 3.476027898199431e-06, + "loss": 0.5642, + "step": 8476 + }, + { + "epoch": 2.2503650604009025, + "grad_norm": 0.43993794768516414, + "learning_rate": 3.4757064673451476e-06, + "loss": 0.5548, + "step": 8477 + }, + { + "epoch": 2.2506305588742865, + "grad_norm": 0.4317676688889174, + "learning_rate": 3.4753850174621983e-06, + "loss": 0.5622, + "step": 8478 + }, + { + "epoch": 2.25089605734767, + "grad_norm": 0.43157829445075135, + "learning_rate": 3.475063548556854e-06, + "loss": 0.5484, + "step": 8479 + }, + { + "epoch": 2.251161555821054, + "grad_norm": 0.43973720973354885, + "learning_rate": 3.4747420606353828e-06, + "loss": 0.539, + "step": 8480 + }, + { + "epoch": 2.251427054294438, + "grad_norm": 0.4373353112850105, + "learning_rate": 3.4744205537040547e-06, + "loss": 0.5857, + "step": 8481 + }, + { + "epoch": 2.2516925527678215, + "grad_norm": 0.4322246048265512, + "learning_rate": 3.47409902776914e-06, + "loss": 0.5892, + "step": 8482 + }, + { + "epoch": 2.251958051241205, + "grad_norm": 0.4308019965712213, + "learning_rate": 3.4737774828369096e-06, + "loss": 0.5533, + "step": 8483 + }, + { + "epoch": 2.2522235497145893, + "grad_norm": 0.4274043010276875, + "learning_rate": 3.4734559189136337e-06, + "loss": 0.5349, + "step": 8484 + }, + { + "epoch": 2.252489048187973, + "grad_norm": 0.44018431430350496, + "learning_rate": 3.4731343360055835e-06, + "loss": 0.5583, + "step": 8485 + }, + { + "epoch": 2.2527545466613565, + "grad_norm": 0.44283943872646336, + "learning_rate": 3.472812734119032e-06, + "loss": 0.5221, + "step": 8486 + }, + { + "epoch": 2.2530200451347406, + "grad_norm": 0.42531913118141246, + "learning_rate": 3.4724911132602497e-06, + "loss": 0.5842, + "step": 8487 + }, + { + "epoch": 2.2532855436081243, + "grad_norm": 0.431922880906971, + "learning_rate": 3.4721694734355097e-06, + "loss": 0.5612, + "step": 8488 + }, + { + "epoch": 2.253551042081508, + "grad_norm": 0.44233742161464473, + "learning_rate": 3.4718478146510854e-06, + "loss": 0.5788, + "step": 8489 + }, + { + "epoch": 2.253816540554892, + "grad_norm": 0.435427261646667, + "learning_rate": 3.4715261369132486e-06, + "loss": 0.5779, + "step": 8490 + }, + { + "epoch": 2.2540820390282756, + "grad_norm": 0.43381754174498666, + "learning_rate": 3.471204440228273e-06, + "loss": 0.5239, + "step": 8491 + }, + { + "epoch": 2.2543475375016593, + "grad_norm": 0.4282838580195437, + "learning_rate": 3.470882724602434e-06, + "loss": 0.5169, + "step": 8492 + }, + { + "epoch": 2.2546130359750434, + "grad_norm": 0.43671304507945535, + "learning_rate": 3.470560990042004e-06, + "loss": 0.5639, + "step": 8493 + }, + { + "epoch": 2.254878534448427, + "grad_norm": 0.4189419053178972, + "learning_rate": 3.4702392365532585e-06, + "loss": 0.5359, + "step": 8494 + }, + { + "epoch": 2.2551440329218106, + "grad_norm": 0.42895566325218754, + "learning_rate": 3.469917464142472e-06, + "loss": 0.5482, + "step": 8495 + }, + { + "epoch": 2.2554095313951943, + "grad_norm": 0.4488541438777685, + "learning_rate": 3.4695956728159208e-06, + "loss": 0.5922, + "step": 8496 + }, + { + "epoch": 2.2556750298685784, + "grad_norm": 0.4415913834987696, + "learning_rate": 3.4692738625798794e-06, + "loss": 0.5406, + "step": 8497 + }, + { + "epoch": 2.255940528341962, + "grad_norm": 0.4240872696874829, + "learning_rate": 3.468952033440625e-06, + "loss": 0.5512, + "step": 8498 + }, + { + "epoch": 2.2562060268153457, + "grad_norm": 0.430669657574303, + "learning_rate": 3.4686301854044335e-06, + "loss": 0.5602, + "step": 8499 + }, + { + "epoch": 2.2564715252887297, + "grad_norm": 0.4277164877732677, + "learning_rate": 3.468308318477582e-06, + "loss": 0.5646, + "step": 8500 + }, + { + "epoch": 2.2567370237621134, + "grad_norm": 0.4432112090962664, + "learning_rate": 3.467986432666347e-06, + "loss": 0.5514, + "step": 8501 + }, + { + "epoch": 2.257002522235497, + "grad_norm": 0.4293313486671919, + "learning_rate": 3.467664527977006e-06, + "loss": 0.5716, + "step": 8502 + }, + { + "epoch": 2.257268020708881, + "grad_norm": 0.4214129461685631, + "learning_rate": 3.4673426044158383e-06, + "loss": 0.5567, + "step": 8503 + }, + { + "epoch": 2.2575335191822647, + "grad_norm": 0.42392837142685924, + "learning_rate": 3.467020661989121e-06, + "loss": 0.5541, + "step": 8504 + }, + { + "epoch": 2.2577990176556484, + "grad_norm": 0.4413780549515595, + "learning_rate": 3.4666987007031328e-06, + "loss": 0.566, + "step": 8505 + }, + { + "epoch": 2.258064516129032, + "grad_norm": 0.4276301304509837, + "learning_rate": 3.4663767205641542e-06, + "loss": 0.5415, + "step": 8506 + }, + { + "epoch": 2.258330014602416, + "grad_norm": 0.4236588998327288, + "learning_rate": 3.4660547215784622e-06, + "loss": 0.5871, + "step": 8507 + }, + { + "epoch": 2.2585955130757998, + "grad_norm": 0.4341673134835704, + "learning_rate": 3.465732703752338e-06, + "loss": 0.5618, + "step": 8508 + }, + { + "epoch": 2.2588610115491834, + "grad_norm": 0.4134262277626446, + "learning_rate": 3.465410667092063e-06, + "loss": 0.5583, + "step": 8509 + }, + { + "epoch": 2.2591265100225675, + "grad_norm": 0.4382664778368452, + "learning_rate": 3.4650886116039145e-06, + "loss": 0.552, + "step": 8510 + }, + { + "epoch": 2.259392008495951, + "grad_norm": 0.41936945793274105, + "learning_rate": 3.4647665372941757e-06, + "loss": 0.5453, + "step": 8511 + }, + { + "epoch": 2.2596575069693348, + "grad_norm": 0.42806826429789735, + "learning_rate": 3.4644444441691276e-06, + "loss": 0.5861, + "step": 8512 + }, + { + "epoch": 2.259923005442719, + "grad_norm": 0.43602259223378537, + "learning_rate": 3.4641223322350513e-06, + "loss": 0.5837, + "step": 8513 + }, + { + "epoch": 2.2601885039161025, + "grad_norm": 0.4220217377826108, + "learning_rate": 3.4638002014982286e-06, + "loss": 0.5752, + "step": 8514 + }, + { + "epoch": 2.260454002389486, + "grad_norm": 0.42791868265428396, + "learning_rate": 3.4634780519649425e-06, + "loss": 0.5585, + "step": 8515 + }, + { + "epoch": 2.26071950086287, + "grad_norm": 0.4402367817049422, + "learning_rate": 3.4631558836414753e-06, + "loss": 0.5341, + "step": 8516 + }, + { + "epoch": 2.260984999336254, + "grad_norm": 0.4230092355063566, + "learning_rate": 3.46283369653411e-06, + "loss": 0.5485, + "step": 8517 + }, + { + "epoch": 2.2612504978096375, + "grad_norm": 0.4331083590954356, + "learning_rate": 3.4625114906491307e-06, + "loss": 0.5384, + "step": 8518 + }, + { + "epoch": 2.2615159962830216, + "grad_norm": 0.44429589678732834, + "learning_rate": 3.4621892659928204e-06, + "loss": 0.5528, + "step": 8519 + }, + { + "epoch": 2.261781494756405, + "grad_norm": 0.42342356479441795, + "learning_rate": 3.4618670225714633e-06, + "loss": 0.5763, + "step": 8520 + }, + { + "epoch": 2.262046993229789, + "grad_norm": 0.43625814354014386, + "learning_rate": 3.461544760391345e-06, + "loss": 0.5576, + "step": 8521 + }, + { + "epoch": 2.262312491703173, + "grad_norm": 0.4445498637903862, + "learning_rate": 3.461222479458749e-06, + "loss": 0.5693, + "step": 8522 + }, + { + "epoch": 2.2625779901765566, + "grad_norm": 0.4393516661996358, + "learning_rate": 3.4609001797799607e-06, + "loss": 0.5693, + "step": 8523 + }, + { + "epoch": 2.2628434886499402, + "grad_norm": 0.43706053978286297, + "learning_rate": 3.4605778613612668e-06, + "loss": 0.5801, + "step": 8524 + }, + { + "epoch": 2.263108987123324, + "grad_norm": 0.434759604905694, + "learning_rate": 3.4602555242089526e-06, + "loss": 0.5842, + "step": 8525 + }, + { + "epoch": 2.263374485596708, + "grad_norm": 0.43038273658022436, + "learning_rate": 3.4599331683293045e-06, + "loss": 0.5477, + "step": 8526 + }, + { + "epoch": 2.2636399840700916, + "grad_norm": 0.43737407037190174, + "learning_rate": 3.4596107937286092e-06, + "loss": 0.6033, + "step": 8527 + }, + { + "epoch": 2.2639054825434752, + "grad_norm": 0.4261202057507047, + "learning_rate": 3.459288400413154e-06, + "loss": 0.5599, + "step": 8528 + }, + { + "epoch": 2.2641709810168593, + "grad_norm": 0.4285000501431998, + "learning_rate": 3.458965988389227e-06, + "loss": 0.5613, + "step": 8529 + }, + { + "epoch": 2.264436479490243, + "grad_norm": 0.43507289319758524, + "learning_rate": 3.4586435576631143e-06, + "loss": 0.5365, + "step": 8530 + }, + { + "epoch": 2.2647019779636266, + "grad_norm": 0.4157881800965343, + "learning_rate": 3.4583211082411057e-06, + "loss": 0.5314, + "step": 8531 + }, + { + "epoch": 2.2649674764370102, + "grad_norm": 0.4263637061781523, + "learning_rate": 3.4579986401294884e-06, + "loss": 0.5594, + "step": 8532 + }, + { + "epoch": 2.2652329749103943, + "grad_norm": 0.439673133839381, + "learning_rate": 3.4576761533345528e-06, + "loss": 0.6089, + "step": 8533 + }, + { + "epoch": 2.265498473383778, + "grad_norm": 0.4199750838291158, + "learning_rate": 3.4573536478625864e-06, + "loss": 0.5481, + "step": 8534 + }, + { + "epoch": 2.2657639718571616, + "grad_norm": 0.42777124943943723, + "learning_rate": 3.457031123719881e-06, + "loss": 0.5753, + "step": 8535 + }, + { + "epoch": 2.2660294703305457, + "grad_norm": 0.4254042268643367, + "learning_rate": 3.4567085809127247e-06, + "loss": 0.574, + "step": 8536 + }, + { + "epoch": 2.2662949688039293, + "grad_norm": 0.4287111813410994, + "learning_rate": 3.4563860194474086e-06, + "loss": 0.5487, + "step": 8537 + }, + { + "epoch": 2.266560467277313, + "grad_norm": 0.4332528353055577, + "learning_rate": 3.4560634393302246e-06, + "loss": 0.5527, + "step": 8538 + }, + { + "epoch": 2.266825965750697, + "grad_norm": 0.4381917553079303, + "learning_rate": 3.455740840567462e-06, + "loss": 0.535, + "step": 8539 + }, + { + "epoch": 2.2670914642240807, + "grad_norm": 0.44533968851103484, + "learning_rate": 3.455418223165412e-06, + "loss": 0.5692, + "step": 8540 + }, + { + "epoch": 2.2673569626974643, + "grad_norm": 0.4359696149096592, + "learning_rate": 3.455095587130368e-06, + "loss": 0.5792, + "step": 8541 + }, + { + "epoch": 2.2676224611708484, + "grad_norm": 0.4389674947402785, + "learning_rate": 3.454772932468622e-06, + "loss": 0.605, + "step": 8542 + }, + { + "epoch": 2.267887959644232, + "grad_norm": 0.43349729198072795, + "learning_rate": 3.454450259186466e-06, + "loss": 0.5919, + "step": 8543 + }, + { + "epoch": 2.2681534581176157, + "grad_norm": 0.43716179248404385, + "learning_rate": 3.454127567290193e-06, + "loss": 0.5726, + "step": 8544 + }, + { + "epoch": 2.268418956591, + "grad_norm": 0.42688664797397036, + "learning_rate": 3.4538048567860967e-06, + "loss": 0.5961, + "step": 8545 + }, + { + "epoch": 2.2686844550643834, + "grad_norm": 0.44654345742528595, + "learning_rate": 3.4534821276804695e-06, + "loss": 0.5828, + "step": 8546 + }, + { + "epoch": 2.268949953537767, + "grad_norm": 0.42852625289224566, + "learning_rate": 3.453159379979607e-06, + "loss": 0.5608, + "step": 8547 + }, + { + "epoch": 2.269215452011151, + "grad_norm": 0.4183610378943299, + "learning_rate": 3.452836613689803e-06, + "loss": 0.573, + "step": 8548 + }, + { + "epoch": 2.269480950484535, + "grad_norm": 0.4309301722327577, + "learning_rate": 3.452513828817351e-06, + "loss": 0.5735, + "step": 8549 + }, + { + "epoch": 2.2697464489579184, + "grad_norm": 0.43225726530680536, + "learning_rate": 3.452191025368548e-06, + "loss": 0.5525, + "step": 8550 + }, + { + "epoch": 2.270011947431302, + "grad_norm": 0.45698451243453, + "learning_rate": 3.4518682033496886e-06, + "loss": 0.5299, + "step": 8551 + }, + { + "epoch": 2.270277445904686, + "grad_norm": 0.4143690839663798, + "learning_rate": 3.4515453627670687e-06, + "loss": 0.578, + "step": 8552 + }, + { + "epoch": 2.27054294437807, + "grad_norm": 0.42611166357054314, + "learning_rate": 3.451222503626984e-06, + "loss": 0.4983, + "step": 8553 + }, + { + "epoch": 2.2708084428514534, + "grad_norm": 0.42408471335708403, + "learning_rate": 3.4508996259357318e-06, + "loss": 0.5485, + "step": 8554 + }, + { + "epoch": 2.2710739413248375, + "grad_norm": 0.4369268750561684, + "learning_rate": 3.4505767296996088e-06, + "loss": 0.5571, + "step": 8555 + }, + { + "epoch": 2.271339439798221, + "grad_norm": 0.4372695574096089, + "learning_rate": 3.450253814924912e-06, + "loss": 0.5493, + "step": 8556 + }, + { + "epoch": 2.271604938271605, + "grad_norm": 0.43797303587833813, + "learning_rate": 3.4499308816179392e-06, + "loss": 0.5751, + "step": 8557 + }, + { + "epoch": 2.271870436744989, + "grad_norm": 0.42989007327076006, + "learning_rate": 3.4496079297849893e-06, + "loss": 0.5453, + "step": 8558 + }, + { + "epoch": 2.2721359352183725, + "grad_norm": 0.4255473478142183, + "learning_rate": 3.449284959432358e-06, + "loss": 0.5875, + "step": 8559 + }, + { + "epoch": 2.272401433691756, + "grad_norm": 0.4372962381254977, + "learning_rate": 3.4489619705663464e-06, + "loss": 0.5452, + "step": 8560 + }, + { + "epoch": 2.27266693216514, + "grad_norm": 0.4357449121348786, + "learning_rate": 3.448638963193254e-06, + "loss": 0.5488, + "step": 8561 + }, + { + "epoch": 2.272932430638524, + "grad_norm": 0.4434780067866217, + "learning_rate": 3.4483159373193774e-06, + "loss": 0.5736, + "step": 8562 + }, + { + "epoch": 2.2731979291119075, + "grad_norm": 0.443178473120969, + "learning_rate": 3.4479928929510193e-06, + "loss": 0.5862, + "step": 8563 + }, + { + "epoch": 2.273463427585291, + "grad_norm": 0.42475222041609967, + "learning_rate": 3.447669830094479e-06, + "loss": 0.5803, + "step": 8564 + }, + { + "epoch": 2.2737289260586753, + "grad_norm": 0.4244862617112801, + "learning_rate": 3.4473467487560554e-06, + "loss": 0.5427, + "step": 8565 + }, + { + "epoch": 2.273994424532059, + "grad_norm": 0.4366460223429077, + "learning_rate": 3.447023648942052e-06, + "loss": 0.5953, + "step": 8566 + }, + { + "epoch": 2.2742599230054426, + "grad_norm": 0.43361233423863266, + "learning_rate": 3.446700530658768e-06, + "loss": 0.5684, + "step": 8567 + }, + { + "epoch": 2.2745254214788266, + "grad_norm": 0.42507294357960795, + "learning_rate": 3.4463773939125054e-06, + "loss": 0.5459, + "step": 8568 + }, + { + "epoch": 2.2747909199522103, + "grad_norm": 0.4223656834111791, + "learning_rate": 3.446054238709567e-06, + "loss": 0.5578, + "step": 8569 + }, + { + "epoch": 2.275056418425594, + "grad_norm": 0.43245622414523766, + "learning_rate": 3.4457310650562547e-06, + "loss": 0.58, + "step": 8570 + }, + { + "epoch": 2.275321916898978, + "grad_norm": 0.43190269287009964, + "learning_rate": 3.4454078729588708e-06, + "loss": 0.5553, + "step": 8571 + }, + { + "epoch": 2.2755874153723616, + "grad_norm": 0.431527965383785, + "learning_rate": 3.4450846624237184e-06, + "loss": 0.5685, + "step": 8572 + }, + { + "epoch": 2.2758529138457453, + "grad_norm": 0.45065090750922304, + "learning_rate": 3.4447614334571024e-06, + "loss": 0.5467, + "step": 8573 + }, + { + "epoch": 2.2761184123191294, + "grad_norm": 0.42685577016948234, + "learning_rate": 3.444438186065324e-06, + "loss": 0.5475, + "step": 8574 + }, + { + "epoch": 2.276383910792513, + "grad_norm": 0.45191910317104145, + "learning_rate": 3.4441149202546894e-06, + "loss": 0.579, + "step": 8575 + }, + { + "epoch": 2.2766494092658967, + "grad_norm": 0.4414055368028246, + "learning_rate": 3.4437916360315017e-06, + "loss": 0.5439, + "step": 8576 + }, + { + "epoch": 2.2769149077392807, + "grad_norm": 0.4437368098276584, + "learning_rate": 3.4434683334020657e-06, + "loss": 0.5611, + "step": 8577 + }, + { + "epoch": 2.2771804062126644, + "grad_norm": 0.43344869596613866, + "learning_rate": 3.4431450123726876e-06, + "loss": 0.565, + "step": 8578 + }, + { + "epoch": 2.277445904686048, + "grad_norm": 0.4236041179202667, + "learning_rate": 3.442821672949673e-06, + "loss": 0.5995, + "step": 8579 + }, + { + "epoch": 2.2777114031594317, + "grad_norm": 0.4388511237577311, + "learning_rate": 3.442498315139327e-06, + "loss": 0.5836, + "step": 8580 + }, + { + "epoch": 2.2779769016328157, + "grad_norm": 0.4194323193136079, + "learning_rate": 3.442174938947957e-06, + "loss": 0.5642, + "step": 8581 + }, + { + "epoch": 2.2782424001061994, + "grad_norm": 0.43122242117002885, + "learning_rate": 3.441851544381868e-06, + "loss": 0.5286, + "step": 8582 + }, + { + "epoch": 2.278507898579583, + "grad_norm": 0.42498320223986963, + "learning_rate": 3.441528131447368e-06, + "loss": 0.5555, + "step": 8583 + }, + { + "epoch": 2.278773397052967, + "grad_norm": 0.4250188446642847, + "learning_rate": 3.441204700150765e-06, + "loss": 0.5799, + "step": 8584 + }, + { + "epoch": 2.2790388955263507, + "grad_norm": 0.4285009818149595, + "learning_rate": 3.4408812504983645e-06, + "loss": 0.5889, + "step": 8585 + }, + { + "epoch": 2.2793043939997344, + "grad_norm": 0.4214319270597252, + "learning_rate": 3.4405577824964766e-06, + "loss": 0.5495, + "step": 8586 + }, + { + "epoch": 2.279569892473118, + "grad_norm": 0.43573256983697223, + "learning_rate": 3.440234296151409e-06, + "loss": 0.5928, + "step": 8587 + }, + { + "epoch": 2.279835390946502, + "grad_norm": 0.43407718567501175, + "learning_rate": 3.4399107914694707e-06, + "loss": 0.5652, + "step": 8588 + }, + { + "epoch": 2.2801008894198858, + "grad_norm": 0.4169491778406323, + "learning_rate": 3.4395872684569705e-06, + "loss": 0.5425, + "step": 8589 + }, + { + "epoch": 2.2803663878932694, + "grad_norm": 0.4521944568205286, + "learning_rate": 3.4392637271202178e-06, + "loss": 0.5803, + "step": 8590 + }, + { + "epoch": 2.2806318863666535, + "grad_norm": 0.4420948090859669, + "learning_rate": 3.4389401674655233e-06, + "loss": 0.5565, + "step": 8591 + }, + { + "epoch": 2.280897384840037, + "grad_norm": 0.4501262073979795, + "learning_rate": 3.4386165894991963e-06, + "loss": 0.5963, + "step": 8592 + }, + { + "epoch": 2.2811628833134208, + "grad_norm": 0.4249577146559445, + "learning_rate": 3.438292993227548e-06, + "loss": 0.5601, + "step": 8593 + }, + { + "epoch": 2.281428381786805, + "grad_norm": 0.43647541297813564, + "learning_rate": 3.4379693786568885e-06, + "loss": 0.5446, + "step": 8594 + }, + { + "epoch": 2.2816938802601885, + "grad_norm": 0.434658706721523, + "learning_rate": 3.43764574579353e-06, + "loss": 0.6062, + "step": 8595 + }, + { + "epoch": 2.281959378733572, + "grad_norm": 0.4639231751581402, + "learning_rate": 3.437322094643783e-06, + "loss": 0.549, + "step": 8596 + }, + { + "epoch": 2.282224877206956, + "grad_norm": 0.42841938257273166, + "learning_rate": 3.4369984252139605e-06, + "loss": 0.5652, + "step": 8597 + }, + { + "epoch": 2.28249037568034, + "grad_norm": 0.42476446939944607, + "learning_rate": 3.436674737510375e-06, + "loss": 0.547, + "step": 8598 + }, + { + "epoch": 2.2827558741537235, + "grad_norm": 0.43468818186699665, + "learning_rate": 3.4363510315393382e-06, + "loss": 0.5938, + "step": 8599 + }, + { + "epoch": 2.2830213726271076, + "grad_norm": 0.4374351682629712, + "learning_rate": 3.4360273073071636e-06, + "loss": 0.5903, + "step": 8600 + }, + { + "epoch": 2.2832868711004912, + "grad_norm": 0.4350099905812807, + "learning_rate": 3.4357035648201657e-06, + "loss": 0.6187, + "step": 8601 + }, + { + "epoch": 2.283552369573875, + "grad_norm": 0.43445281702511274, + "learning_rate": 3.4353798040846565e-06, + "loss": 0.578, + "step": 8602 + }, + { + "epoch": 2.283817868047259, + "grad_norm": 0.4281480416533931, + "learning_rate": 3.435056025106951e-06, + "loss": 0.5608, + "step": 8603 + }, + { + "epoch": 2.2840833665206426, + "grad_norm": 0.43789655630573504, + "learning_rate": 3.4347322278933644e-06, + "loss": 0.5882, + "step": 8604 + }, + { + "epoch": 2.2843488649940262, + "grad_norm": 0.4408960361267119, + "learning_rate": 3.434408412450209e-06, + "loss": 0.6032, + "step": 8605 + }, + { + "epoch": 2.28461436346741, + "grad_norm": 0.42620606987120185, + "learning_rate": 3.434084578783803e-06, + "loss": 0.5428, + "step": 8606 + }, + { + "epoch": 2.284879861940794, + "grad_norm": 0.41326492382555574, + "learning_rate": 3.43376072690046e-06, + "loss": 0.5693, + "step": 8607 + }, + { + "epoch": 2.2851453604141776, + "grad_norm": 0.44303866425971955, + "learning_rate": 3.4334368568064972e-06, + "loss": 0.6044, + "step": 8608 + }, + { + "epoch": 2.2854108588875612, + "grad_norm": 0.4211387302490756, + "learning_rate": 3.4331129685082306e-06, + "loss": 0.5569, + "step": 8609 + }, + { + "epoch": 2.2856763573609453, + "grad_norm": 0.45927611180523964, + "learning_rate": 3.4327890620119757e-06, + "loss": 0.5817, + "step": 8610 + }, + { + "epoch": 2.285941855834329, + "grad_norm": 0.4450881733344187, + "learning_rate": 3.4324651373240505e-06, + "loss": 0.5973, + "step": 8611 + }, + { + "epoch": 2.2862073543077126, + "grad_norm": 0.43432195369427223, + "learning_rate": 3.432141194450772e-06, + "loss": 0.5505, + "step": 8612 + }, + { + "epoch": 2.2864728527810967, + "grad_norm": 0.4431155234226894, + "learning_rate": 3.431817233398458e-06, + "loss": 0.5402, + "step": 8613 + }, + { + "epoch": 2.2867383512544803, + "grad_norm": 0.421946847119039, + "learning_rate": 3.4314932541734266e-06, + "loss": 0.5376, + "step": 8614 + }, + { + "epoch": 2.287003849727864, + "grad_norm": 0.44036042759908406, + "learning_rate": 3.431169256781995e-06, + "loss": 0.5905, + "step": 8615 + }, + { + "epoch": 2.2872693482012476, + "grad_norm": 0.45149330075274924, + "learning_rate": 3.4308452412304844e-06, + "loss": 0.5225, + "step": 8616 + }, + { + "epoch": 2.2875348466746317, + "grad_norm": 0.43353348919696133, + "learning_rate": 3.4305212075252116e-06, + "loss": 0.5366, + "step": 8617 + }, + { + "epoch": 2.2878003451480153, + "grad_norm": 0.4368234726549299, + "learning_rate": 3.4301971556724973e-06, + "loss": 0.6109, + "step": 8618 + }, + { + "epoch": 2.288065843621399, + "grad_norm": 0.4218767484763383, + "learning_rate": 3.429873085678661e-06, + "loss": 0.5336, + "step": 8619 + }, + { + "epoch": 2.288331342094783, + "grad_norm": 0.453800116753957, + "learning_rate": 3.429548997550023e-06, + "loss": 0.5206, + "step": 8620 + }, + { + "epoch": 2.2885968405681667, + "grad_norm": 0.43003754974788116, + "learning_rate": 3.429224891292904e-06, + "loss": 0.5879, + "step": 8621 + }, + { + "epoch": 2.2888623390415503, + "grad_norm": 0.43622186090912646, + "learning_rate": 3.4289007669136232e-06, + "loss": 0.6098, + "step": 8622 + }, + { + "epoch": 2.2891278375149344, + "grad_norm": 0.4350969956625951, + "learning_rate": 3.4285766244185037e-06, + "loss": 0.5456, + "step": 8623 + }, + { + "epoch": 2.289393335988318, + "grad_norm": 0.4302341183850153, + "learning_rate": 3.4282524638138664e-06, + "loss": 0.576, + "step": 8624 + }, + { + "epoch": 2.2896588344617017, + "grad_norm": 0.4346534193047658, + "learning_rate": 3.4279282851060335e-06, + "loss": 0.5868, + "step": 8625 + }, + { + "epoch": 2.289924332935086, + "grad_norm": 0.4346917869148051, + "learning_rate": 3.427604088301327e-06, + "loss": 0.5611, + "step": 8626 + }, + { + "epoch": 2.2901898314084694, + "grad_norm": 0.42406552900214967, + "learning_rate": 3.4272798734060696e-06, + "loss": 0.5873, + "step": 8627 + }, + { + "epoch": 2.290455329881853, + "grad_norm": 0.4669286594933157, + "learning_rate": 3.426955640426584e-06, + "loss": 0.5503, + "step": 8628 + }, + { + "epoch": 2.290720828355237, + "grad_norm": 0.45354687841586233, + "learning_rate": 3.4266313893691943e-06, + "loss": 0.5736, + "step": 8629 + }, + { + "epoch": 2.290986326828621, + "grad_norm": 0.4216700143556463, + "learning_rate": 3.426307120240224e-06, + "loss": 0.555, + "step": 8630 + }, + { + "epoch": 2.2912518253020044, + "grad_norm": 0.43172386065401885, + "learning_rate": 3.4259828330459966e-06, + "loss": 0.5085, + "step": 8631 + }, + { + "epoch": 2.2915173237753885, + "grad_norm": 0.42946655508038667, + "learning_rate": 3.425658527792836e-06, + "loss": 0.5523, + "step": 8632 + }, + { + "epoch": 2.291782822248772, + "grad_norm": 0.42809528809244657, + "learning_rate": 3.4253342044870685e-06, + "loss": 0.5778, + "step": 8633 + }, + { + "epoch": 2.292048320722156, + "grad_norm": 0.42884103648316607, + "learning_rate": 3.425009863135018e-06, + "loss": 0.5355, + "step": 8634 + }, + { + "epoch": 2.2923138191955394, + "grad_norm": 0.4282181011970201, + "learning_rate": 3.4246855037430106e-06, + "loss": 0.5502, + "step": 8635 + }, + { + "epoch": 2.2925793176689235, + "grad_norm": 0.43129714818790477, + "learning_rate": 3.4243611263173722e-06, + "loss": 0.5813, + "step": 8636 + }, + { + "epoch": 2.292844816142307, + "grad_norm": 0.4231203719941036, + "learning_rate": 3.424036730864428e-06, + "loss": 0.616, + "step": 8637 + }, + { + "epoch": 2.293110314615691, + "grad_norm": 0.4402166801138924, + "learning_rate": 3.423712317390505e-06, + "loss": 0.6032, + "step": 8638 + }, + { + "epoch": 2.293375813089075, + "grad_norm": 0.42350621013555106, + "learning_rate": 3.423387885901931e-06, + "loss": 0.5668, + "step": 8639 + }, + { + "epoch": 2.2936413115624585, + "grad_norm": 0.44008722164441844, + "learning_rate": 3.423063436405032e-06, + "loss": 0.5304, + "step": 8640 + }, + { + "epoch": 2.293906810035842, + "grad_norm": 0.4348090628063169, + "learning_rate": 3.4227389689061353e-06, + "loss": 0.5347, + "step": 8641 + }, + { + "epoch": 2.2941723085092263, + "grad_norm": 0.43435880921020203, + "learning_rate": 3.42241448341157e-06, + "loss": 0.5339, + "step": 8642 + }, + { + "epoch": 2.29443780698261, + "grad_norm": 0.4248996835375181, + "learning_rate": 3.4220899799276637e-06, + "loss": 0.5258, + "step": 8643 + }, + { + "epoch": 2.2947033054559935, + "grad_norm": 0.44058834309212397, + "learning_rate": 3.421765458460745e-06, + "loss": 0.5642, + "step": 8644 + }, + { + "epoch": 2.294968803929377, + "grad_norm": 0.41431904305329476, + "learning_rate": 3.421440919017144e-06, + "loss": 0.6193, + "step": 8645 + }, + { + "epoch": 2.2952343024027613, + "grad_norm": 0.43134524200929175, + "learning_rate": 3.421116361603188e-06, + "loss": 0.5652, + "step": 8646 + }, + { + "epoch": 2.295499800876145, + "grad_norm": 0.4338152491012483, + "learning_rate": 3.4207917862252083e-06, + "loss": 0.554, + "step": 8647 + }, + { + "epoch": 2.2957652993495286, + "grad_norm": 0.41761404718399825, + "learning_rate": 3.4204671928895334e-06, + "loss": 0.5778, + "step": 8648 + }, + { + "epoch": 2.2960307978229126, + "grad_norm": 0.43331192347395864, + "learning_rate": 3.420142581602495e-06, + "loss": 0.5746, + "step": 8649 + }, + { + "epoch": 2.2962962962962963, + "grad_norm": 0.45860675274287865, + "learning_rate": 3.4198179523704234e-06, + "loss": 0.5473, + "step": 8650 + }, + { + "epoch": 2.29656179476968, + "grad_norm": 0.4180147352218744, + "learning_rate": 3.4194933051996493e-06, + "loss": 0.5824, + "step": 8651 + }, + { + "epoch": 2.296827293243064, + "grad_norm": 0.44604411099370106, + "learning_rate": 3.4191686400965046e-06, + "loss": 0.5759, + "step": 8652 + }, + { + "epoch": 2.2970927917164476, + "grad_norm": 0.448373937487176, + "learning_rate": 3.4188439570673216e-06, + "loss": 0.5644, + "step": 8653 + }, + { + "epoch": 2.2973582901898313, + "grad_norm": 0.43927625938094816, + "learning_rate": 3.4185192561184312e-06, + "loss": 0.5697, + "step": 8654 + }, + { + "epoch": 2.2976237886632154, + "grad_norm": 0.43609999003164884, + "learning_rate": 3.4181945372561658e-06, + "loss": 0.5955, + "step": 8655 + }, + { + "epoch": 2.297889287136599, + "grad_norm": 0.4385595759633102, + "learning_rate": 3.4178698004868598e-06, + "loss": 0.5134, + "step": 8656 + }, + { + "epoch": 2.2981547856099827, + "grad_norm": 0.4437820845501664, + "learning_rate": 3.417545045816845e-06, + "loss": 0.5554, + "step": 8657 + }, + { + "epoch": 2.2984202840833667, + "grad_norm": 0.422961155923829, + "learning_rate": 3.417220273252455e-06, + "loss": 0.5582, + "step": 8658 + }, + { + "epoch": 2.2986857825567504, + "grad_norm": 0.43192448818067, + "learning_rate": 3.4168954828000245e-06, + "loss": 0.5735, + "step": 8659 + }, + { + "epoch": 2.298951281030134, + "grad_norm": 0.4449759670096079, + "learning_rate": 3.4165706744658865e-06, + "loss": 0.5706, + "step": 8660 + }, + { + "epoch": 2.299216779503518, + "grad_norm": 0.4405244493681199, + "learning_rate": 3.4162458482563756e-06, + "loss": 0.5407, + "step": 8661 + }, + { + "epoch": 2.2994822779769017, + "grad_norm": 0.4195382959296605, + "learning_rate": 3.4159210041778284e-06, + "loss": 0.5551, + "step": 8662 + }, + { + "epoch": 2.2997477764502854, + "grad_norm": 0.42201630037548943, + "learning_rate": 3.415596142236579e-06, + "loss": 0.568, + "step": 8663 + }, + { + "epoch": 2.300013274923669, + "grad_norm": 0.4298967016439161, + "learning_rate": 3.415271262438963e-06, + "loss": 0.5496, + "step": 8664 + }, + { + "epoch": 2.300278773397053, + "grad_norm": 0.43154018803061883, + "learning_rate": 3.414946364791316e-06, + "loss": 0.5635, + "step": 8665 + }, + { + "epoch": 2.3005442718704368, + "grad_norm": 0.4438106209256733, + "learning_rate": 3.414621449299975e-06, + "loss": 0.5941, + "step": 8666 + }, + { + "epoch": 2.3008097703438204, + "grad_norm": 0.4330238135245737, + "learning_rate": 3.414296515971276e-06, + "loss": 0.606, + "step": 8667 + }, + { + "epoch": 2.3010752688172045, + "grad_norm": 0.41537796047406034, + "learning_rate": 3.4139715648115575e-06, + "loss": 0.5432, + "step": 8668 + }, + { + "epoch": 2.301340767290588, + "grad_norm": 0.4199013477576837, + "learning_rate": 3.4136465958271547e-06, + "loss": 0.5402, + "step": 8669 + }, + { + "epoch": 2.3016062657639718, + "grad_norm": 0.42145896929941684, + "learning_rate": 3.4133216090244063e-06, + "loss": 0.5432, + "step": 8670 + }, + { + "epoch": 2.3018717642373554, + "grad_norm": 0.43515668343426206, + "learning_rate": 3.4129966044096503e-06, + "loss": 0.5988, + "step": 8671 + }, + { + "epoch": 2.3021372627107395, + "grad_norm": 0.43223548861228345, + "learning_rate": 3.4126715819892253e-06, + "loss": 0.5859, + "step": 8672 + }, + { + "epoch": 2.302402761184123, + "grad_norm": 0.43430799411907095, + "learning_rate": 3.4123465417694702e-06, + "loss": 0.5983, + "step": 8673 + }, + { + "epoch": 2.3026682596575068, + "grad_norm": 0.430194111538804, + "learning_rate": 3.4120214837567234e-06, + "loss": 0.5887, + "step": 8674 + }, + { + "epoch": 2.302933758130891, + "grad_norm": 0.42061516824386097, + "learning_rate": 3.4116964079573243e-06, + "loss": 0.5985, + "step": 8675 + }, + { + "epoch": 2.3031992566042745, + "grad_norm": 0.4296039745269631, + "learning_rate": 3.411371314377614e-06, + "loss": 0.55, + "step": 8676 + }, + { + "epoch": 2.303464755077658, + "grad_norm": 0.43966332243547074, + "learning_rate": 3.4110462030239304e-06, + "loss": 0.5702, + "step": 8677 + }, + { + "epoch": 2.303730253551042, + "grad_norm": 0.43771975287124654, + "learning_rate": 3.410721073902616e-06, + "loss": 0.5628, + "step": 8678 + }, + { + "epoch": 2.303995752024426, + "grad_norm": 0.4331116302408522, + "learning_rate": 3.4103959270200105e-06, + "loss": 0.6011, + "step": 8679 + }, + { + "epoch": 2.3042612504978095, + "grad_norm": 0.44030659592738497, + "learning_rate": 3.410070762382455e-06, + "loss": 0.592, + "step": 8680 + }, + { + "epoch": 2.3045267489711936, + "grad_norm": 0.4358715104606606, + "learning_rate": 3.409745579996292e-06, + "loss": 0.5768, + "step": 8681 + }, + { + "epoch": 2.3047922474445772, + "grad_norm": 0.4284271462370547, + "learning_rate": 3.4094203798678634e-06, + "loss": 0.5646, + "step": 8682 + }, + { + "epoch": 2.305057745917961, + "grad_norm": 0.4309236091956606, + "learning_rate": 3.4090951620035094e-06, + "loss": 0.5095, + "step": 8683 + }, + { + "epoch": 2.305323244391345, + "grad_norm": 0.42184894141151874, + "learning_rate": 3.4087699264095746e-06, + "loss": 0.5498, + "step": 8684 + }, + { + "epoch": 2.3055887428647286, + "grad_norm": 0.42655281270623524, + "learning_rate": 3.4084446730924016e-06, + "loss": 0.5355, + "step": 8685 + }, + { + "epoch": 2.3058542413381122, + "grad_norm": 0.42470084582982187, + "learning_rate": 3.408119402058333e-06, + "loss": 0.5791, + "step": 8686 + }, + { + "epoch": 2.3061197398114963, + "grad_norm": 0.4371712161975838, + "learning_rate": 3.407794113313712e-06, + "loss": 0.5354, + "step": 8687 + }, + { + "epoch": 2.30638523828488, + "grad_norm": 0.4483303382792808, + "learning_rate": 3.407468806864883e-06, + "loss": 0.527, + "step": 8688 + }, + { + "epoch": 2.3066507367582636, + "grad_norm": 0.4428717600591457, + "learning_rate": 3.407143482718191e-06, + "loss": 0.5527, + "step": 8689 + }, + { + "epoch": 2.3069162352316472, + "grad_norm": 0.4344103689174707, + "learning_rate": 3.40681814087998e-06, + "loss": 0.5927, + "step": 8690 + }, + { + "epoch": 2.3071817337050313, + "grad_norm": 0.42864000328790075, + "learning_rate": 3.4064927813565952e-06, + "loss": 0.5431, + "step": 8691 + }, + { + "epoch": 2.307447232178415, + "grad_norm": 0.4278265465540942, + "learning_rate": 3.406167404154381e-06, + "loss": 0.5556, + "step": 8692 + }, + { + "epoch": 2.3077127306517986, + "grad_norm": 0.446265498518479, + "learning_rate": 3.405842009279684e-06, + "loss": 0.6127, + "step": 8693 + }, + { + "epoch": 2.3079782291251827, + "grad_norm": 0.4281659300570047, + "learning_rate": 3.4055165967388502e-06, + "loss": 0.5349, + "step": 8694 + }, + { + "epoch": 2.3082437275985663, + "grad_norm": 0.4425767952431549, + "learning_rate": 3.4051911665382254e-06, + "loss": 0.5637, + "step": 8695 + }, + { + "epoch": 2.30850922607195, + "grad_norm": 0.4359561735033075, + "learning_rate": 3.4048657186841567e-06, + "loss": 0.5348, + "step": 8696 + }, + { + "epoch": 2.308774724545334, + "grad_norm": 0.44916543480984455, + "learning_rate": 3.404540253182991e-06, + "loss": 0.5325, + "step": 8697 + }, + { + "epoch": 2.3090402230187177, + "grad_norm": 0.4348341393286978, + "learning_rate": 3.4042147700410755e-06, + "loss": 0.5418, + "step": 8698 + }, + { + "epoch": 2.3093057214921013, + "grad_norm": 0.45393446013022204, + "learning_rate": 3.4038892692647584e-06, + "loss": 0.582, + "step": 8699 + }, + { + "epoch": 2.309571219965485, + "grad_norm": 0.4493936187700734, + "learning_rate": 3.4035637508603868e-06, + "loss": 0.571, + "step": 8700 + }, + { + "epoch": 2.309836718438869, + "grad_norm": 0.44204429344420293, + "learning_rate": 3.4032382148343103e-06, + "loss": 0.6077, + "step": 8701 + }, + { + "epoch": 2.3101022169122527, + "grad_norm": 0.44863656295787097, + "learning_rate": 3.4029126611928776e-06, + "loss": 0.5956, + "step": 8702 + }, + { + "epoch": 2.3103677153856363, + "grad_norm": 0.4893470318271801, + "learning_rate": 3.4025870899424363e-06, + "loss": 0.5306, + "step": 8703 + }, + { + "epoch": 2.3106332138590204, + "grad_norm": 0.4335001711334422, + "learning_rate": 3.4022615010893374e-06, + "loss": 0.5174, + "step": 8704 + }, + { + "epoch": 2.310898712332404, + "grad_norm": 0.431490596916666, + "learning_rate": 3.4019358946399304e-06, + "loss": 0.5549, + "step": 8705 + }, + { + "epoch": 2.3111642108057877, + "grad_norm": 0.43424166875333015, + "learning_rate": 3.4016102706005645e-06, + "loss": 0.5273, + "step": 8706 + }, + { + "epoch": 2.311429709279172, + "grad_norm": 0.4217312919572834, + "learning_rate": 3.4012846289775905e-06, + "loss": 0.5825, + "step": 8707 + }, + { + "epoch": 2.3116952077525554, + "grad_norm": 0.42817993624353834, + "learning_rate": 3.4009589697773605e-06, + "loss": 0.5657, + "step": 8708 + }, + { + "epoch": 2.311960706225939, + "grad_norm": 0.43404422005993665, + "learning_rate": 3.4006332930062236e-06, + "loss": 0.6125, + "step": 8709 + }, + { + "epoch": 2.312226204699323, + "grad_norm": 0.4373978590145837, + "learning_rate": 3.4003075986705326e-06, + "loss": 0.5926, + "step": 8710 + }, + { + "epoch": 2.312491703172707, + "grad_norm": 0.4159244910635771, + "learning_rate": 3.39998188677664e-06, + "loss": 0.5299, + "step": 8711 + }, + { + "epoch": 2.3127572016460904, + "grad_norm": 0.4172099619607329, + "learning_rate": 3.3996561573308963e-06, + "loss": 0.5286, + "step": 8712 + }, + { + "epoch": 2.3130227001194745, + "grad_norm": 0.42859753708641873, + "learning_rate": 3.3993304103396553e-06, + "loss": 0.5692, + "step": 8713 + }, + { + "epoch": 2.313288198592858, + "grad_norm": 0.44133138185157095, + "learning_rate": 3.399004645809269e-06, + "loss": 0.538, + "step": 8714 + }, + { + "epoch": 2.313553697066242, + "grad_norm": 0.434876809713245, + "learning_rate": 3.398678863746091e-06, + "loss": 0.6101, + "step": 8715 + }, + { + "epoch": 2.313819195539626, + "grad_norm": 0.4292994856603943, + "learning_rate": 3.3983530641564744e-06, + "loss": 0.5728, + "step": 8716 + }, + { + "epoch": 2.3140846940130095, + "grad_norm": 0.4307938872307361, + "learning_rate": 3.398027247046774e-06, + "loss": 0.5673, + "step": 8717 + }, + { + "epoch": 2.314350192486393, + "grad_norm": 0.4274362234983465, + "learning_rate": 3.397701412423343e-06, + "loss": 0.5613, + "step": 8718 + }, + { + "epoch": 2.314615690959777, + "grad_norm": 0.43440228854944063, + "learning_rate": 3.3973755602925374e-06, + "loss": 0.5974, + "step": 8719 + }, + { + "epoch": 2.314881189433161, + "grad_norm": 0.451912719332572, + "learning_rate": 3.397049690660711e-06, + "loss": 0.5888, + "step": 8720 + }, + { + "epoch": 2.3151466879065445, + "grad_norm": 0.42808434773507886, + "learning_rate": 3.3967238035342187e-06, + "loss": 0.5625, + "step": 8721 + }, + { + "epoch": 2.315412186379928, + "grad_norm": 0.4315311977344077, + "learning_rate": 3.3963978989194174e-06, + "loss": 0.5745, + "step": 8722 + }, + { + "epoch": 2.3156776848533123, + "grad_norm": 0.4251036422424467, + "learning_rate": 3.396071976822662e-06, + "loss": 0.5397, + "step": 8723 + }, + { + "epoch": 2.315943183326696, + "grad_norm": 0.42626404411390983, + "learning_rate": 3.395746037250309e-06, + "loss": 0.5432, + "step": 8724 + }, + { + "epoch": 2.3162086818000795, + "grad_norm": 0.4252503594702319, + "learning_rate": 3.3954200802087146e-06, + "loss": 0.5624, + "step": 8725 + }, + { + "epoch": 2.316474180273463, + "grad_norm": 0.4503302304801565, + "learning_rate": 3.3950941057042363e-06, + "loss": 0.5335, + "step": 8726 + }, + { + "epoch": 2.3167396787468473, + "grad_norm": 0.42929050395089974, + "learning_rate": 3.394768113743232e-06, + "loss": 0.566, + "step": 8727 + }, + { + "epoch": 2.317005177220231, + "grad_norm": 0.440221977185262, + "learning_rate": 3.3944421043320593e-06, + "loss": 0.5647, + "step": 8728 + }, + { + "epoch": 2.3172706756936146, + "grad_norm": 0.41752932526464925, + "learning_rate": 3.3941160774770744e-06, + "loss": 0.5475, + "step": 8729 + }, + { + "epoch": 2.3175361741669986, + "grad_norm": 0.43419345021156086, + "learning_rate": 3.3937900331846373e-06, + "loss": 0.5807, + "step": 8730 + }, + { + "epoch": 2.3178016726403823, + "grad_norm": 0.43546798131950937, + "learning_rate": 3.393463971461106e-06, + "loss": 0.5887, + "step": 8731 + }, + { + "epoch": 2.318067171113766, + "grad_norm": 0.42812839890863225, + "learning_rate": 3.39313789231284e-06, + "loss": 0.5713, + "step": 8732 + }, + { + "epoch": 2.31833266958715, + "grad_norm": 0.4351081451216406, + "learning_rate": 3.392811795746198e-06, + "loss": 0.5748, + "step": 8733 + }, + { + "epoch": 2.3185981680605336, + "grad_norm": 0.4355074277409877, + "learning_rate": 3.39248568176754e-06, + "loss": 0.5615, + "step": 8734 + }, + { + "epoch": 2.3188636665339173, + "grad_norm": 0.4289827679562849, + "learning_rate": 3.392159550383226e-06, + "loss": 0.5824, + "step": 8735 + }, + { + "epoch": 2.3191291650073014, + "grad_norm": 0.4376249382687959, + "learning_rate": 3.391833401599617e-06, + "loss": 0.5796, + "step": 8736 + }, + { + "epoch": 2.319394663480685, + "grad_norm": 0.45386129963571475, + "learning_rate": 3.3915072354230728e-06, + "loss": 0.5417, + "step": 8737 + }, + { + "epoch": 2.3196601619540687, + "grad_norm": 0.42624879960086787, + "learning_rate": 3.3911810518599542e-06, + "loss": 0.5827, + "step": 8738 + }, + { + "epoch": 2.3199256604274527, + "grad_norm": 0.4178959163332132, + "learning_rate": 3.3908548509166234e-06, + "loss": 0.5216, + "step": 8739 + }, + { + "epoch": 2.3201911589008364, + "grad_norm": 0.4458969808068912, + "learning_rate": 3.3905286325994423e-06, + "loss": 0.5902, + "step": 8740 + }, + { + "epoch": 2.32045665737422, + "grad_norm": 0.43466416442710726, + "learning_rate": 3.3902023969147723e-06, + "loss": 0.568, + "step": 8741 + }, + { + "epoch": 2.320722155847604, + "grad_norm": 0.42851178838776194, + "learning_rate": 3.3898761438689752e-06, + "loss": 0.5611, + "step": 8742 + }, + { + "epoch": 2.3209876543209877, + "grad_norm": 0.4258354349653672, + "learning_rate": 3.389549873468415e-06, + "loss": 0.5541, + "step": 8743 + }, + { + "epoch": 2.3212531527943714, + "grad_norm": 0.4342573521016042, + "learning_rate": 3.3892235857194545e-06, + "loss": 0.6051, + "step": 8744 + }, + { + "epoch": 2.321518651267755, + "grad_norm": 0.42099841562692886, + "learning_rate": 3.388897280628457e-06, + "loss": 0.5597, + "step": 8745 + }, + { + "epoch": 2.321784149741139, + "grad_norm": 0.42442378017136545, + "learning_rate": 3.3885709582017856e-06, + "loss": 0.5621, + "step": 8746 + }, + { + "epoch": 2.3220496482145228, + "grad_norm": 0.43274790826031445, + "learning_rate": 3.388244618445805e-06, + "loss": 0.5808, + "step": 8747 + }, + { + "epoch": 2.3223151466879064, + "grad_norm": 0.4229624825508171, + "learning_rate": 3.38791826136688e-06, + "loss": 0.5514, + "step": 8748 + }, + { + "epoch": 2.3225806451612905, + "grad_norm": 0.42607682743538566, + "learning_rate": 3.3875918869713744e-06, + "loss": 0.58, + "step": 8749 + }, + { + "epoch": 2.322846143634674, + "grad_norm": 0.43847846165442683, + "learning_rate": 3.387265495265654e-06, + "loss": 0.5634, + "step": 8750 + }, + { + "epoch": 2.3231116421080578, + "grad_norm": 0.4270452036960777, + "learning_rate": 3.3869390862560846e-06, + "loss": 0.547, + "step": 8751 + }, + { + "epoch": 2.323377140581442, + "grad_norm": 0.41880505997487516, + "learning_rate": 3.3866126599490303e-06, + "loss": 0.5811, + "step": 8752 + }, + { + "epoch": 2.3236426390548255, + "grad_norm": 0.40998814173754866, + "learning_rate": 3.3862862163508585e-06, + "loss": 0.5597, + "step": 8753 + }, + { + "epoch": 2.323908137528209, + "grad_norm": 0.43114239895432926, + "learning_rate": 3.3859597554679362e-06, + "loss": 0.5747, + "step": 8754 + }, + { + "epoch": 2.3241736360015928, + "grad_norm": 0.4357305441063565, + "learning_rate": 3.385633277306629e-06, + "loss": 0.5677, + "step": 8755 + }, + { + "epoch": 2.324439134474977, + "grad_norm": 0.4274764062098414, + "learning_rate": 3.3853067818733045e-06, + "loss": 0.5624, + "step": 8756 + }, + { + "epoch": 2.3247046329483605, + "grad_norm": 0.44134163501348717, + "learning_rate": 3.3849802691743305e-06, + "loss": 0.6051, + "step": 8757 + }, + { + "epoch": 2.324970131421744, + "grad_norm": 0.4168260058285323, + "learning_rate": 3.3846537392160743e-06, + "loss": 0.5667, + "step": 8758 + }, + { + "epoch": 2.325235629895128, + "grad_norm": 0.4301045316116505, + "learning_rate": 3.384327192004904e-06, + "loss": 0.579, + "step": 8759 + }, + { + "epoch": 2.325501128368512, + "grad_norm": 0.43160578163390323, + "learning_rate": 3.3840006275471888e-06, + "loss": 0.5311, + "step": 8760 + }, + { + "epoch": 2.3257666268418955, + "grad_norm": 0.41731258671713795, + "learning_rate": 3.3836740458492963e-06, + "loss": 0.548, + "step": 8761 + }, + { + "epoch": 2.3260321253152796, + "grad_norm": 0.4245560097178044, + "learning_rate": 3.3833474469175965e-06, + "loss": 0.5531, + "step": 8762 + }, + { + "epoch": 2.3262976237886632, + "grad_norm": 0.4311658019781137, + "learning_rate": 3.3830208307584583e-06, + "loss": 0.5506, + "step": 8763 + }, + { + "epoch": 2.326563122262047, + "grad_norm": 0.4350115659052762, + "learning_rate": 3.382694197378252e-06, + "loss": 0.5865, + "step": 8764 + }, + { + "epoch": 2.326828620735431, + "grad_norm": 0.42976356035906665, + "learning_rate": 3.3823675467833477e-06, + "loss": 0.5659, + "step": 8765 + }, + { + "epoch": 2.3270941192088146, + "grad_norm": 0.42217579135381045, + "learning_rate": 3.382040878980117e-06, + "loss": 0.5446, + "step": 8766 + }, + { + "epoch": 2.3273596176821982, + "grad_norm": 0.43685572696200264, + "learning_rate": 3.381714193974928e-06, + "loss": 0.5804, + "step": 8767 + }, + { + "epoch": 2.3276251161555823, + "grad_norm": 0.4215383570453018, + "learning_rate": 3.3813874917741546e-06, + "loss": 0.5214, + "step": 8768 + }, + { + "epoch": 2.327890614628966, + "grad_norm": 0.4371868338171324, + "learning_rate": 3.381060772384166e-06, + "loss": 0.5836, + "step": 8769 + }, + { + "epoch": 2.3281561131023496, + "grad_norm": 0.4320116304077911, + "learning_rate": 3.380734035811336e-06, + "loss": 0.5612, + "step": 8770 + }, + { + "epoch": 2.3284216115757337, + "grad_norm": 0.4402656182516278, + "learning_rate": 3.380407282062035e-06, + "loss": 0.5674, + "step": 8771 + }, + { + "epoch": 2.3286871100491173, + "grad_norm": 0.43072070621367076, + "learning_rate": 3.380080511142637e-06, + "loss": 0.5517, + "step": 8772 + }, + { + "epoch": 2.328952608522501, + "grad_norm": 0.4459760525571599, + "learning_rate": 3.379753723059514e-06, + "loss": 0.5788, + "step": 8773 + }, + { + "epoch": 2.3292181069958846, + "grad_norm": 0.43119698635512205, + "learning_rate": 3.3794269178190397e-06, + "loss": 0.5514, + "step": 8774 + }, + { + "epoch": 2.3294836054692687, + "grad_norm": 0.4313929811357197, + "learning_rate": 3.3791000954275873e-06, + "loss": 0.6164, + "step": 8775 + }, + { + "epoch": 2.3297491039426523, + "grad_norm": 0.42650866371809354, + "learning_rate": 3.3787732558915304e-06, + "loss": 0.5507, + "step": 8776 + }, + { + "epoch": 2.330014602416036, + "grad_norm": 0.42220623556580533, + "learning_rate": 3.378446399217243e-06, + "loss": 0.549, + "step": 8777 + }, + { + "epoch": 2.33028010088942, + "grad_norm": 0.44642883068398237, + "learning_rate": 3.378119525411101e-06, + "loss": 0.5495, + "step": 8778 + }, + { + "epoch": 2.3305455993628037, + "grad_norm": 0.42876822773267426, + "learning_rate": 3.3777926344794777e-06, + "loss": 0.5854, + "step": 8779 + }, + { + "epoch": 2.3308110978361873, + "grad_norm": 0.4248732943623475, + "learning_rate": 3.3774657264287484e-06, + "loss": 0.5159, + "step": 8780 + }, + { + "epoch": 2.331076596309571, + "grad_norm": 0.44854392927100034, + "learning_rate": 3.3771388012652893e-06, + "loss": 0.5886, + "step": 8781 + }, + { + "epoch": 2.331342094782955, + "grad_norm": 0.43426629600643435, + "learning_rate": 3.3768118589954764e-06, + "loss": 0.5529, + "step": 8782 + }, + { + "epoch": 2.3316075932563387, + "grad_norm": 0.4391509435686133, + "learning_rate": 3.3764848996256856e-06, + "loss": 0.5952, + "step": 8783 + }, + { + "epoch": 2.3318730917297223, + "grad_norm": 0.43325678444810684, + "learning_rate": 3.3761579231622927e-06, + "loss": 0.5662, + "step": 8784 + }, + { + "epoch": 2.3321385902031064, + "grad_norm": 0.41361918690565397, + "learning_rate": 3.375830929611675e-06, + "loss": 0.5371, + "step": 8785 + }, + { + "epoch": 2.33240408867649, + "grad_norm": 0.4252050249840596, + "learning_rate": 3.3755039189802104e-06, + "loss": 0.5588, + "step": 8786 + }, + { + "epoch": 2.3326695871498737, + "grad_norm": 0.42546898596264116, + "learning_rate": 3.375176891274275e-06, + "loss": 0.5665, + "step": 8787 + }, + { + "epoch": 2.332935085623258, + "grad_norm": 0.4119843155832369, + "learning_rate": 3.3748498465002475e-06, + "loss": 0.5328, + "step": 8788 + }, + { + "epoch": 2.3332005840966414, + "grad_norm": 0.42847432932985036, + "learning_rate": 3.3745227846645064e-06, + "loss": 0.547, + "step": 8789 + }, + { + "epoch": 2.333466082570025, + "grad_norm": 0.4400134953340087, + "learning_rate": 3.37419570577343e-06, + "loss": 0.5594, + "step": 8790 + }, + { + "epoch": 2.333731581043409, + "grad_norm": 0.43890434350385954, + "learning_rate": 3.3738686098333965e-06, + "loss": 0.5592, + "step": 8791 + }, + { + "epoch": 2.333997079516793, + "grad_norm": 0.43212388558860204, + "learning_rate": 3.373541496850786e-06, + "loss": 0.5636, + "step": 8792 + }, + { + "epoch": 2.3342625779901764, + "grad_norm": 0.43492001564840976, + "learning_rate": 3.3732143668319765e-06, + "loss": 0.5648, + "step": 8793 + }, + { + "epoch": 2.3345280764635605, + "grad_norm": 0.4396660771210867, + "learning_rate": 3.3728872197833495e-06, + "loss": 0.5419, + "step": 8794 + }, + { + "epoch": 2.334793574936944, + "grad_norm": 0.425635948718239, + "learning_rate": 3.3725600557112846e-06, + "loss": 0.5607, + "step": 8795 + }, + { + "epoch": 2.335059073410328, + "grad_norm": 0.43969729976873984, + "learning_rate": 3.3722328746221623e-06, + "loss": 0.5935, + "step": 8796 + }, + { + "epoch": 2.335324571883712, + "grad_norm": 0.42708600909644023, + "learning_rate": 3.3719056765223636e-06, + "loss": 0.5651, + "step": 8797 + }, + { + "epoch": 2.3355900703570955, + "grad_norm": 0.4337202269645962, + "learning_rate": 3.371578461418268e-06, + "loss": 0.5856, + "step": 8798 + }, + { + "epoch": 2.335855568830479, + "grad_norm": 0.43798683514030007, + "learning_rate": 3.371251229316259e-06, + "loss": 0.5887, + "step": 8799 + }, + { + "epoch": 2.336121067303863, + "grad_norm": 0.4336074735676317, + "learning_rate": 3.3709239802227188e-06, + "loss": 0.5837, + "step": 8800 + }, + { + "epoch": 2.336386565777247, + "grad_norm": 0.443097459964302, + "learning_rate": 3.3705967141440283e-06, + "loss": 0.5525, + "step": 8801 + }, + { + "epoch": 2.3366520642506305, + "grad_norm": 0.4334481430177968, + "learning_rate": 3.3702694310865696e-06, + "loss": 0.5812, + "step": 8802 + }, + { + "epoch": 2.336917562724014, + "grad_norm": 0.4288951040913168, + "learning_rate": 3.3699421310567272e-06, + "loss": 0.5799, + "step": 8803 + }, + { + "epoch": 2.3371830611973983, + "grad_norm": 0.4218048729184324, + "learning_rate": 3.3696148140608827e-06, + "loss": 0.5625, + "step": 8804 + }, + { + "epoch": 2.337448559670782, + "grad_norm": 0.4144412073712662, + "learning_rate": 3.36928748010542e-06, + "loss": 0.5229, + "step": 8805 + }, + { + "epoch": 2.3377140581441656, + "grad_norm": 0.4299257655900489, + "learning_rate": 3.3689601291967234e-06, + "loss": 0.5725, + "step": 8806 + }, + { + "epoch": 2.3379795566175496, + "grad_norm": 0.420646340682856, + "learning_rate": 3.3686327613411764e-06, + "loss": 0.5738, + "step": 8807 + }, + { + "epoch": 2.3382450550909333, + "grad_norm": 0.4356379548656279, + "learning_rate": 3.3683053765451627e-06, + "loss": 0.5778, + "step": 8808 + }, + { + "epoch": 2.338510553564317, + "grad_norm": 0.43375528050603035, + "learning_rate": 3.36797797481507e-06, + "loss": 0.5702, + "step": 8809 + }, + { + "epoch": 2.3387760520377006, + "grad_norm": 0.42474780495462766, + "learning_rate": 3.367650556157281e-06, + "loss": 0.584, + "step": 8810 + }, + { + "epoch": 2.3390415505110846, + "grad_norm": 0.41703617322444586, + "learning_rate": 3.3673231205781812e-06, + "loss": 0.5694, + "step": 8811 + }, + { + "epoch": 2.3393070489844683, + "grad_norm": 0.42864664088840404, + "learning_rate": 3.3669956680841583e-06, + "loss": 0.583, + "step": 8812 + }, + { + "epoch": 2.339572547457852, + "grad_norm": 0.43901959590019274, + "learning_rate": 3.3666681986815963e-06, + "loss": 0.5774, + "step": 8813 + }, + { + "epoch": 2.339838045931236, + "grad_norm": 0.4427095235203057, + "learning_rate": 3.366340712376882e-06, + "loss": 0.5683, + "step": 8814 + }, + { + "epoch": 2.3401035444046197, + "grad_norm": 0.43391177678379367, + "learning_rate": 3.3660132091764035e-06, + "loss": 0.5916, + "step": 8815 + }, + { + "epoch": 2.3403690428780033, + "grad_norm": 0.43996243402876556, + "learning_rate": 3.3656856890865463e-06, + "loss": 0.5566, + "step": 8816 + }, + { + "epoch": 2.3406345413513874, + "grad_norm": 0.42766544473265805, + "learning_rate": 3.365358152113699e-06, + "loss": 0.5585, + "step": 8817 + }, + { + "epoch": 2.340900039824771, + "grad_norm": 0.4256743187631771, + "learning_rate": 3.365030598264249e-06, + "loss": 0.5485, + "step": 8818 + }, + { + "epoch": 2.3411655382981547, + "grad_norm": 0.42607483077790975, + "learning_rate": 3.364703027544584e-06, + "loss": 0.5502, + "step": 8819 + }, + { + "epoch": 2.3414310367715387, + "grad_norm": 0.4390112592971977, + "learning_rate": 3.3643754399610934e-06, + "loss": 0.5432, + "step": 8820 + }, + { + "epoch": 2.3416965352449224, + "grad_norm": 0.43955638839497163, + "learning_rate": 3.364047835520165e-06, + "loss": 0.5666, + "step": 8821 + }, + { + "epoch": 2.341962033718306, + "grad_norm": 0.4355844466592428, + "learning_rate": 3.3637202142281885e-06, + "loss": 0.558, + "step": 8822 + }, + { + "epoch": 2.34222753219169, + "grad_norm": 0.4360865128593089, + "learning_rate": 3.3633925760915533e-06, + "loss": 0.591, + "step": 8823 + }, + { + "epoch": 2.3424930306650737, + "grad_norm": 0.43913627042112885, + "learning_rate": 3.3630649211166485e-06, + "loss": 0.5242, + "step": 8824 + }, + { + "epoch": 2.3427585291384574, + "grad_norm": 0.4449555994920618, + "learning_rate": 3.362737249309864e-06, + "loss": 0.6171, + "step": 8825 + }, + { + "epoch": 2.3430240276118415, + "grad_norm": 0.42894660818709235, + "learning_rate": 3.3624095606775914e-06, + "loss": 0.517, + "step": 8826 + }, + { + "epoch": 2.343289526085225, + "grad_norm": 0.42691282307265577, + "learning_rate": 3.362081855226221e-06, + "loss": 0.5775, + "step": 8827 + }, + { + "epoch": 2.3435550245586088, + "grad_norm": 0.429486885888311, + "learning_rate": 3.361754132962143e-06, + "loss": 0.5937, + "step": 8828 + }, + { + "epoch": 2.3438205230319924, + "grad_norm": 0.4264673928090679, + "learning_rate": 3.36142639389175e-06, + "loss": 0.5913, + "step": 8829 + }, + { + "epoch": 2.3440860215053765, + "grad_norm": 0.4591669796189079, + "learning_rate": 3.361098638021433e-06, + "loss": 0.5259, + "step": 8830 + }, + { + "epoch": 2.34435151997876, + "grad_norm": 0.434074941188246, + "learning_rate": 3.3607708653575844e-06, + "loss": 0.5774, + "step": 8831 + }, + { + "epoch": 2.3446170184521438, + "grad_norm": 0.43352346559332433, + "learning_rate": 3.360443075906597e-06, + "loss": 0.5648, + "step": 8832 + }, + { + "epoch": 2.344882516925528, + "grad_norm": 0.4489222667996664, + "learning_rate": 3.3601152696748617e-06, + "loss": 0.5534, + "step": 8833 + }, + { + "epoch": 2.3451480153989115, + "grad_norm": 0.4323721948226055, + "learning_rate": 3.3597874466687736e-06, + "loss": 0.5751, + "step": 8834 + }, + { + "epoch": 2.345413513872295, + "grad_norm": 0.43129548976884186, + "learning_rate": 3.359459606894725e-06, + "loss": 0.545, + "step": 8835 + }, + { + "epoch": 2.3456790123456788, + "grad_norm": 0.424979484629881, + "learning_rate": 3.3591317503591093e-06, + "loss": 0.5953, + "step": 8836 + }, + { + "epoch": 2.345944510819063, + "grad_norm": 0.42770430303252044, + "learning_rate": 3.3588038770683207e-06, + "loss": 0.5286, + "step": 8837 + }, + { + "epoch": 2.3462100092924465, + "grad_norm": 0.4328267637116619, + "learning_rate": 3.3584759870287546e-06, + "loss": 0.5774, + "step": 8838 + }, + { + "epoch": 2.34647550776583, + "grad_norm": 0.42777447353551135, + "learning_rate": 3.3581480802468047e-06, + "loss": 0.5749, + "step": 8839 + }, + { + "epoch": 2.3467410062392142, + "grad_norm": 0.4390972021056961, + "learning_rate": 3.3578201567288665e-06, + "loss": 0.5706, + "step": 8840 + }, + { + "epoch": 2.347006504712598, + "grad_norm": 0.4152630591066803, + "learning_rate": 3.357492216481334e-06, + "loss": 0.5676, + "step": 8841 + }, + { + "epoch": 2.3472720031859815, + "grad_norm": 0.4204784542036679, + "learning_rate": 3.3571642595106044e-06, + "loss": 0.5463, + "step": 8842 + }, + { + "epoch": 2.3475375016593656, + "grad_norm": 0.4352861529070829, + "learning_rate": 3.356836285823073e-06, + "loss": 0.5969, + "step": 8843 + }, + { + "epoch": 2.3478030001327492, + "grad_norm": 0.4125679713394544, + "learning_rate": 3.3565082954251352e-06, + "loss": 0.5675, + "step": 8844 + }, + { + "epoch": 2.348068498606133, + "grad_norm": 0.4196060397927863, + "learning_rate": 3.35618028832319e-06, + "loss": 0.5985, + "step": 8845 + }, + { + "epoch": 2.348333997079517, + "grad_norm": 0.4367278845489271, + "learning_rate": 3.355852264523632e-06, + "loss": 0.5223, + "step": 8846 + }, + { + "epoch": 2.3485994955529006, + "grad_norm": 0.4374217443141185, + "learning_rate": 3.3555242240328595e-06, + "loss": 0.5846, + "step": 8847 + }, + { + "epoch": 2.3488649940262842, + "grad_norm": 0.4435691559751568, + "learning_rate": 3.3551961668572697e-06, + "loss": 0.5712, + "step": 8848 + }, + { + "epoch": 2.3491304924996683, + "grad_norm": 0.42569185553728683, + "learning_rate": 3.354868093003262e-06, + "loss": 0.5461, + "step": 8849 + }, + { + "epoch": 2.349395990973052, + "grad_norm": 0.4348053243345388, + "learning_rate": 3.354540002477232e-06, + "loss": 0.5527, + "step": 8850 + }, + { + "epoch": 2.3496614894464356, + "grad_norm": 0.4340360346449056, + "learning_rate": 3.35421189528558e-06, + "loss": 0.5459, + "step": 8851 + }, + { + "epoch": 2.3499269879198197, + "grad_norm": 0.4349564218448799, + "learning_rate": 3.353883771434705e-06, + "loss": 0.5569, + "step": 8852 + }, + { + "epoch": 2.3501924863932033, + "grad_norm": 0.43148989000995414, + "learning_rate": 3.353555630931006e-06, + "loss": 0.5608, + "step": 8853 + }, + { + "epoch": 2.350457984866587, + "grad_norm": 0.4250984511385049, + "learning_rate": 3.3532274737808805e-06, + "loss": 0.5662, + "step": 8854 + }, + { + "epoch": 2.3507234833399706, + "grad_norm": 0.42860272349187517, + "learning_rate": 3.3528992999907323e-06, + "loss": 0.5373, + "step": 8855 + }, + { + "epoch": 2.3509889818133547, + "grad_norm": 0.42901600710336274, + "learning_rate": 3.352571109566958e-06, + "loss": 0.558, + "step": 8856 + }, + { + "epoch": 2.3512544802867383, + "grad_norm": 0.4410004963888209, + "learning_rate": 3.3522429025159606e-06, + "loss": 0.5681, + "step": 8857 + }, + { + "epoch": 2.351519978760122, + "grad_norm": 0.4402728462751001, + "learning_rate": 3.3519146788441405e-06, + "loss": 0.6142, + "step": 8858 + }, + { + "epoch": 2.351785477233506, + "grad_norm": 0.4357818477038254, + "learning_rate": 3.3515864385578974e-06, + "loss": 0.581, + "step": 8859 + }, + { + "epoch": 2.3520509757068897, + "grad_norm": 0.42294156572081576, + "learning_rate": 3.3512581816636336e-06, + "loss": 0.5458, + "step": 8860 + }, + { + "epoch": 2.3523164741802733, + "grad_norm": 0.43484318752635504, + "learning_rate": 3.3509299081677515e-06, + "loss": 0.5599, + "step": 8861 + }, + { + "epoch": 2.3525819726536574, + "grad_norm": 0.4264117234522694, + "learning_rate": 3.350601618076652e-06, + "loss": 0.5748, + "step": 8862 + }, + { + "epoch": 2.352847471127041, + "grad_norm": 0.4350566582975631, + "learning_rate": 3.350273311396739e-06, + "loss": 0.5638, + "step": 8863 + }, + { + "epoch": 2.3531129696004247, + "grad_norm": 0.4263375138965626, + "learning_rate": 3.3499449881344144e-06, + "loss": 0.558, + "step": 8864 + }, + { + "epoch": 2.3533784680738083, + "grad_norm": 0.4281857724682057, + "learning_rate": 3.349616648296081e-06, + "loss": 0.5774, + "step": 8865 + }, + { + "epoch": 2.3536439665471924, + "grad_norm": 0.4438395270856019, + "learning_rate": 3.3492882918881444e-06, + "loss": 0.5666, + "step": 8866 + }, + { + "epoch": 2.353909465020576, + "grad_norm": 0.43144998072188995, + "learning_rate": 3.348959918917005e-06, + "loss": 0.5526, + "step": 8867 + }, + { + "epoch": 2.3541749634939597, + "grad_norm": 0.42157356313686106, + "learning_rate": 3.3486315293890693e-06, + "loss": 0.5285, + "step": 8868 + }, + { + "epoch": 2.354440461967344, + "grad_norm": 0.4225050936733447, + "learning_rate": 3.348303123310741e-06, + "loss": 0.5683, + "step": 8869 + }, + { + "epoch": 2.3547059604407274, + "grad_norm": 0.42698671797859056, + "learning_rate": 3.3479747006884244e-06, + "loss": 0.5582, + "step": 8870 + }, + { + "epoch": 2.354971458914111, + "grad_norm": 0.4369811388184197, + "learning_rate": 3.3476462615285248e-06, + "loss": 0.5948, + "step": 8871 + }, + { + "epoch": 2.355236957387495, + "grad_norm": 0.43584353222099903, + "learning_rate": 3.3473178058374477e-06, + "loss": 0.5605, + "step": 8872 + }, + { + "epoch": 2.355502455860879, + "grad_norm": 0.4342988989388118, + "learning_rate": 3.346989333621599e-06, + "loss": 0.5697, + "step": 8873 + }, + { + "epoch": 2.3557679543342624, + "grad_norm": 0.4323356280350703, + "learning_rate": 3.346660844887385e-06, + "loss": 0.592, + "step": 8874 + }, + { + "epoch": 2.3560334528076465, + "grad_norm": 0.4379984258111358, + "learning_rate": 3.3463323396412113e-06, + "loss": 0.604, + "step": 8875 + }, + { + "epoch": 2.35629895128103, + "grad_norm": 0.4391101883723459, + "learning_rate": 3.3460038178894845e-06, + "loss": 0.579, + "step": 8876 + }, + { + "epoch": 2.356564449754414, + "grad_norm": 0.43034077033188173, + "learning_rate": 3.345675279638612e-06, + "loss": 0.5868, + "step": 8877 + }, + { + "epoch": 2.356829948227798, + "grad_norm": 0.4391422039112972, + "learning_rate": 3.3453467248950013e-06, + "loss": 0.5451, + "step": 8878 + }, + { + "epoch": 2.3570954467011815, + "grad_norm": 0.4388294514393079, + "learning_rate": 3.3450181536650596e-06, + "loss": 0.5736, + "step": 8879 + }, + { + "epoch": 2.357360945174565, + "grad_norm": 0.4469926633619977, + "learning_rate": 3.344689565955194e-06, + "loss": 0.5241, + "step": 8880 + }, + { + "epoch": 2.3576264436479493, + "grad_norm": 0.4306230800009585, + "learning_rate": 3.344360961771815e-06, + "loss": 0.5874, + "step": 8881 + }, + { + "epoch": 2.357891942121333, + "grad_norm": 0.4422450738544373, + "learning_rate": 3.344032341121329e-06, + "loss": 0.5298, + "step": 8882 + }, + { + "epoch": 2.3581574405947165, + "grad_norm": 0.4478856363136004, + "learning_rate": 3.343703704010146e-06, + "loss": 0.5847, + "step": 8883 + }, + { + "epoch": 2.3584229390681, + "grad_norm": 0.42473028395262435, + "learning_rate": 3.3433750504446753e-06, + "loss": 0.5447, + "step": 8884 + }, + { + "epoch": 2.3586884375414843, + "grad_norm": 0.4205272065008639, + "learning_rate": 3.3430463804313262e-06, + "loss": 0.5592, + "step": 8885 + }, + { + "epoch": 2.358953936014868, + "grad_norm": 0.4220914864315717, + "learning_rate": 3.342717693976508e-06, + "loss": 0.5557, + "step": 8886 + }, + { + "epoch": 2.3592194344882516, + "grad_norm": 0.42965589701602974, + "learning_rate": 3.3423889910866324e-06, + "loss": 0.5521, + "step": 8887 + }, + { + "epoch": 2.3594849329616356, + "grad_norm": 0.4383952630417687, + "learning_rate": 3.3420602717681083e-06, + "loss": 0.576, + "step": 8888 + }, + { + "epoch": 2.3597504314350193, + "grad_norm": 0.4502398447236067, + "learning_rate": 3.341731536027347e-06, + "loss": 0.566, + "step": 8889 + }, + { + "epoch": 2.360015929908403, + "grad_norm": 0.4341871617936367, + "learning_rate": 3.3414027838707604e-06, + "loss": 0.5773, + "step": 8890 + }, + { + "epoch": 2.3602814283817866, + "grad_norm": 0.44883184388842, + "learning_rate": 3.341074015304759e-06, + "loss": 0.5832, + "step": 8891 + }, + { + "epoch": 2.3605469268551706, + "grad_norm": 0.4241440165106155, + "learning_rate": 3.340745230335755e-06, + "loss": 0.5778, + "step": 8892 + }, + { + "epoch": 2.3608124253285543, + "grad_norm": 0.44051555281370386, + "learning_rate": 3.3404164289701607e-06, + "loss": 0.5691, + "step": 8893 + }, + { + "epoch": 2.361077923801938, + "grad_norm": 0.4245983160442818, + "learning_rate": 3.3400876112143883e-06, + "loss": 0.5985, + "step": 8894 + }, + { + "epoch": 2.361343422275322, + "grad_norm": 0.4422129724546392, + "learning_rate": 3.3397587770748507e-06, + "loss": 0.5401, + "step": 8895 + }, + { + "epoch": 2.3616089207487057, + "grad_norm": 0.4248377774710493, + "learning_rate": 3.3394299265579606e-06, + "loss": 0.5663, + "step": 8896 + }, + { + "epoch": 2.3618744192220893, + "grad_norm": 0.43193391761416816, + "learning_rate": 3.339101059670132e-06, + "loss": 0.5846, + "step": 8897 + }, + { + "epoch": 2.3621399176954734, + "grad_norm": 0.4506637390449819, + "learning_rate": 3.3387721764177782e-06, + "loss": 0.5784, + "step": 8898 + }, + { + "epoch": 2.362405416168857, + "grad_norm": 0.42713741780447695, + "learning_rate": 3.3384432768073126e-06, + "loss": 0.5747, + "step": 8899 + }, + { + "epoch": 2.3626709146422407, + "grad_norm": 0.4369320504215003, + "learning_rate": 3.3381143608451507e-06, + "loss": 0.5715, + "step": 8900 + }, + { + "epoch": 2.3629364131156247, + "grad_norm": 0.44044415273640597, + "learning_rate": 3.3377854285377072e-06, + "loss": 0.577, + "step": 8901 + }, + { + "epoch": 2.3632019115890084, + "grad_norm": 0.4285197517306018, + "learning_rate": 3.3374564798913955e-06, + "loss": 0.5864, + "step": 8902 + }, + { + "epoch": 2.363467410062392, + "grad_norm": 0.4265317839010594, + "learning_rate": 3.3371275149126325e-06, + "loss": 0.5883, + "step": 8903 + }, + { + "epoch": 2.363732908535776, + "grad_norm": 0.444399158891179, + "learning_rate": 3.336798533607834e-06, + "loss": 0.566, + "step": 8904 + }, + { + "epoch": 2.3639984070091598, + "grad_norm": 0.4353909692618772, + "learning_rate": 3.336469535983415e-06, + "loss": 0.5944, + "step": 8905 + }, + { + "epoch": 2.3642639054825434, + "grad_norm": 0.43745375585501844, + "learning_rate": 3.3361405220457914e-06, + "loss": 0.5818, + "step": 8906 + }, + { + "epoch": 2.3645294039559275, + "grad_norm": 0.4270421377911294, + "learning_rate": 3.335811491801381e-06, + "loss": 0.5801, + "step": 8907 + }, + { + "epoch": 2.364794902429311, + "grad_norm": 0.43307497995833316, + "learning_rate": 3.3354824452566e-06, + "loss": 0.5629, + "step": 8908 + }, + { + "epoch": 2.3650604009026948, + "grad_norm": 0.42055685182281116, + "learning_rate": 3.335153382417865e-06, + "loss": 0.5474, + "step": 8909 + }, + { + "epoch": 2.3653258993760784, + "grad_norm": 0.43408031922165485, + "learning_rate": 3.334824303291595e-06, + "loss": 0.5978, + "step": 8910 + }, + { + "epoch": 2.3655913978494625, + "grad_norm": 0.43897263764473665, + "learning_rate": 3.3344952078842063e-06, + "loss": 0.5654, + "step": 8911 + }, + { + "epoch": 2.365856896322846, + "grad_norm": 0.4252298461587109, + "learning_rate": 3.3341660962021195e-06, + "loss": 0.524, + "step": 8912 + }, + { + "epoch": 2.3661223947962298, + "grad_norm": 0.43166903261059497, + "learning_rate": 3.33383696825175e-06, + "loss": 0.5541, + "step": 8913 + }, + { + "epoch": 2.366387893269614, + "grad_norm": 0.4401792696403206, + "learning_rate": 3.333507824039518e-06, + "loss": 0.5506, + "step": 8914 + }, + { + "epoch": 2.3666533917429975, + "grad_norm": 0.4187047529131064, + "learning_rate": 3.3331786635718433e-06, + "loss": 0.5692, + "step": 8915 + }, + { + "epoch": 2.366918890216381, + "grad_norm": 0.43420256865427387, + "learning_rate": 3.3328494868551444e-06, + "loss": 0.5473, + "step": 8916 + }, + { + "epoch": 2.367184388689765, + "grad_norm": 0.42722773946391224, + "learning_rate": 3.332520293895841e-06, + "loss": 0.5822, + "step": 8917 + }, + { + "epoch": 2.367449887163149, + "grad_norm": 0.4412444705009898, + "learning_rate": 3.3321910847003543e-06, + "loss": 0.5729, + "step": 8918 + }, + { + "epoch": 2.3677153856365325, + "grad_norm": 0.424083896091279, + "learning_rate": 3.331861859275103e-06, + "loss": 0.5838, + "step": 8919 + }, + { + "epoch": 2.367980884109916, + "grad_norm": 0.4529796841772629, + "learning_rate": 3.3315326176265094e-06, + "loss": 0.5751, + "step": 8920 + }, + { + "epoch": 2.3682463825833002, + "grad_norm": 0.4494633823009935, + "learning_rate": 3.331203359760994e-06, + "loss": 0.588, + "step": 8921 + }, + { + "epoch": 2.368511881056684, + "grad_norm": 0.4295703481885404, + "learning_rate": 3.330874085684977e-06, + "loss": 0.5599, + "step": 8922 + }, + { + "epoch": 2.3687773795300675, + "grad_norm": 0.4273346374535615, + "learning_rate": 3.3305447954048814e-06, + "loss": 0.5675, + "step": 8923 + }, + { + "epoch": 2.3690428780034516, + "grad_norm": 0.47480264179759235, + "learning_rate": 3.3302154889271294e-06, + "loss": 0.5344, + "step": 8924 + }, + { + "epoch": 2.3693083764768352, + "grad_norm": 0.43136636991689664, + "learning_rate": 3.329886166258142e-06, + "loss": 0.5399, + "step": 8925 + }, + { + "epoch": 2.369573874950219, + "grad_norm": 0.4319479532477483, + "learning_rate": 3.3295568274043422e-06, + "loss": 0.5283, + "step": 8926 + }, + { + "epoch": 2.369839373423603, + "grad_norm": 0.438441439982824, + "learning_rate": 3.3292274723721536e-06, + "loss": 0.5275, + "step": 8927 + }, + { + "epoch": 2.3701048718969866, + "grad_norm": 0.44471836262488756, + "learning_rate": 3.328898101167999e-06, + "loss": 0.5776, + "step": 8928 + }, + { + "epoch": 2.3703703703703702, + "grad_norm": 0.4289057332067796, + "learning_rate": 3.3285687137983016e-06, + "loss": 0.5726, + "step": 8929 + }, + { + "epoch": 2.3706358688437543, + "grad_norm": 0.4470900182030393, + "learning_rate": 3.328239310269486e-06, + "loss": 0.5873, + "step": 8930 + }, + { + "epoch": 2.370901367317138, + "grad_norm": 0.4541801082005248, + "learning_rate": 3.327909890587976e-06, + "loss": 0.6118, + "step": 8931 + }, + { + "epoch": 2.3711668657905216, + "grad_norm": 0.4349202020085466, + "learning_rate": 3.3275804547601953e-06, + "loss": 0.5738, + "step": 8932 + }, + { + "epoch": 2.3714323642639057, + "grad_norm": 0.42925759601253943, + "learning_rate": 3.3272510027925707e-06, + "loss": 0.5672, + "step": 8933 + }, + { + "epoch": 2.3716978627372893, + "grad_norm": 0.4494786758818938, + "learning_rate": 3.326921534691525e-06, + "loss": 0.6012, + "step": 8934 + }, + { + "epoch": 2.371963361210673, + "grad_norm": 0.4352010167679763, + "learning_rate": 3.3265920504634852e-06, + "loss": 0.5525, + "step": 8935 + }, + { + "epoch": 2.372228859684057, + "grad_norm": 0.41406772268010444, + "learning_rate": 3.3262625501148764e-06, + "loss": 0.5448, + "step": 8936 + }, + { + "epoch": 2.3724943581574407, + "grad_norm": 0.41525166007121395, + "learning_rate": 3.3259330336521252e-06, + "loss": 0.533, + "step": 8937 + }, + { + "epoch": 2.3727598566308243, + "grad_norm": 0.42046505309234083, + "learning_rate": 3.3256035010816577e-06, + "loss": 0.592, + "step": 8938 + }, + { + "epoch": 2.373025355104208, + "grad_norm": 0.4159543881754008, + "learning_rate": 3.3252739524098997e-06, + "loss": 0.5918, + "step": 8939 + }, + { + "epoch": 2.373290853577592, + "grad_norm": 0.431125485660253, + "learning_rate": 3.3249443876432795e-06, + "loss": 0.5434, + "step": 8940 + }, + { + "epoch": 2.3735563520509757, + "grad_norm": 0.4430481936721721, + "learning_rate": 3.3246148067882243e-06, + "loss": 0.5706, + "step": 8941 + }, + { + "epoch": 2.3738218505243593, + "grad_norm": 0.42744908399488074, + "learning_rate": 3.3242852098511607e-06, + "loss": 0.563, + "step": 8942 + }, + { + "epoch": 2.3740873489977434, + "grad_norm": 0.4302090306120013, + "learning_rate": 3.323955596838518e-06, + "loss": 0.5837, + "step": 8943 + }, + { + "epoch": 2.374352847471127, + "grad_norm": 0.4278112365724571, + "learning_rate": 3.3236259677567232e-06, + "loss": 0.544, + "step": 8944 + }, + { + "epoch": 2.3746183459445107, + "grad_norm": 0.438050020826379, + "learning_rate": 3.323296322612205e-06, + "loss": 0.581, + "step": 8945 + }, + { + "epoch": 2.3748838444178944, + "grad_norm": 0.4499585228578101, + "learning_rate": 3.322966661411393e-06, + "loss": 0.5821, + "step": 8946 + }, + { + "epoch": 2.3751493428912784, + "grad_norm": 0.45782669341340915, + "learning_rate": 3.3226369841607174e-06, + "loss": 0.5765, + "step": 8947 + }, + { + "epoch": 2.375414841364662, + "grad_norm": 0.43309585800248296, + "learning_rate": 3.3223072908666053e-06, + "loss": 0.5787, + "step": 8948 + }, + { + "epoch": 2.3756803398380457, + "grad_norm": 0.43769625362966325, + "learning_rate": 3.3219775815354875e-06, + "loss": 0.5361, + "step": 8949 + }, + { + "epoch": 2.37594583831143, + "grad_norm": 0.44221921238496265, + "learning_rate": 3.321647856173795e-06, + "loss": 0.6124, + "step": 8950 + }, + { + "epoch": 2.3762113367848134, + "grad_norm": 0.43686696963561916, + "learning_rate": 3.3213181147879576e-06, + "loss": 0.5452, + "step": 8951 + }, + { + "epoch": 2.376476835258197, + "grad_norm": 0.42803595460610067, + "learning_rate": 3.3209883573844052e-06, + "loss": 0.5939, + "step": 8952 + }, + { + "epoch": 2.376742333731581, + "grad_norm": 0.42659580352178766, + "learning_rate": 3.3206585839695703e-06, + "loss": 0.6101, + "step": 8953 + }, + { + "epoch": 2.377007832204965, + "grad_norm": 0.42756629395576273, + "learning_rate": 3.3203287945498843e-06, + "loss": 0.5711, + "step": 8954 + }, + { + "epoch": 2.3772733306783485, + "grad_norm": 0.42729382465879334, + "learning_rate": 3.3199989891317763e-06, + "loss": 0.5595, + "step": 8955 + }, + { + "epoch": 2.3775388291517325, + "grad_norm": 0.4397539626090969, + "learning_rate": 3.319669167721682e-06, + "loss": 0.5205, + "step": 8956 + }, + { + "epoch": 2.377804327625116, + "grad_norm": 0.4431624990374816, + "learning_rate": 3.3193393303260317e-06, + "loss": 0.6023, + "step": 8957 + }, + { + "epoch": 2.3780698260985, + "grad_norm": 0.42947122560393286, + "learning_rate": 3.319009476951258e-06, + "loss": 0.6097, + "step": 8958 + }, + { + "epoch": 2.378335324571884, + "grad_norm": 0.42861413870650317, + "learning_rate": 3.318679607603795e-06, + "loss": 0.5385, + "step": 8959 + }, + { + "epoch": 2.3786008230452675, + "grad_norm": 0.4386827420592484, + "learning_rate": 3.3183497222900747e-06, + "loss": 0.531, + "step": 8960 + }, + { + "epoch": 2.378866321518651, + "grad_norm": 0.43612203246490566, + "learning_rate": 3.3180198210165316e-06, + "loss": 0.5716, + "step": 8961 + }, + { + "epoch": 2.3791318199920353, + "grad_norm": 0.4254797625268863, + "learning_rate": 3.317689903789598e-06, + "loss": 0.5348, + "step": 8962 + }, + { + "epoch": 2.379397318465419, + "grad_norm": 0.4392126814240156, + "learning_rate": 3.31735997061571e-06, + "loss": 0.5507, + "step": 8963 + }, + { + "epoch": 2.3796628169388025, + "grad_norm": 0.4447925197687243, + "learning_rate": 3.3170300215013013e-06, + "loss": 0.6066, + "step": 8964 + }, + { + "epoch": 2.379928315412186, + "grad_norm": 0.45156181342584906, + "learning_rate": 3.316700056452806e-06, + "loss": 0.5493, + "step": 8965 + }, + { + "epoch": 2.3801938138855703, + "grad_norm": 0.4347426934114723, + "learning_rate": 3.3163700754766607e-06, + "loss": 0.5713, + "step": 8966 + }, + { + "epoch": 2.380459312358954, + "grad_norm": 0.4295693372840132, + "learning_rate": 3.3160400785793007e-06, + "loss": 0.5383, + "step": 8967 + }, + { + "epoch": 2.3807248108323376, + "grad_norm": 0.4259879973654942, + "learning_rate": 3.3157100657671606e-06, + "loss": 0.524, + "step": 8968 + }, + { + "epoch": 2.3809903093057216, + "grad_norm": 0.4635457295863235, + "learning_rate": 3.3153800370466766e-06, + "loss": 0.5704, + "step": 8969 + }, + { + "epoch": 2.3812558077791053, + "grad_norm": 0.41704338076713193, + "learning_rate": 3.315049992424286e-06, + "loss": 0.5755, + "step": 8970 + }, + { + "epoch": 2.381521306252489, + "grad_norm": 0.4413481594447607, + "learning_rate": 3.314719931906425e-06, + "loss": 0.5714, + "step": 8971 + }, + { + "epoch": 2.381786804725873, + "grad_norm": 0.44544683772216187, + "learning_rate": 3.31438985549953e-06, + "loss": 0.5694, + "step": 8972 + }, + { + "epoch": 2.3820523031992566, + "grad_norm": 0.44183917269096257, + "learning_rate": 3.314059763210039e-06, + "loss": 0.5452, + "step": 8973 + }, + { + "epoch": 2.3823178016726403, + "grad_norm": 0.42772579390770954, + "learning_rate": 3.31372965504439e-06, + "loss": 0.5454, + "step": 8974 + }, + { + "epoch": 2.382583300146024, + "grad_norm": 0.438316228849524, + "learning_rate": 3.3133995310090196e-06, + "loss": 0.5667, + "step": 8975 + }, + { + "epoch": 2.382848798619408, + "grad_norm": 0.41846680898985295, + "learning_rate": 3.3130693911103672e-06, + "loss": 0.5691, + "step": 8976 + }, + { + "epoch": 2.3831142970927917, + "grad_norm": 0.4377084618128971, + "learning_rate": 3.312739235354871e-06, + "loss": 0.5699, + "step": 8977 + }, + { + "epoch": 2.3833797955661753, + "grad_norm": 0.436086779882674, + "learning_rate": 3.3124090637489696e-06, + "loss": 0.5748, + "step": 8978 + }, + { + "epoch": 2.3836452940395594, + "grad_norm": 0.43479769510778893, + "learning_rate": 3.312078876299103e-06, + "loss": 0.5728, + "step": 8979 + }, + { + "epoch": 2.383910792512943, + "grad_norm": 0.4354054568009158, + "learning_rate": 3.3117486730117092e-06, + "loss": 0.5927, + "step": 8980 + }, + { + "epoch": 2.3841762909863267, + "grad_norm": 0.44673025767342245, + "learning_rate": 3.3114184538932294e-06, + "loss": 0.537, + "step": 8981 + }, + { + "epoch": 2.3844417894597107, + "grad_norm": 0.430354988224778, + "learning_rate": 3.3110882189501022e-06, + "loss": 0.5895, + "step": 8982 + }, + { + "epoch": 2.3847072879330944, + "grad_norm": 0.4386886765841535, + "learning_rate": 3.310757968188769e-06, + "loss": 0.5945, + "step": 8983 + }, + { + "epoch": 2.384972786406478, + "grad_norm": 0.43263691756984424, + "learning_rate": 3.3104277016156707e-06, + "loss": 0.551, + "step": 8984 + }, + { + "epoch": 2.385238284879862, + "grad_norm": 0.45993801063373413, + "learning_rate": 3.3100974192372484e-06, + "loss": 0.5549, + "step": 8985 + }, + { + "epoch": 2.3855037833532458, + "grad_norm": 0.4321727250108383, + "learning_rate": 3.3097671210599425e-06, + "loss": 0.5553, + "step": 8986 + }, + { + "epoch": 2.3857692818266294, + "grad_norm": 0.4290232568933907, + "learning_rate": 3.309436807090195e-06, + "loss": 0.5697, + "step": 8987 + }, + { + "epoch": 2.3860347803000135, + "grad_norm": 0.4227054942908084, + "learning_rate": 3.309106477334448e-06, + "loss": 0.5497, + "step": 8988 + }, + { + "epoch": 2.386300278773397, + "grad_norm": 0.44266586522443746, + "learning_rate": 3.3087761317991433e-06, + "loss": 0.5739, + "step": 8989 + }, + { + "epoch": 2.3865657772467808, + "grad_norm": 0.418396380667841, + "learning_rate": 3.3084457704907245e-06, + "loss": 0.5674, + "step": 8990 + }, + { + "epoch": 2.386831275720165, + "grad_norm": 0.4473386595077829, + "learning_rate": 3.308115393415633e-06, + "loss": 0.5921, + "step": 8991 + }, + { + "epoch": 2.3870967741935485, + "grad_norm": 0.4335663370808527, + "learning_rate": 3.307785000580313e-06, + "loss": 0.5646, + "step": 8992 + }, + { + "epoch": 2.387362272666932, + "grad_norm": 0.4429002140683974, + "learning_rate": 3.3074545919912083e-06, + "loss": 0.5924, + "step": 8993 + }, + { + "epoch": 2.3876277711403158, + "grad_norm": 0.4359265006846511, + "learning_rate": 3.3071241676547617e-06, + "loss": 0.5409, + "step": 8994 + }, + { + "epoch": 2.3878932696137, + "grad_norm": 0.44248286040260887, + "learning_rate": 3.306793727577417e-06, + "loss": 0.5736, + "step": 8995 + }, + { + "epoch": 2.3881587680870835, + "grad_norm": 0.4344038649302684, + "learning_rate": 3.3064632717656203e-06, + "loss": 0.5784, + "step": 8996 + }, + { + "epoch": 2.388424266560467, + "grad_norm": 0.44642269304610177, + "learning_rate": 3.3061328002258143e-06, + "loss": 0.5712, + "step": 8997 + }, + { + "epoch": 2.388689765033851, + "grad_norm": 0.43630233992230993, + "learning_rate": 3.3058023129644458e-06, + "loss": 0.5527, + "step": 8998 + }, + { + "epoch": 2.388955263507235, + "grad_norm": 0.43482513745974766, + "learning_rate": 3.305471809987959e-06, + "loss": 0.5487, + "step": 8999 + }, + { + "epoch": 2.3892207619806185, + "grad_norm": 0.43528551937940396, + "learning_rate": 3.3051412913027993e-06, + "loss": 0.5508, + "step": 9000 + }, + { + "epoch": 2.389486260454002, + "grad_norm": 0.4164359446587163, + "learning_rate": 3.3048107569154124e-06, + "loss": 0.5574, + "step": 9001 + }, + { + "epoch": 2.3897517589273862, + "grad_norm": 0.43107175165329814, + "learning_rate": 3.304480206832247e-06, + "loss": 0.5529, + "step": 9002 + }, + { + "epoch": 2.39001725740077, + "grad_norm": 0.43958148467125346, + "learning_rate": 3.304149641059747e-06, + "loss": 0.5407, + "step": 9003 + }, + { + "epoch": 2.3902827558741535, + "grad_norm": 0.44814532759915765, + "learning_rate": 3.3038190596043595e-06, + "loss": 0.5925, + "step": 9004 + }, + { + "epoch": 2.3905482543475376, + "grad_norm": 0.42870170473471575, + "learning_rate": 3.303488462472534e-06, + "loss": 0.5591, + "step": 9005 + }, + { + "epoch": 2.3908137528209212, + "grad_norm": 0.4362208141538646, + "learning_rate": 3.3031578496707144e-06, + "loss": 0.5746, + "step": 9006 + }, + { + "epoch": 2.391079251294305, + "grad_norm": 0.4315727270725072, + "learning_rate": 3.302827221205351e-06, + "loss": 0.5864, + "step": 9007 + }, + { + "epoch": 2.391344749767689, + "grad_norm": 0.4252190510641299, + "learning_rate": 3.3024965770828914e-06, + "loss": 0.5889, + "step": 9008 + }, + { + "epoch": 2.3916102482410726, + "grad_norm": 0.45563921278842917, + "learning_rate": 3.302165917309783e-06, + "loss": 0.5814, + "step": 9009 + }, + { + "epoch": 2.3918757467144562, + "grad_norm": 0.4336231217139674, + "learning_rate": 3.301835241892476e-06, + "loss": 0.5592, + "step": 9010 + }, + { + "epoch": 2.3921412451878403, + "grad_norm": 0.4393491909832698, + "learning_rate": 3.3015045508374177e-06, + "loss": 0.5736, + "step": 9011 + }, + { + "epoch": 2.392406743661224, + "grad_norm": 0.4433823245956819, + "learning_rate": 3.301173844151059e-06, + "loss": 0.5749, + "step": 9012 + }, + { + "epoch": 2.3926722421346076, + "grad_norm": 0.42753774010277257, + "learning_rate": 3.3008431218398483e-06, + "loss": 0.5702, + "step": 9013 + }, + { + "epoch": 2.3929377406079917, + "grad_norm": 0.4637681095886912, + "learning_rate": 3.300512383910236e-06, + "loss": 0.5573, + "step": 9014 + }, + { + "epoch": 2.3932032390813753, + "grad_norm": 0.450509250472613, + "learning_rate": 3.3001816303686718e-06, + "loss": 0.5822, + "step": 9015 + }, + { + "epoch": 2.393468737554759, + "grad_norm": 0.46193271022882076, + "learning_rate": 3.299850861221608e-06, + "loss": 0.5783, + "step": 9016 + }, + { + "epoch": 2.393734236028143, + "grad_norm": 0.43633129242591095, + "learning_rate": 3.2995200764754924e-06, + "loss": 0.5655, + "step": 9017 + }, + { + "epoch": 2.3939997345015267, + "grad_norm": 0.4248732573455485, + "learning_rate": 3.299189276136778e-06, + "loss": 0.557, + "step": 9018 + }, + { + "epoch": 2.3942652329749103, + "grad_norm": 0.44045966667493264, + "learning_rate": 3.2988584602119166e-06, + "loss": 0.5681, + "step": 9019 + }, + { + "epoch": 2.394530731448294, + "grad_norm": 0.4373104490205752, + "learning_rate": 3.298527628707359e-06, + "loss": 0.6011, + "step": 9020 + }, + { + "epoch": 2.394796229921678, + "grad_norm": 0.43071368277808325, + "learning_rate": 3.2981967816295578e-06, + "loss": 0.5369, + "step": 9021 + }, + { + "epoch": 2.3950617283950617, + "grad_norm": 0.43101590793783506, + "learning_rate": 3.2978659189849647e-06, + "loss": 0.5302, + "step": 9022 + }, + { + "epoch": 2.3953272268684453, + "grad_norm": 0.4306973182771112, + "learning_rate": 3.2975350407800333e-06, + "loss": 0.5344, + "step": 9023 + }, + { + "epoch": 2.3955927253418294, + "grad_norm": 0.43845580504732257, + "learning_rate": 3.297204147021215e-06, + "loss": 0.592, + "step": 9024 + }, + { + "epoch": 2.395858223815213, + "grad_norm": 0.43915433873623594, + "learning_rate": 3.2968732377149648e-06, + "loss": 0.5913, + "step": 9025 + }, + { + "epoch": 2.3961237222885967, + "grad_norm": 0.4413209439489076, + "learning_rate": 3.2965423128677347e-06, + "loss": 0.5574, + "step": 9026 + }, + { + "epoch": 2.396389220761981, + "grad_norm": 0.43644401063689026, + "learning_rate": 3.296211372485979e-06, + "loss": 0.5492, + "step": 9027 + }, + { + "epoch": 2.3966547192353644, + "grad_norm": 0.4386200850754053, + "learning_rate": 3.295880416576153e-06, + "loss": 0.5719, + "step": 9028 + }, + { + "epoch": 2.396920217708748, + "grad_norm": 0.43313383845216535, + "learning_rate": 3.295549445144709e-06, + "loss": 0.5846, + "step": 9029 + }, + { + "epoch": 2.3971857161821317, + "grad_norm": 0.43967996766675765, + "learning_rate": 3.295218458198104e-06, + "loss": 0.5699, + "step": 9030 + }, + { + "epoch": 2.397451214655516, + "grad_norm": 0.45365517654404003, + "learning_rate": 3.2948874557427923e-06, + "loss": 0.5412, + "step": 9031 + }, + { + "epoch": 2.3977167131288994, + "grad_norm": 0.44637822687623785, + "learning_rate": 3.294556437785228e-06, + "loss": 0.5862, + "step": 9032 + }, + { + "epoch": 2.397982211602283, + "grad_norm": 0.43278537443010157, + "learning_rate": 3.2942254043318688e-06, + "loss": 0.5635, + "step": 9033 + }, + { + "epoch": 2.398247710075667, + "grad_norm": 0.44873148898784226, + "learning_rate": 3.2938943553891687e-06, + "loss": 0.5448, + "step": 9034 + }, + { + "epoch": 2.398513208549051, + "grad_norm": 0.4516706392984563, + "learning_rate": 3.2935632909635852e-06, + "loss": 0.5497, + "step": 9035 + }, + { + "epoch": 2.3987787070224345, + "grad_norm": 0.43999825675334375, + "learning_rate": 3.2932322110615744e-06, + "loss": 0.596, + "step": 9036 + }, + { + "epoch": 2.3990442054958185, + "grad_norm": 0.4410207508138358, + "learning_rate": 3.292901115689593e-06, + "loss": 0.5538, + "step": 9037 + }, + { + "epoch": 2.399309703969202, + "grad_norm": 0.44325866107186057, + "learning_rate": 3.2925700048540985e-06, + "loss": 0.5582, + "step": 9038 + }, + { + "epoch": 2.399575202442586, + "grad_norm": 0.44671382021661027, + "learning_rate": 3.29223887856155e-06, + "loss": 0.5958, + "step": 9039 + }, + { + "epoch": 2.39984070091597, + "grad_norm": 0.4399975235460901, + "learning_rate": 3.2919077368184023e-06, + "loss": 0.579, + "step": 9040 + }, + { + "epoch": 2.4001061993893535, + "grad_norm": 0.44199927150768964, + "learning_rate": 3.2915765796311143e-06, + "loss": 0.5921, + "step": 9041 + }, + { + "epoch": 2.400371697862737, + "grad_norm": 0.4468000911483779, + "learning_rate": 3.291245407006146e-06, + "loss": 0.5704, + "step": 9042 + }, + { + "epoch": 2.4006371963361213, + "grad_norm": 0.4283960643988952, + "learning_rate": 3.2909142189499545e-06, + "loss": 0.5535, + "step": 9043 + }, + { + "epoch": 2.400902694809505, + "grad_norm": 0.4474830594423144, + "learning_rate": 3.290583015468999e-06, + "loss": 0.5276, + "step": 9044 + }, + { + "epoch": 2.4011681932828886, + "grad_norm": 0.4387095466128506, + "learning_rate": 3.29025179656974e-06, + "loss": 0.5579, + "step": 9045 + }, + { + "epoch": 2.4014336917562726, + "grad_norm": 0.439554860831197, + "learning_rate": 3.289920562258635e-06, + "loss": 0.5366, + "step": 9046 + }, + { + "epoch": 2.4016991902296563, + "grad_norm": 0.4256204999554291, + "learning_rate": 3.2895893125421447e-06, + "loss": 0.5503, + "step": 9047 + }, + { + "epoch": 2.40196468870304, + "grad_norm": 0.4286274143110022, + "learning_rate": 3.2892580474267305e-06, + "loss": 0.5741, + "step": 9048 + }, + { + "epoch": 2.4022301871764236, + "grad_norm": 0.43799657315997353, + "learning_rate": 3.288926766918852e-06, + "loss": 0.5729, + "step": 9049 + }, + { + "epoch": 2.4024956856498076, + "grad_norm": 0.45387084500085684, + "learning_rate": 3.288595471024969e-06, + "loss": 0.5912, + "step": 9050 + }, + { + "epoch": 2.4027611841231913, + "grad_norm": 0.46089971493613646, + "learning_rate": 3.288264159751544e-06, + "loss": 0.5887, + "step": 9051 + }, + { + "epoch": 2.403026682596575, + "grad_norm": 0.45149176031626753, + "learning_rate": 3.287932833105038e-06, + "loss": 0.5636, + "step": 9052 + }, + { + "epoch": 2.403292181069959, + "grad_norm": 0.456286986382293, + "learning_rate": 3.2876014910919125e-06, + "loss": 0.5764, + "step": 9053 + }, + { + "epoch": 2.4035576795433427, + "grad_norm": 0.4312070695857693, + "learning_rate": 3.2872701337186298e-06, + "loss": 0.5504, + "step": 9054 + }, + { + "epoch": 2.4038231780167263, + "grad_norm": 0.43698954704816434, + "learning_rate": 3.2869387609916514e-06, + "loss": 0.5831, + "step": 9055 + }, + { + "epoch": 2.40408867649011, + "grad_norm": 0.45754160345467837, + "learning_rate": 3.286607372917441e-06, + "loss": 0.5648, + "step": 9056 + }, + { + "epoch": 2.404354174963494, + "grad_norm": 0.4458020898123882, + "learning_rate": 3.2862759695024605e-06, + "loss": 0.5753, + "step": 9057 + }, + { + "epoch": 2.4046196734368777, + "grad_norm": 0.4301693943844551, + "learning_rate": 3.285944550753173e-06, + "loss": 0.5454, + "step": 9058 + }, + { + "epoch": 2.4048851719102613, + "grad_norm": 0.42831206217370205, + "learning_rate": 3.2856131166760438e-06, + "loss": 0.5809, + "step": 9059 + }, + { + "epoch": 2.4051506703836454, + "grad_norm": 0.42342340460670735, + "learning_rate": 3.285281667277534e-06, + "loss": 0.569, + "step": 9060 + }, + { + "epoch": 2.405416168857029, + "grad_norm": 0.42558313523624053, + "learning_rate": 3.284950202564109e-06, + "loss": 0.5475, + "step": 9061 + }, + { + "epoch": 2.4056816673304127, + "grad_norm": 0.43462346314129197, + "learning_rate": 3.2846187225422343e-06, + "loss": 0.5627, + "step": 9062 + }, + { + "epoch": 2.4059471658037968, + "grad_norm": 0.4390428636695656, + "learning_rate": 3.2842872272183723e-06, + "loss": 0.5833, + "step": 9063 + }, + { + "epoch": 2.4062126642771804, + "grad_norm": 0.42980440260601255, + "learning_rate": 3.283955716598989e-06, + "loss": 0.544, + "step": 9064 + }, + { + "epoch": 2.406478162750564, + "grad_norm": 0.4311810242230458, + "learning_rate": 3.283624190690551e-06, + "loss": 0.5428, + "step": 9065 + }, + { + "epoch": 2.406743661223948, + "grad_norm": 0.42029966084695025, + "learning_rate": 3.2832926494995215e-06, + "loss": 0.5303, + "step": 9066 + }, + { + "epoch": 2.4070091596973318, + "grad_norm": 0.42526557041297225, + "learning_rate": 3.282961093032368e-06, + "loss": 0.5637, + "step": 9067 + }, + { + "epoch": 2.4072746581707154, + "grad_norm": 0.42468257885172217, + "learning_rate": 3.282629521295556e-06, + "loss": 0.5596, + "step": 9068 + }, + { + "epoch": 2.4075401566440995, + "grad_norm": 0.4307766994261409, + "learning_rate": 3.2822979342955525e-06, + "loss": 0.5756, + "step": 9069 + }, + { + "epoch": 2.407805655117483, + "grad_norm": 0.4338800409124982, + "learning_rate": 3.2819663320388233e-06, + "loss": 0.5513, + "step": 9070 + }, + { + "epoch": 2.4080711535908668, + "grad_norm": 0.4497810731846223, + "learning_rate": 3.2816347145318367e-06, + "loss": 0.5388, + "step": 9071 + }, + { + "epoch": 2.408336652064251, + "grad_norm": 0.4374456560631929, + "learning_rate": 3.2813030817810587e-06, + "loss": 0.5981, + "step": 9072 + }, + { + "epoch": 2.4086021505376345, + "grad_norm": 0.4301494899973307, + "learning_rate": 3.2809714337929584e-06, + "loss": 0.584, + "step": 9073 + }, + { + "epoch": 2.408867649011018, + "grad_norm": 0.42908340940462375, + "learning_rate": 3.280639770574002e-06, + "loss": 0.5518, + "step": 9074 + }, + { + "epoch": 2.4091331474844018, + "grad_norm": 0.4488992569860736, + "learning_rate": 3.28030809213066e-06, + "loss": 0.5796, + "step": 9075 + }, + { + "epoch": 2.409398645957786, + "grad_norm": 0.4387102985845898, + "learning_rate": 3.2799763984693982e-06, + "loss": 0.5325, + "step": 9076 + }, + { + "epoch": 2.4096641444311695, + "grad_norm": 0.42057295179418563, + "learning_rate": 3.2796446895966883e-06, + "loss": 0.5471, + "step": 9077 + }, + { + "epoch": 2.409929642904553, + "grad_norm": 0.4268868337030098, + "learning_rate": 3.279312965518997e-06, + "loss": 0.568, + "step": 9078 + }, + { + "epoch": 2.4101951413779372, + "grad_norm": 0.44955924659503443, + "learning_rate": 3.2789812262427954e-06, + "loss": 0.5708, + "step": 9079 + }, + { + "epoch": 2.410460639851321, + "grad_norm": 0.4440585093204899, + "learning_rate": 3.278649471774553e-06, + "loss": 0.5546, + "step": 9080 + }, + { + "epoch": 2.4107261383247045, + "grad_norm": 0.43300766950418235, + "learning_rate": 3.2783177021207387e-06, + "loss": 0.5644, + "step": 9081 + }, + { + "epoch": 2.4109916367980886, + "grad_norm": 0.43560153230019805, + "learning_rate": 3.2779859172878238e-06, + "loss": 0.5752, + "step": 9082 + }, + { + "epoch": 2.4112571352714722, + "grad_norm": 0.4378306348115646, + "learning_rate": 3.2776541172822783e-06, + "loss": 0.5631, + "step": 9083 + }, + { + "epoch": 2.411522633744856, + "grad_norm": 0.45888931326126037, + "learning_rate": 3.277322302110574e-06, + "loss": 0.5527, + "step": 9084 + }, + { + "epoch": 2.4117881322182395, + "grad_norm": 0.43992651384570824, + "learning_rate": 3.2769904717791818e-06, + "loss": 0.5663, + "step": 9085 + }, + { + "epoch": 2.4120536306916236, + "grad_norm": 0.42832766778053155, + "learning_rate": 3.276658626294572e-06, + "loss": 0.5625, + "step": 9086 + }, + { + "epoch": 2.4123191291650072, + "grad_norm": 0.43847995820519775, + "learning_rate": 3.276326765663218e-06, + "loss": 0.5542, + "step": 9087 + }, + { + "epoch": 2.412584627638391, + "grad_norm": 0.4441117513484502, + "learning_rate": 3.2759948898915916e-06, + "loss": 0.5731, + "step": 9088 + }, + { + "epoch": 2.412850126111775, + "grad_norm": 0.4343526164747089, + "learning_rate": 3.275662998986165e-06, + "loss": 0.553, + "step": 9089 + }, + { + "epoch": 2.4131156245851586, + "grad_norm": 0.4402587746978412, + "learning_rate": 3.27533109295341e-06, + "loss": 0.5685, + "step": 9090 + }, + { + "epoch": 2.4133811230585422, + "grad_norm": 0.42444058077186153, + "learning_rate": 3.2749991717998007e-06, + "loss": 0.5799, + "step": 9091 + }, + { + "epoch": 2.4136466215319263, + "grad_norm": 0.43820718119411045, + "learning_rate": 3.27466723553181e-06, + "loss": 0.5804, + "step": 9092 + }, + { + "epoch": 2.41391212000531, + "grad_norm": 0.43654068591366924, + "learning_rate": 3.274335284155911e-06, + "loss": 0.5783, + "step": 9093 + }, + { + "epoch": 2.4141776184786936, + "grad_norm": 0.43532639651992966, + "learning_rate": 3.2740033176785792e-06, + "loss": 0.5932, + "step": 9094 + }, + { + "epoch": 2.4144431169520777, + "grad_norm": 0.4348176848811108, + "learning_rate": 3.2736713361062873e-06, + "loss": 0.5648, + "step": 9095 + }, + { + "epoch": 2.4147086154254613, + "grad_norm": 0.4372382741372949, + "learning_rate": 3.2733393394455094e-06, + "loss": 0.5569, + "step": 9096 + }, + { + "epoch": 2.414974113898845, + "grad_norm": 0.41345614969053285, + "learning_rate": 3.273007327702722e-06, + "loss": 0.5488, + "step": 9097 + }, + { + "epoch": 2.415239612372229, + "grad_norm": 0.4395229362251379, + "learning_rate": 3.2726753008843983e-06, + "loss": 0.5766, + "step": 9098 + }, + { + "epoch": 2.4155051108456127, + "grad_norm": 0.43011577391031586, + "learning_rate": 3.2723432589970137e-06, + "loss": 0.5725, + "step": 9099 + }, + { + "epoch": 2.4157706093189963, + "grad_norm": 0.44600835447553777, + "learning_rate": 3.2720112020470463e-06, + "loss": 0.5344, + "step": 9100 + }, + { + "epoch": 2.4160361077923804, + "grad_norm": 0.4383011380768872, + "learning_rate": 3.271679130040969e-06, + "loss": 0.5528, + "step": 9101 + }, + { + "epoch": 2.416301606265764, + "grad_norm": 0.4385086349773373, + "learning_rate": 3.271347042985259e-06, + "loss": 0.5764, + "step": 9102 + }, + { + "epoch": 2.4165671047391477, + "grad_norm": 0.4414712358391007, + "learning_rate": 3.2710149408863934e-06, + "loss": 0.5411, + "step": 9103 + }, + { + "epoch": 2.4168326032125313, + "grad_norm": 0.43301901942809695, + "learning_rate": 3.2706828237508493e-06, + "loss": 0.5856, + "step": 9104 + }, + { + "epoch": 2.4170981016859154, + "grad_norm": 0.43166595190549506, + "learning_rate": 3.2703506915851024e-06, + "loss": 0.5793, + "step": 9105 + }, + { + "epoch": 2.417363600159299, + "grad_norm": 0.4476928142887255, + "learning_rate": 3.2700185443956317e-06, + "loss": 0.6021, + "step": 9106 + }, + { + "epoch": 2.4176290986326827, + "grad_norm": 0.44121866991769576, + "learning_rate": 3.2696863821889135e-06, + "loss": 0.582, + "step": 9107 + }, + { + "epoch": 2.417894597106067, + "grad_norm": 0.43731115327928305, + "learning_rate": 3.269354204971427e-06, + "loss": 0.5791, + "step": 9108 + }, + { + "epoch": 2.4181600955794504, + "grad_norm": 0.4259817802135003, + "learning_rate": 3.2690220127496485e-06, + "loss": 0.471, + "step": 9109 + }, + { + "epoch": 2.418425594052834, + "grad_norm": 0.4373280270444597, + "learning_rate": 3.2686898055300587e-06, + "loss": 0.5678, + "step": 9110 + }, + { + "epoch": 2.418691092526218, + "grad_norm": 0.4422824302520187, + "learning_rate": 3.268357583319135e-06, + "loss": 0.5717, + "step": 9111 + }, + { + "epoch": 2.418956590999602, + "grad_norm": 0.44709192473826037, + "learning_rate": 3.2680253461233578e-06, + "loss": 0.5924, + "step": 9112 + }, + { + "epoch": 2.4192220894729854, + "grad_norm": 0.44701150235350845, + "learning_rate": 3.2676930939492058e-06, + "loss": 0.5224, + "step": 9113 + }, + { + "epoch": 2.419487587946369, + "grad_norm": 0.4401849762627822, + "learning_rate": 3.2673608268031594e-06, + "loss": 0.5735, + "step": 9114 + }, + { + "epoch": 2.419753086419753, + "grad_norm": 0.4338187539239229, + "learning_rate": 3.2670285446916973e-06, + "loss": 0.5582, + "step": 9115 + }, + { + "epoch": 2.420018584893137, + "grad_norm": 0.4290222017807178, + "learning_rate": 3.2666962476213004e-06, + "loss": 0.5588, + "step": 9116 + }, + { + "epoch": 2.4202840833665205, + "grad_norm": 0.4338441383791346, + "learning_rate": 3.26636393559845e-06, + "loss": 0.5612, + "step": 9117 + }, + { + "epoch": 2.4205495818399045, + "grad_norm": 0.4452245653747792, + "learning_rate": 3.2660316086296263e-06, + "loss": 0.5682, + "step": 9118 + }, + { + "epoch": 2.420815080313288, + "grad_norm": 0.4337923082097538, + "learning_rate": 3.2656992667213104e-06, + "loss": 0.5374, + "step": 9119 + }, + { + "epoch": 2.421080578786672, + "grad_norm": 0.435562242885739, + "learning_rate": 3.265366909879984e-06, + "loss": 0.5827, + "step": 9120 + }, + { + "epoch": 2.421346077260056, + "grad_norm": 0.42994944474585034, + "learning_rate": 3.265034538112129e-06, + "loss": 0.5853, + "step": 9121 + }, + { + "epoch": 2.4216115757334395, + "grad_norm": 0.43472687325129333, + "learning_rate": 3.2647021514242277e-06, + "loss": 0.5361, + "step": 9122 + }, + { + "epoch": 2.421877074206823, + "grad_norm": 0.4289478879143833, + "learning_rate": 3.2643697498227624e-06, + "loss": 0.5964, + "step": 9123 + }, + { + "epoch": 2.4221425726802073, + "grad_norm": 0.4337055661130339, + "learning_rate": 3.264037333314215e-06, + "loss": 0.5838, + "step": 9124 + }, + { + "epoch": 2.422408071153591, + "grad_norm": 0.42932907827676353, + "learning_rate": 3.2637049019050688e-06, + "loss": 0.5704, + "step": 9125 + }, + { + "epoch": 2.4226735696269746, + "grad_norm": 0.435762154336256, + "learning_rate": 3.2633724556018077e-06, + "loss": 0.5502, + "step": 9126 + }, + { + "epoch": 2.4229390681003586, + "grad_norm": 0.44049891373842254, + "learning_rate": 3.263039994410914e-06, + "loss": 0.5737, + "step": 9127 + }, + { + "epoch": 2.4232045665737423, + "grad_norm": 0.43089527834065355, + "learning_rate": 3.2627075183388725e-06, + "loss": 0.5829, + "step": 9128 + }, + { + "epoch": 2.423470065047126, + "grad_norm": 0.4247600803728179, + "learning_rate": 3.2623750273921665e-06, + "loss": 0.5567, + "step": 9129 + }, + { + "epoch": 2.42373556352051, + "grad_norm": 0.4297256338559161, + "learning_rate": 3.262042521577281e-06, + "loss": 0.5603, + "step": 9130 + }, + { + "epoch": 2.4240010619938936, + "grad_norm": 0.4294093050239711, + "learning_rate": 3.2617100009007013e-06, + "loss": 0.5511, + "step": 9131 + }, + { + "epoch": 2.4242665604672773, + "grad_norm": 0.44297036894908215, + "learning_rate": 3.2613774653689113e-06, + "loss": 0.5344, + "step": 9132 + }, + { + "epoch": 2.424532058940661, + "grad_norm": 0.4279206395712597, + "learning_rate": 3.261044914988396e-06, + "loss": 0.5947, + "step": 9133 + }, + { + "epoch": 2.424797557414045, + "grad_norm": 0.44537927557467294, + "learning_rate": 3.260712349765642e-06, + "loss": 0.5689, + "step": 9134 + }, + { + "epoch": 2.4250630558874287, + "grad_norm": 0.42594071051467025, + "learning_rate": 3.260379769707135e-06, + "loss": 0.5379, + "step": 9135 + }, + { + "epoch": 2.4253285543608123, + "grad_norm": 0.4491248209290099, + "learning_rate": 3.2600471748193595e-06, + "loss": 0.6012, + "step": 9136 + }, + { + "epoch": 2.4255940528341964, + "grad_norm": 0.4431843553721626, + "learning_rate": 3.259714565108805e-06, + "loss": 0.5738, + "step": 9137 + }, + { + "epoch": 2.42585955130758, + "grad_norm": 0.45131846084871735, + "learning_rate": 3.2593819405819538e-06, + "loss": 0.5552, + "step": 9138 + }, + { + "epoch": 2.4261250497809637, + "grad_norm": 0.4511241032382652, + "learning_rate": 3.2590493012452974e-06, + "loss": 0.5733, + "step": 9139 + }, + { + "epoch": 2.4263905482543473, + "grad_norm": 0.441950605196848, + "learning_rate": 3.258716647105321e-06, + "loss": 0.5886, + "step": 9140 + }, + { + "epoch": 2.4266560467277314, + "grad_norm": 0.42335291292428895, + "learning_rate": 3.258383978168512e-06, + "loss": 0.567, + "step": 9141 + }, + { + "epoch": 2.426921545201115, + "grad_norm": 0.4368039505158051, + "learning_rate": 3.2580512944413583e-06, + "loss": 0.5435, + "step": 9142 + }, + { + "epoch": 2.4271870436744987, + "grad_norm": 0.4472270859809179, + "learning_rate": 3.257718595930349e-06, + "loss": 0.5874, + "step": 9143 + }, + { + "epoch": 2.4274525421478828, + "grad_norm": 0.4262819257228624, + "learning_rate": 3.257385882641971e-06, + "loss": 0.5089, + "step": 9144 + }, + { + "epoch": 2.4277180406212664, + "grad_norm": 0.44965705749988305, + "learning_rate": 3.2570531545827144e-06, + "loss": 0.5876, + "step": 9145 + }, + { + "epoch": 2.42798353909465, + "grad_norm": 0.42922730245834806, + "learning_rate": 3.2567204117590683e-06, + "loss": 0.5656, + "step": 9146 + }, + { + "epoch": 2.428249037568034, + "grad_norm": 0.43639259900445376, + "learning_rate": 3.256387654177521e-06, + "loss": 0.5983, + "step": 9147 + }, + { + "epoch": 2.4285145360414178, + "grad_norm": 0.4407913731239791, + "learning_rate": 3.2560548818445615e-06, + "loss": 0.5635, + "step": 9148 + }, + { + "epoch": 2.4287800345148014, + "grad_norm": 0.43014165142758554, + "learning_rate": 3.255722094766681e-06, + "loss": 0.5978, + "step": 9149 + }, + { + "epoch": 2.4290455329881855, + "grad_norm": 0.43011592406325894, + "learning_rate": 3.25538929295037e-06, + "loss": 0.5383, + "step": 9150 + }, + { + "epoch": 2.429311031461569, + "grad_norm": 0.43945246163923596, + "learning_rate": 3.255056476402118e-06, + "loss": 0.5486, + "step": 9151 + }, + { + "epoch": 2.4295765299349528, + "grad_norm": 0.44358605750138386, + "learning_rate": 3.2547236451284166e-06, + "loss": 0.599, + "step": 9152 + }, + { + "epoch": 2.429842028408337, + "grad_norm": 0.44200632162198994, + "learning_rate": 3.2543907991357556e-06, + "loss": 0.5751, + "step": 9153 + }, + { + "epoch": 2.4301075268817205, + "grad_norm": 0.43935672980888524, + "learning_rate": 3.254057938430627e-06, + "loss": 0.5903, + "step": 9154 + }, + { + "epoch": 2.430373025355104, + "grad_norm": 0.43910250531319545, + "learning_rate": 3.2537250630195227e-06, + "loss": 0.539, + "step": 9155 + }, + { + "epoch": 2.430638523828488, + "grad_norm": 0.44984755203911037, + "learning_rate": 3.2533921729089333e-06, + "loss": 0.5509, + "step": 9156 + }, + { + "epoch": 2.430904022301872, + "grad_norm": 0.42386614224297936, + "learning_rate": 3.253059268105353e-06, + "loss": 0.5763, + "step": 9157 + }, + { + "epoch": 2.4311695207752555, + "grad_norm": 0.4308931498701452, + "learning_rate": 3.252726348615272e-06, + "loss": 0.5434, + "step": 9158 + }, + { + "epoch": 2.431435019248639, + "grad_norm": 0.44955341875971067, + "learning_rate": 3.2523934144451853e-06, + "loss": 0.5342, + "step": 9159 + }, + { + "epoch": 2.4317005177220232, + "grad_norm": 0.4340219625958674, + "learning_rate": 3.2520604656015846e-06, + "loss": 0.5911, + "step": 9160 + }, + { + "epoch": 2.431966016195407, + "grad_norm": 0.4344027018014918, + "learning_rate": 3.2517275020909635e-06, + "loss": 0.5725, + "step": 9161 + }, + { + "epoch": 2.4322315146687905, + "grad_norm": 0.42813203679916667, + "learning_rate": 3.251394523919815e-06, + "loss": 0.5605, + "step": 9162 + }, + { + "epoch": 2.4324970131421746, + "grad_norm": 0.42889217798515633, + "learning_rate": 3.2510615310946343e-06, + "loss": 0.5781, + "step": 9163 + }, + { + "epoch": 2.4327625116155582, + "grad_norm": 0.43295123436598354, + "learning_rate": 3.2507285236219143e-06, + "loss": 0.546, + "step": 9164 + }, + { + "epoch": 2.433028010088942, + "grad_norm": 0.4391380261475172, + "learning_rate": 3.2503955015081502e-06, + "loss": 0.5816, + "step": 9165 + }, + { + "epoch": 2.433293508562326, + "grad_norm": 0.42916397306508125, + "learning_rate": 3.2500624647598362e-06, + "loss": 0.5743, + "step": 9166 + }, + { + "epoch": 2.4335590070357096, + "grad_norm": 0.44002645228221976, + "learning_rate": 3.2497294133834678e-06, + "loss": 0.5162, + "step": 9167 + }, + { + "epoch": 2.4338245055090932, + "grad_norm": 0.45737603065659477, + "learning_rate": 3.24939634738554e-06, + "loss": 0.5432, + "step": 9168 + }, + { + "epoch": 2.434090003982477, + "grad_norm": 0.42153568244350414, + "learning_rate": 3.24906326677255e-06, + "loss": 0.559, + "step": 9169 + }, + { + "epoch": 2.434355502455861, + "grad_norm": 0.42851174395653957, + "learning_rate": 3.2487301715509905e-06, + "loss": 0.5219, + "step": 9170 + }, + { + "epoch": 2.4346210009292446, + "grad_norm": 0.4452406195296684, + "learning_rate": 3.2483970617273602e-06, + "loss": 0.5739, + "step": 9171 + }, + { + "epoch": 2.4348864994026282, + "grad_norm": 0.44360428533414653, + "learning_rate": 3.248063937308155e-06, + "loss": 0.541, + "step": 9172 + }, + { + "epoch": 2.4351519978760123, + "grad_norm": 0.4234600100173597, + "learning_rate": 3.247730798299871e-06, + "loss": 0.5351, + "step": 9173 + }, + { + "epoch": 2.435417496349396, + "grad_norm": 0.4277662324604862, + "learning_rate": 3.247397644709006e-06, + "loss": 0.5555, + "step": 9174 + }, + { + "epoch": 2.4356829948227796, + "grad_norm": 0.421594085592504, + "learning_rate": 3.2470644765420567e-06, + "loss": 0.5322, + "step": 9175 + }, + { + "epoch": 2.4359484932961637, + "grad_norm": 0.43948652679330147, + "learning_rate": 3.246731293805521e-06, + "loss": 0.5917, + "step": 9176 + }, + { + "epoch": 2.4362139917695473, + "grad_norm": 0.4204130077193829, + "learning_rate": 3.2463980965058966e-06, + "loss": 0.554, + "step": 9177 + }, + { + "epoch": 2.436479490242931, + "grad_norm": 0.44448535970745134, + "learning_rate": 3.246064884649682e-06, + "loss": 0.5801, + "step": 9178 + }, + { + "epoch": 2.436744988716315, + "grad_norm": 0.4356196617978778, + "learning_rate": 3.245731658243376e-06, + "loss": 0.5756, + "step": 9179 + }, + { + "epoch": 2.4370104871896987, + "grad_norm": 0.42773727166948444, + "learning_rate": 3.2453984172934765e-06, + "loss": 0.5471, + "step": 9180 + }, + { + "epoch": 2.4372759856630823, + "grad_norm": 0.42462849250878887, + "learning_rate": 3.2450651618064827e-06, + "loss": 0.5616, + "step": 9181 + }, + { + "epoch": 2.4375414841364664, + "grad_norm": 0.4339715556695152, + "learning_rate": 3.2447318917888937e-06, + "loss": 0.5252, + "step": 9182 + }, + { + "epoch": 2.43780698260985, + "grad_norm": 0.4307919473090889, + "learning_rate": 3.24439860724721e-06, + "loss": 0.5767, + "step": 9183 + }, + { + "epoch": 2.4380724810832337, + "grad_norm": 0.43769399065216524, + "learning_rate": 3.2440653081879303e-06, + "loss": 0.5815, + "step": 9184 + }, + { + "epoch": 2.438337979556618, + "grad_norm": 0.4302797163901531, + "learning_rate": 3.243731994617555e-06, + "loss": 0.5465, + "step": 9185 + }, + { + "epoch": 2.4386034780300014, + "grad_norm": 0.4281632483599945, + "learning_rate": 3.2433986665425858e-06, + "loss": 0.5909, + "step": 9186 + }, + { + "epoch": 2.438868976503385, + "grad_norm": 0.44648244976525125, + "learning_rate": 3.243065323969522e-06, + "loss": 0.5673, + "step": 9187 + }, + { + "epoch": 2.4391344749767687, + "grad_norm": 0.43677578178066223, + "learning_rate": 3.242731966904865e-06, + "loss": 0.5522, + "step": 9188 + }, + { + "epoch": 2.439399973450153, + "grad_norm": 0.4337940304001195, + "learning_rate": 3.242398595355116e-06, + "loss": 0.5383, + "step": 9189 + }, + { + "epoch": 2.4396654719235364, + "grad_norm": 0.44634964527842813, + "learning_rate": 3.2420652093267774e-06, + "loss": 0.594, + "step": 9190 + }, + { + "epoch": 2.43993097039692, + "grad_norm": 0.4417383135491838, + "learning_rate": 3.2417318088263492e-06, + "loss": 0.5687, + "step": 9191 + }, + { + "epoch": 2.440196468870304, + "grad_norm": 0.4260931325446069, + "learning_rate": 3.241398393860336e-06, + "loss": 0.5581, + "step": 9192 + }, + { + "epoch": 2.440461967343688, + "grad_norm": 0.43122967616835856, + "learning_rate": 3.2410649644352383e-06, + "loss": 0.5496, + "step": 9193 + }, + { + "epoch": 2.4407274658170715, + "grad_norm": 0.42390560119565857, + "learning_rate": 3.2407315205575583e-06, + "loss": 0.5513, + "step": 9194 + }, + { + "epoch": 2.440992964290455, + "grad_norm": 0.45586334387375094, + "learning_rate": 3.2403980622338017e-06, + "loss": 0.5817, + "step": 9195 + }, + { + "epoch": 2.441258462763839, + "grad_norm": 0.43608430469160275, + "learning_rate": 3.2400645894704687e-06, + "loss": 0.5737, + "step": 9196 + }, + { + "epoch": 2.441523961237223, + "grad_norm": 0.43477080666778983, + "learning_rate": 3.2397311022740647e-06, + "loss": 0.5604, + "step": 9197 + }, + { + "epoch": 2.4417894597106065, + "grad_norm": 0.4387187335494614, + "learning_rate": 3.2393976006510935e-06, + "loss": 0.575, + "step": 9198 + }, + { + "epoch": 2.4420549581839905, + "grad_norm": 0.43094303753061264, + "learning_rate": 3.2390640846080585e-06, + "loss": 0.5702, + "step": 9199 + }, + { + "epoch": 2.442320456657374, + "grad_norm": 0.4312671545112207, + "learning_rate": 3.2387305541514636e-06, + "loss": 0.5564, + "step": 9200 + }, + { + "epoch": 2.442585955130758, + "grad_norm": 0.44552056122902023, + "learning_rate": 3.2383970092878153e-06, + "loss": 0.578, + "step": 9201 + }, + { + "epoch": 2.442851453604142, + "grad_norm": 0.4346805979038832, + "learning_rate": 3.2380634500236167e-06, + "loss": 0.5942, + "step": 9202 + }, + { + "epoch": 2.4431169520775255, + "grad_norm": 0.4269817133498969, + "learning_rate": 3.2377298763653735e-06, + "loss": 0.5552, + "step": 9203 + }, + { + "epoch": 2.443382450550909, + "grad_norm": 0.4396154861105833, + "learning_rate": 3.237396288319592e-06, + "loss": 0.5455, + "step": 9204 + }, + { + "epoch": 2.4436479490242933, + "grad_norm": 0.4438804392459179, + "learning_rate": 3.2370626858927773e-06, + "loss": 0.5639, + "step": 9205 + }, + { + "epoch": 2.443913447497677, + "grad_norm": 0.4372374444359718, + "learning_rate": 3.2367290690914357e-06, + "loss": 0.5525, + "step": 9206 + }, + { + "epoch": 2.4441789459710606, + "grad_norm": 0.44649241767222164, + "learning_rate": 3.236395437922073e-06, + "loss": 0.5788, + "step": 9207 + }, + { + "epoch": 2.4444444444444446, + "grad_norm": 0.429693238551958, + "learning_rate": 3.2360617923911964e-06, + "loss": 0.556, + "step": 9208 + }, + { + "epoch": 2.4447099429178283, + "grad_norm": 0.4143491171039682, + "learning_rate": 3.2357281325053138e-06, + "loss": 0.5287, + "step": 9209 + }, + { + "epoch": 2.444975441391212, + "grad_norm": 0.43596035684381934, + "learning_rate": 3.2353944582709294e-06, + "loss": 0.5707, + "step": 9210 + }, + { + "epoch": 2.445240939864596, + "grad_norm": 0.44351936327665104, + "learning_rate": 3.2350607696945535e-06, + "loss": 0.5761, + "step": 9211 + }, + { + "epoch": 2.4455064383379796, + "grad_norm": 0.4452840504611613, + "learning_rate": 3.2347270667826926e-06, + "loss": 0.5695, + "step": 9212 + }, + { + "epoch": 2.4457719368113633, + "grad_norm": 0.4267308160494901, + "learning_rate": 3.2343933495418544e-06, + "loss": 0.5665, + "step": 9213 + }, + { + "epoch": 2.446037435284747, + "grad_norm": 0.42609573495694336, + "learning_rate": 3.234059617978549e-06, + "loss": 0.542, + "step": 9214 + }, + { + "epoch": 2.446302933758131, + "grad_norm": 0.43217625536679605, + "learning_rate": 3.233725872099283e-06, + "loss": 0.5568, + "step": 9215 + }, + { + "epoch": 2.4465684322315147, + "grad_norm": 0.4377406039584305, + "learning_rate": 3.233392111910566e-06, + "loss": 0.546, + "step": 9216 + }, + { + "epoch": 2.4468339307048983, + "grad_norm": 0.43513108966793584, + "learning_rate": 3.2330583374189073e-06, + "loss": 0.5724, + "step": 9217 + }, + { + "epoch": 2.4470994291782824, + "grad_norm": 0.4217823896548075, + "learning_rate": 3.232724548630817e-06, + "loss": 0.5281, + "step": 9218 + }, + { + "epoch": 2.447364927651666, + "grad_norm": 0.42926041257157305, + "learning_rate": 3.232390745552803e-06, + "loss": 0.562, + "step": 9219 + }, + { + "epoch": 2.4476304261250497, + "grad_norm": 0.4319942277111498, + "learning_rate": 3.232056928191376e-06, + "loss": 0.5674, + "step": 9220 + }, + { + "epoch": 2.4478959245984337, + "grad_norm": 0.4295877169921538, + "learning_rate": 3.231723096553047e-06, + "loss": 0.5428, + "step": 9221 + }, + { + "epoch": 2.4481614230718174, + "grad_norm": 0.4440776621133491, + "learning_rate": 3.2313892506443264e-06, + "loss": 0.5902, + "step": 9222 + }, + { + "epoch": 2.448426921545201, + "grad_norm": 0.45221098831232626, + "learning_rate": 3.231055390471724e-06, + "loss": 0.5733, + "step": 9223 + }, + { + "epoch": 2.4486924200185847, + "grad_norm": 0.435474292924806, + "learning_rate": 3.230721516041752e-06, + "loss": 0.6027, + "step": 9224 + }, + { + "epoch": 2.4489579184919688, + "grad_norm": 0.4184631013785383, + "learning_rate": 3.2303876273609213e-06, + "loss": 0.5599, + "step": 9225 + }, + { + "epoch": 2.4492234169653524, + "grad_norm": 0.44857414886613467, + "learning_rate": 3.2300537244357435e-06, + "loss": 0.5273, + "step": 9226 + }, + { + "epoch": 2.449488915438736, + "grad_norm": 0.4333410469894492, + "learning_rate": 3.2297198072727308e-06, + "loss": 0.5771, + "step": 9227 + }, + { + "epoch": 2.44975441391212, + "grad_norm": 0.4375320261939352, + "learning_rate": 3.229385875878395e-06, + "loss": 0.6016, + "step": 9228 + }, + { + "epoch": 2.4500199123855038, + "grad_norm": 0.4384651629033774, + "learning_rate": 3.229051930259248e-06, + "loss": 0.5232, + "step": 9229 + }, + { + "epoch": 2.4502854108588874, + "grad_norm": 0.43918455817330104, + "learning_rate": 3.228717970421804e-06, + "loss": 0.5786, + "step": 9230 + }, + { + "epoch": 2.4505509093322715, + "grad_norm": 0.4273802015053016, + "learning_rate": 3.228383996372576e-06, + "loss": 0.5243, + "step": 9231 + }, + { + "epoch": 2.450816407805655, + "grad_norm": 0.44214970182529034, + "learning_rate": 3.2280500081180764e-06, + "loss": 0.5905, + "step": 9232 + }, + { + "epoch": 2.4510819062790388, + "grad_norm": 0.4379022413759155, + "learning_rate": 3.2277160056648183e-06, + "loss": 0.5646, + "step": 9233 + }, + { + "epoch": 2.451347404752423, + "grad_norm": 0.44213035769546577, + "learning_rate": 3.227381989019317e-06, + "loss": 0.5456, + "step": 9234 + }, + { + "epoch": 2.4516129032258065, + "grad_norm": 0.44355785006915593, + "learning_rate": 3.227047958188086e-06, + "loss": 0.5897, + "step": 9235 + }, + { + "epoch": 2.45187840169919, + "grad_norm": 0.4399791434706225, + "learning_rate": 3.2267139131776392e-06, + "loss": 0.5868, + "step": 9236 + }, + { + "epoch": 2.452143900172574, + "grad_norm": 0.4379368177073412, + "learning_rate": 3.2263798539944923e-06, + "loss": 0.5834, + "step": 9237 + }, + { + "epoch": 2.452409398645958, + "grad_norm": 0.41958616275063965, + "learning_rate": 3.2260457806451593e-06, + "loss": 0.5344, + "step": 9238 + }, + { + "epoch": 2.4526748971193415, + "grad_norm": 0.4400284390552846, + "learning_rate": 3.225711693136156e-06, + "loss": 0.5922, + "step": 9239 + }, + { + "epoch": 2.4529403955927256, + "grad_norm": 0.43673061488650344, + "learning_rate": 3.225377591473998e-06, + "loss": 0.5647, + "step": 9240 + }, + { + "epoch": 2.4532058940661092, + "grad_norm": 0.4216282446509674, + "learning_rate": 3.225043475665201e-06, + "loss": 0.566, + "step": 9241 + }, + { + "epoch": 2.453471392539493, + "grad_norm": 0.4337536443596227, + "learning_rate": 3.224709345716281e-06, + "loss": 0.5634, + "step": 9242 + }, + { + "epoch": 2.4537368910128765, + "grad_norm": 0.433214091004987, + "learning_rate": 3.2243752016337544e-06, + "loss": 0.6068, + "step": 9243 + }, + { + "epoch": 2.4540023894862606, + "grad_norm": 0.44678129969466157, + "learning_rate": 3.2240410434241376e-06, + "loss": 0.5503, + "step": 9244 + }, + { + "epoch": 2.4542678879596442, + "grad_norm": 0.43895188265329577, + "learning_rate": 3.223706871093948e-06, + "loss": 0.6122, + "step": 9245 + }, + { + "epoch": 2.454533386433028, + "grad_norm": 0.43941213832308157, + "learning_rate": 3.223372684649702e-06, + "loss": 0.5884, + "step": 9246 + }, + { + "epoch": 2.454798884906412, + "grad_norm": 0.432795591779766, + "learning_rate": 3.2230384840979184e-06, + "loss": 0.5592, + "step": 9247 + }, + { + "epoch": 2.4550643833797956, + "grad_norm": 0.4346904827910939, + "learning_rate": 3.2227042694451135e-06, + "loss": 0.5392, + "step": 9248 + }, + { + "epoch": 2.4553298818531792, + "grad_norm": 0.4301518330707813, + "learning_rate": 3.2223700406978052e-06, + "loss": 0.5844, + "step": 9249 + }, + { + "epoch": 2.455595380326563, + "grad_norm": 0.43143760152867194, + "learning_rate": 3.222035797862513e-06, + "loss": 0.5172, + "step": 9250 + }, + { + "epoch": 2.455860878799947, + "grad_norm": 0.4275449969586369, + "learning_rate": 3.2217015409457547e-06, + "loss": 0.5437, + "step": 9251 + }, + { + "epoch": 2.4561263772733306, + "grad_norm": 0.4361445645580946, + "learning_rate": 3.22136726995405e-06, + "loss": 0.5353, + "step": 9252 + }, + { + "epoch": 2.4563918757467142, + "grad_norm": 0.43985169949558883, + "learning_rate": 3.221032984893917e-06, + "loss": 0.5873, + "step": 9253 + }, + { + "epoch": 2.4566573742200983, + "grad_norm": 0.43839727355294944, + "learning_rate": 3.2206986857718754e-06, + "loss": 0.5801, + "step": 9254 + }, + { + "epoch": 2.456922872693482, + "grad_norm": 0.4298509354403553, + "learning_rate": 3.220364372594445e-06, + "loss": 0.577, + "step": 9255 + }, + { + "epoch": 2.4571883711668656, + "grad_norm": 0.4508958566467277, + "learning_rate": 3.2200300453681448e-06, + "loss": 0.5888, + "step": 9256 + }, + { + "epoch": 2.4574538696402497, + "grad_norm": 0.4401538693756923, + "learning_rate": 3.2196957040994962e-06, + "loss": 0.5808, + "step": 9257 + }, + { + "epoch": 2.4577193681136333, + "grad_norm": 0.4462528574714819, + "learning_rate": 3.219361348795019e-06, + "loss": 0.5264, + "step": 9258 + }, + { + "epoch": 2.457984866587017, + "grad_norm": 0.4311772317729529, + "learning_rate": 3.219026979461235e-06, + "loss": 0.538, + "step": 9259 + }, + { + "epoch": 2.458250365060401, + "grad_norm": 0.43117471150653824, + "learning_rate": 3.2186925961046632e-06, + "loss": 0.5352, + "step": 9260 + }, + { + "epoch": 2.4585158635337847, + "grad_norm": 0.447568904126334, + "learning_rate": 3.218358198731828e-06, + "loss": 0.5774, + "step": 9261 + }, + { + "epoch": 2.4587813620071683, + "grad_norm": 0.4380905548917899, + "learning_rate": 3.2180237873492477e-06, + "loss": 0.5632, + "step": 9262 + }, + { + "epoch": 2.4590468604805524, + "grad_norm": 0.43910650805026535, + "learning_rate": 3.2176893619634453e-06, + "loss": 0.5825, + "step": 9263 + }, + { + "epoch": 2.459312358953936, + "grad_norm": 0.43907316138183544, + "learning_rate": 3.2173549225809444e-06, + "loss": 0.5557, + "step": 9264 + }, + { + "epoch": 2.4595778574273197, + "grad_norm": 0.44960943712798707, + "learning_rate": 3.217020469208265e-06, + "loss": 0.5898, + "step": 9265 + }, + { + "epoch": 2.459843355900704, + "grad_norm": 0.4260107522996064, + "learning_rate": 3.216686001851931e-06, + "loss": 0.5554, + "step": 9266 + }, + { + "epoch": 2.4601088543740874, + "grad_norm": 0.4316678549676487, + "learning_rate": 3.216351520518465e-06, + "loss": 0.5832, + "step": 9267 + }, + { + "epoch": 2.460374352847471, + "grad_norm": 0.4345900595714598, + "learning_rate": 3.2160170252143913e-06, + "loss": 0.5495, + "step": 9268 + }, + { + "epoch": 2.4606398513208547, + "grad_norm": 0.44487367474504164, + "learning_rate": 3.2156825159462317e-06, + "loss": 0.5696, + "step": 9269 + }, + { + "epoch": 2.460905349794239, + "grad_norm": 0.44008919471229413, + "learning_rate": 3.2153479927205116e-06, + "loss": 0.5757, + "step": 9270 + }, + { + "epoch": 2.4611708482676224, + "grad_norm": 0.44487969973180636, + "learning_rate": 3.2150134555437536e-06, + "loss": 0.6114, + "step": 9271 + }, + { + "epoch": 2.461436346741006, + "grad_norm": 0.4456398133516107, + "learning_rate": 3.2146789044224823e-06, + "loss": 0.5522, + "step": 9272 + }, + { + "epoch": 2.46170184521439, + "grad_norm": 0.43313708646730037, + "learning_rate": 3.2143443393632233e-06, + "loss": 0.5587, + "step": 9273 + }, + { + "epoch": 2.461967343687774, + "grad_norm": 0.435056742891196, + "learning_rate": 3.2140097603725e-06, + "loss": 0.5755, + "step": 9274 + }, + { + "epoch": 2.4622328421611575, + "grad_norm": 0.4344219947204466, + "learning_rate": 3.2136751674568384e-06, + "loss": 0.5464, + "step": 9275 + }, + { + "epoch": 2.4624983406345415, + "grad_norm": 0.43564920120243994, + "learning_rate": 3.2133405606227636e-06, + "loss": 0.5337, + "step": 9276 + }, + { + "epoch": 2.462763839107925, + "grad_norm": 0.4148265194262022, + "learning_rate": 3.213005939876801e-06, + "loss": 0.5394, + "step": 9277 + }, + { + "epoch": 2.463029337581309, + "grad_norm": 0.4336315820065418, + "learning_rate": 3.212671305225478e-06, + "loss": 0.5534, + "step": 9278 + }, + { + "epoch": 2.4632948360546925, + "grad_norm": 0.44180895337030884, + "learning_rate": 3.2123366566753183e-06, + "loss": 0.5701, + "step": 9279 + }, + { + "epoch": 2.4635603345280765, + "grad_norm": 0.44135897544400393, + "learning_rate": 3.21200199423285e-06, + "loss": 0.5909, + "step": 9280 + }, + { + "epoch": 2.46382583300146, + "grad_norm": 0.4542709845579795, + "learning_rate": 3.2116673179046003e-06, + "loss": 0.5765, + "step": 9281 + }, + { + "epoch": 2.464091331474844, + "grad_norm": 0.4334304278213419, + "learning_rate": 3.2113326276970945e-06, + "loss": 0.5513, + "step": 9282 + }, + { + "epoch": 2.464356829948228, + "grad_norm": 0.46611268758444935, + "learning_rate": 3.2109979236168615e-06, + "loss": 0.6013, + "step": 9283 + }, + { + "epoch": 2.4646223284216116, + "grad_norm": 0.4330494564015539, + "learning_rate": 3.2106632056704277e-06, + "loss": 0.5481, + "step": 9284 + }, + { + "epoch": 2.464887826894995, + "grad_norm": 0.4314359068765981, + "learning_rate": 3.2103284738643216e-06, + "loss": 0.5728, + "step": 9285 + }, + { + "epoch": 2.4651533253683793, + "grad_norm": 0.437056312784691, + "learning_rate": 3.209993728205071e-06, + "loss": 0.5731, + "step": 9286 + }, + { + "epoch": 2.465418823841763, + "grad_norm": 0.4383553537706255, + "learning_rate": 3.2096589686992043e-06, + "loss": 0.5415, + "step": 9287 + }, + { + "epoch": 2.4656843223151466, + "grad_norm": 0.44151316454398964, + "learning_rate": 3.20932419535325e-06, + "loss": 0.5676, + "step": 9288 + }, + { + "epoch": 2.4659498207885306, + "grad_norm": 0.4336326588505913, + "learning_rate": 3.2089894081737373e-06, + "loss": 0.555, + "step": 9289 + }, + { + "epoch": 2.4662153192619143, + "grad_norm": 0.4231797704858733, + "learning_rate": 3.208654607167196e-06, + "loss": 0.5226, + "step": 9290 + }, + { + "epoch": 2.466480817735298, + "grad_norm": 0.4348513266093203, + "learning_rate": 3.2083197923401537e-06, + "loss": 0.5743, + "step": 9291 + }, + { + "epoch": 2.466746316208682, + "grad_norm": 0.4224671118271553, + "learning_rate": 3.2079849636991416e-06, + "loss": 0.5578, + "step": 9292 + }, + { + "epoch": 2.4670118146820657, + "grad_norm": 0.43936651309718844, + "learning_rate": 3.2076501212506896e-06, + "loss": 0.5783, + "step": 9293 + }, + { + "epoch": 2.4672773131554493, + "grad_norm": 0.45205520071898586, + "learning_rate": 3.2073152650013273e-06, + "loss": 0.5662, + "step": 9294 + }, + { + "epoch": 2.4675428116288334, + "grad_norm": 0.43418390292864895, + "learning_rate": 3.2069803949575845e-06, + "loss": 0.5409, + "step": 9295 + }, + { + "epoch": 2.467808310102217, + "grad_norm": 0.44506073932633294, + "learning_rate": 3.206645511125995e-06, + "loss": 0.5786, + "step": 9296 + }, + { + "epoch": 2.4680738085756007, + "grad_norm": 0.44820003310072165, + "learning_rate": 3.206310613513086e-06, + "loss": 0.5571, + "step": 9297 + }, + { + "epoch": 2.4683393070489843, + "grad_norm": 0.4663835187323188, + "learning_rate": 3.205975702125392e-06, + "loss": 0.5456, + "step": 9298 + }, + { + "epoch": 2.4686048055223684, + "grad_norm": 0.4306489245787134, + "learning_rate": 3.205640776969443e-06, + "loss": 0.557, + "step": 9299 + }, + { + "epoch": 2.468870303995752, + "grad_norm": 0.4231525121319539, + "learning_rate": 3.205305838051771e-06, + "loss": 0.5628, + "step": 9300 + }, + { + "epoch": 2.4691358024691357, + "grad_norm": 0.4418397927448869, + "learning_rate": 3.2049708853789087e-06, + "loss": 0.563, + "step": 9301 + }, + { + "epoch": 2.4694013009425198, + "grad_norm": 0.4368738420288982, + "learning_rate": 3.204635918957387e-06, + "loss": 0.5817, + "step": 9302 + }, + { + "epoch": 2.4696667994159034, + "grad_norm": 0.43947537258508945, + "learning_rate": 3.204300938793741e-06, + "loss": 0.586, + "step": 9303 + }, + { + "epoch": 2.469932297889287, + "grad_norm": 0.4315511554711722, + "learning_rate": 3.203965944894501e-06, + "loss": 0.516, + "step": 9304 + }, + { + "epoch": 2.4701977963626707, + "grad_norm": 0.44174830309953, + "learning_rate": 3.2036309372662017e-06, + "loss": 0.5764, + "step": 9305 + }, + { + "epoch": 2.4704632948360548, + "grad_norm": 0.42774219667033225, + "learning_rate": 3.2032959159153766e-06, + "loss": 0.563, + "step": 9306 + }, + { + "epoch": 2.4707287933094384, + "grad_norm": 0.43309016923940263, + "learning_rate": 3.2029608808485598e-06, + "loss": 0.5623, + "step": 9307 + }, + { + "epoch": 2.470994291782822, + "grad_norm": 0.44753072963382295, + "learning_rate": 3.2026258320722834e-06, + "loss": 0.5504, + "step": 9308 + }, + { + "epoch": 2.471259790256206, + "grad_norm": 0.4396184446589746, + "learning_rate": 3.2022907695930837e-06, + "loss": 0.5291, + "step": 9309 + }, + { + "epoch": 2.4715252887295898, + "grad_norm": 0.4304845146467045, + "learning_rate": 3.201955693417495e-06, + "loss": 0.5699, + "step": 9310 + }, + { + "epoch": 2.4717907872029734, + "grad_norm": 0.4508366669555417, + "learning_rate": 3.201620603552051e-06, + "loss": 0.5956, + "step": 9311 + }, + { + "epoch": 2.4720562856763575, + "grad_norm": 0.45239558825462195, + "learning_rate": 3.201285500003287e-06, + "loss": 0.5466, + "step": 9312 + }, + { + "epoch": 2.472321784149741, + "grad_norm": 0.4272271088422055, + "learning_rate": 3.200950382777739e-06, + "loss": 0.5467, + "step": 9313 + }, + { + "epoch": 2.4725872826231248, + "grad_norm": 0.4323833422230112, + "learning_rate": 3.2006152518819424e-06, + "loss": 0.5789, + "step": 9314 + }, + { + "epoch": 2.472852781096509, + "grad_norm": 0.43092476199328034, + "learning_rate": 3.200280107322433e-06, + "loss": 0.5515, + "step": 9315 + }, + { + "epoch": 2.4731182795698925, + "grad_norm": 0.45519324505825043, + "learning_rate": 3.1999449491057474e-06, + "loss": 0.564, + "step": 9316 + }, + { + "epoch": 2.473383778043276, + "grad_norm": 0.439487421697383, + "learning_rate": 3.1996097772384203e-06, + "loss": 0.5447, + "step": 9317 + }, + { + "epoch": 2.4736492765166602, + "grad_norm": 0.45040987974501273, + "learning_rate": 3.199274591726991e-06, + "loss": 0.587, + "step": 9318 + }, + { + "epoch": 2.473914774990044, + "grad_norm": 0.42470609183369723, + "learning_rate": 3.1989393925779943e-06, + "loss": 0.5461, + "step": 9319 + }, + { + "epoch": 2.4741802734634275, + "grad_norm": 0.44562647077999756, + "learning_rate": 3.1986041797979683e-06, + "loss": 0.5844, + "step": 9320 + }, + { + "epoch": 2.4744457719368116, + "grad_norm": 0.4532510946910471, + "learning_rate": 3.1982689533934496e-06, + "loss": 0.5506, + "step": 9321 + }, + { + "epoch": 2.4747112704101952, + "grad_norm": 0.44449700841515694, + "learning_rate": 3.1979337133709777e-06, + "loss": 0.6077, + "step": 9322 + }, + { + "epoch": 2.474976768883579, + "grad_norm": 0.44830119567768045, + "learning_rate": 3.197598459737089e-06, + "loss": 0.5515, + "step": 9323 + }, + { + "epoch": 2.4752422673569625, + "grad_norm": 0.42880806501016094, + "learning_rate": 3.1972631924983228e-06, + "loss": 0.5482, + "step": 9324 + }, + { + "epoch": 2.4755077658303466, + "grad_norm": 0.4357218750000735, + "learning_rate": 3.1969279116612163e-06, + "loss": 0.5407, + "step": 9325 + }, + { + "epoch": 2.4757732643037302, + "grad_norm": 0.43746853278861136, + "learning_rate": 3.1965926172323096e-06, + "loss": 0.5742, + "step": 9326 + }, + { + "epoch": 2.476038762777114, + "grad_norm": 0.43884137117245586, + "learning_rate": 3.196257309218142e-06, + "loss": 0.5813, + "step": 9327 + }, + { + "epoch": 2.476304261250498, + "grad_norm": 0.4501126219589327, + "learning_rate": 3.1959219876252513e-06, + "loss": 0.5757, + "step": 9328 + }, + { + "epoch": 2.4765697597238816, + "grad_norm": 0.44691863809592286, + "learning_rate": 3.195586652460178e-06, + "loss": 0.574, + "step": 9329 + }, + { + "epoch": 2.4768352581972652, + "grad_norm": 0.43685441452318735, + "learning_rate": 3.1952513037294624e-06, + "loss": 0.5855, + "step": 9330 + }, + { + "epoch": 2.4771007566706493, + "grad_norm": 0.43112116327421307, + "learning_rate": 3.194915941439643e-06, + "loss": 0.5424, + "step": 9331 + }, + { + "epoch": 2.477366255144033, + "grad_norm": 0.4361272198938356, + "learning_rate": 3.1945805655972624e-06, + "loss": 0.533, + "step": 9332 + }, + { + "epoch": 2.4776317536174166, + "grad_norm": 0.4345226729695114, + "learning_rate": 3.19424517620886e-06, + "loss": 0.5945, + "step": 9333 + }, + { + "epoch": 2.4778972520908003, + "grad_norm": 0.43966239949898617, + "learning_rate": 3.1939097732809763e-06, + "loss": 0.5375, + "step": 9334 + }, + { + "epoch": 2.4781627505641843, + "grad_norm": 0.4603663491252038, + "learning_rate": 3.193574356820153e-06, + "loss": 0.568, + "step": 9335 + }, + { + "epoch": 2.478428249037568, + "grad_norm": 0.45718711627010944, + "learning_rate": 3.193238926832933e-06, + "loss": 0.5819, + "step": 9336 + }, + { + "epoch": 2.4786937475109516, + "grad_norm": 0.4420346798540461, + "learning_rate": 3.1929034833258553e-06, + "loss": 0.5855, + "step": 9337 + }, + { + "epoch": 2.4789592459843357, + "grad_norm": 0.4708479232021837, + "learning_rate": 3.192568026305463e-06, + "loss": 0.5286, + "step": 9338 + }, + { + "epoch": 2.4792247444577193, + "grad_norm": 0.44519458042877974, + "learning_rate": 3.1922325557782998e-06, + "loss": 0.5792, + "step": 9339 + }, + { + "epoch": 2.479490242931103, + "grad_norm": 0.4757209214463942, + "learning_rate": 3.1918970717509057e-06, + "loss": 0.5602, + "step": 9340 + }, + { + "epoch": 2.479755741404487, + "grad_norm": 0.4462675605428505, + "learning_rate": 3.191561574229824e-06, + "loss": 0.5759, + "step": 9341 + }, + { + "epoch": 2.4800212398778707, + "grad_norm": 0.441487599107144, + "learning_rate": 3.1912260632215997e-06, + "loss": 0.5455, + "step": 9342 + }, + { + "epoch": 2.4802867383512543, + "grad_norm": 0.43613430422436594, + "learning_rate": 3.190890538732774e-06, + "loss": 0.5576, + "step": 9343 + }, + { + "epoch": 2.4805522368246384, + "grad_norm": 0.44867907732059414, + "learning_rate": 3.1905550007698914e-06, + "loss": 0.536, + "step": 9344 + }, + { + "epoch": 2.480817735298022, + "grad_norm": 0.43336200421984833, + "learning_rate": 3.190219449339496e-06, + "loss": 0.5843, + "step": 9345 + }, + { + "epoch": 2.4810832337714057, + "grad_norm": 0.43731371335480906, + "learning_rate": 3.189883884448131e-06, + "loss": 0.5264, + "step": 9346 + }, + { + "epoch": 2.48134873224479, + "grad_norm": 0.4270844799774336, + "learning_rate": 3.1895483061023413e-06, + "loss": 0.5391, + "step": 9347 + }, + { + "epoch": 2.4816142307181734, + "grad_norm": 0.43606513188398427, + "learning_rate": 3.1892127143086716e-06, + "loss": 0.5227, + "step": 9348 + }, + { + "epoch": 2.481879729191557, + "grad_norm": 0.42749928299191725, + "learning_rate": 3.188877109073666e-06, + "loss": 0.5679, + "step": 9349 + }, + { + "epoch": 2.482145227664941, + "grad_norm": 0.43431540937476915, + "learning_rate": 3.1885414904038704e-06, + "loss": 0.5344, + "step": 9350 + }, + { + "epoch": 2.482410726138325, + "grad_norm": 0.436061249834663, + "learning_rate": 3.1882058583058294e-06, + "loss": 0.5618, + "step": 9351 + }, + { + "epoch": 2.4826762246117084, + "grad_norm": 0.4377943226932406, + "learning_rate": 3.1878702127860893e-06, + "loss": 0.5795, + "step": 9352 + }, + { + "epoch": 2.482941723085092, + "grad_norm": 0.44184378059784135, + "learning_rate": 3.187534553851196e-06, + "loss": 0.5812, + "step": 9353 + }, + { + "epoch": 2.483207221558476, + "grad_norm": 0.43368017594018454, + "learning_rate": 3.1871988815076953e-06, + "loss": 0.5449, + "step": 9354 + }, + { + "epoch": 2.48347272003186, + "grad_norm": 0.4238069815154126, + "learning_rate": 3.1868631957621343e-06, + "loss": 0.5436, + "step": 9355 + }, + { + "epoch": 2.4837382185052435, + "grad_norm": 0.4260234172067541, + "learning_rate": 3.186527496621059e-06, + "loss": 0.5551, + "step": 9356 + }, + { + "epoch": 2.4840037169786275, + "grad_norm": 0.43921437028593535, + "learning_rate": 3.1861917840910166e-06, + "loss": 0.5564, + "step": 9357 + }, + { + "epoch": 2.484269215452011, + "grad_norm": 0.4394505298820809, + "learning_rate": 3.185856058178555e-06, + "loss": 0.5786, + "step": 9358 + }, + { + "epoch": 2.484534713925395, + "grad_norm": 0.4436985424507277, + "learning_rate": 3.1855203188902194e-06, + "loss": 0.5721, + "step": 9359 + }, + { + "epoch": 2.4848002123987785, + "grad_norm": 0.42585355772389516, + "learning_rate": 3.1851845662325602e-06, + "loss": 0.554, + "step": 9360 + }, + { + "epoch": 2.4850657108721625, + "grad_norm": 0.4383754774042248, + "learning_rate": 3.184848800212124e-06, + "loss": 0.5738, + "step": 9361 + }, + { + "epoch": 2.485331209345546, + "grad_norm": 0.43766013125967823, + "learning_rate": 3.1845130208354603e-06, + "loss": 0.5808, + "step": 9362 + }, + { + "epoch": 2.48559670781893, + "grad_norm": 0.43077167528659666, + "learning_rate": 3.184177228109116e-06, + "loss": 0.5685, + "step": 9363 + }, + { + "epoch": 2.485862206292314, + "grad_norm": 0.4407213907402458, + "learning_rate": 3.1838414220396407e-06, + "loss": 0.5962, + "step": 9364 + }, + { + "epoch": 2.4861277047656976, + "grad_norm": 0.4361684779228887, + "learning_rate": 3.183505602633583e-06, + "loss": 0.5379, + "step": 9365 + }, + { + "epoch": 2.486393203239081, + "grad_norm": 0.4307045462344444, + "learning_rate": 3.183169769897493e-06, + "loss": 0.5547, + "step": 9366 + }, + { + "epoch": 2.4866587017124653, + "grad_norm": 0.4389193712930424, + "learning_rate": 3.1828339238379195e-06, + "loss": 0.5629, + "step": 9367 + }, + { + "epoch": 2.486924200185849, + "grad_norm": 0.43812322007297244, + "learning_rate": 3.182498064461413e-06, + "loss": 0.5607, + "step": 9368 + }, + { + "epoch": 2.4871896986592326, + "grad_norm": 0.43177351334385666, + "learning_rate": 3.1821621917745227e-06, + "loss": 0.5378, + "step": 9369 + }, + { + "epoch": 2.4874551971326166, + "grad_norm": 0.42901807443409923, + "learning_rate": 3.1818263057837996e-06, + "loss": 0.5625, + "step": 9370 + }, + { + "epoch": 2.4877206956060003, + "grad_norm": 0.4212485000515425, + "learning_rate": 3.1814904064957943e-06, + "loss": 0.5417, + "step": 9371 + }, + { + "epoch": 2.487986194079384, + "grad_norm": 0.43914906144383814, + "learning_rate": 3.1811544939170573e-06, + "loss": 0.5685, + "step": 9372 + }, + { + "epoch": 2.488251692552768, + "grad_norm": 0.42594664621438766, + "learning_rate": 3.1808185680541404e-06, + "loss": 0.5591, + "step": 9373 + }, + { + "epoch": 2.4885171910261517, + "grad_norm": 0.4283460894058382, + "learning_rate": 3.1804826289135944e-06, + "loss": 0.5584, + "step": 9374 + }, + { + "epoch": 2.4887826894995353, + "grad_norm": 0.43088434320017543, + "learning_rate": 3.1801466765019705e-06, + "loss": 0.5579, + "step": 9375 + }, + { + "epoch": 2.4890481879729194, + "grad_norm": 0.4370405283436262, + "learning_rate": 3.1798107108258213e-06, + "loss": 0.5621, + "step": 9376 + }, + { + "epoch": 2.489313686446303, + "grad_norm": 0.43069731679526424, + "learning_rate": 3.1794747318916984e-06, + "loss": 0.5819, + "step": 9377 + }, + { + "epoch": 2.4895791849196867, + "grad_norm": 0.4470534406296529, + "learning_rate": 3.179138739706155e-06, + "loss": 0.5643, + "step": 9378 + }, + { + "epoch": 2.4898446833930703, + "grad_norm": 0.43352758497340826, + "learning_rate": 3.178802734275744e-06, + "loss": 0.6265, + "step": 9379 + }, + { + "epoch": 2.4901101818664544, + "grad_norm": 0.4417331957014581, + "learning_rate": 3.178466715607017e-06, + "loss": 0.5698, + "step": 9380 + }, + { + "epoch": 2.490375680339838, + "grad_norm": 0.4202582888579948, + "learning_rate": 3.1781306837065273e-06, + "loss": 0.5409, + "step": 9381 + }, + { + "epoch": 2.4906411788132217, + "grad_norm": 0.43936683233036167, + "learning_rate": 3.17779463858083e-06, + "loss": 0.5754, + "step": 9382 + }, + { + "epoch": 2.4909066772866058, + "grad_norm": 0.41849693921954273, + "learning_rate": 3.177458580236478e-06, + "loss": 0.5294, + "step": 9383 + }, + { + "epoch": 2.4911721757599894, + "grad_norm": 0.43464855887181675, + "learning_rate": 3.1771225086800234e-06, + "loss": 0.5594, + "step": 9384 + }, + { + "epoch": 2.491437674233373, + "grad_norm": 0.4292485679549083, + "learning_rate": 3.1767864239180234e-06, + "loss": 0.5773, + "step": 9385 + }, + { + "epoch": 2.491703172706757, + "grad_norm": 0.4260345270197514, + "learning_rate": 3.1764503259570305e-06, + "loss": 0.5479, + "step": 9386 + }, + { + "epoch": 2.4919686711801408, + "grad_norm": 0.43207095039306936, + "learning_rate": 3.1761142148035993e-06, + "loss": 0.5641, + "step": 9387 + }, + { + "epoch": 2.4922341696535244, + "grad_norm": 0.44440231512274675, + "learning_rate": 3.1757780904642864e-06, + "loss": 0.5622, + "step": 9388 + }, + { + "epoch": 2.492499668126908, + "grad_norm": 0.444438178675176, + "learning_rate": 3.175441952945646e-06, + "loss": 0.5544, + "step": 9389 + }, + { + "epoch": 2.492765166600292, + "grad_norm": 0.44591030742237003, + "learning_rate": 3.1751058022542335e-06, + "loss": 0.5542, + "step": 9390 + }, + { + "epoch": 2.4930306650736758, + "grad_norm": 0.4405575199835961, + "learning_rate": 3.174769638396606e-06, + "loss": 0.5532, + "step": 9391 + }, + { + "epoch": 2.4932961635470594, + "grad_norm": 0.4437465011144683, + "learning_rate": 3.1744334613793172e-06, + "loss": 0.5768, + "step": 9392 + }, + { + "epoch": 2.4935616620204435, + "grad_norm": 0.44587487694160954, + "learning_rate": 3.1740972712089247e-06, + "loss": 0.5927, + "step": 9393 + }, + { + "epoch": 2.493827160493827, + "grad_norm": 0.43148217595035476, + "learning_rate": 3.173761067891986e-06, + "loss": 0.5503, + "step": 9394 + }, + { + "epoch": 2.4940926589672108, + "grad_norm": 0.4393587490987073, + "learning_rate": 3.1734248514350556e-06, + "loss": 0.5459, + "step": 9395 + }, + { + "epoch": 2.494358157440595, + "grad_norm": 0.4347850772836811, + "learning_rate": 3.173088621844692e-06, + "loss": 0.5207, + "step": 9396 + }, + { + "epoch": 2.4946236559139785, + "grad_norm": 0.4178166944832567, + "learning_rate": 3.1727523791274527e-06, + "loss": 0.5478, + "step": 9397 + }, + { + "epoch": 2.494889154387362, + "grad_norm": 0.42730049645611723, + "learning_rate": 3.172416123289894e-06, + "loss": 0.5533, + "step": 9398 + }, + { + "epoch": 2.4951546528607462, + "grad_norm": 0.44476275157662243, + "learning_rate": 3.172079854338576e-06, + "loss": 0.5502, + "step": 9399 + }, + { + "epoch": 2.49542015133413, + "grad_norm": 0.4442515436093456, + "learning_rate": 3.1717435722800542e-06, + "loss": 0.5686, + "step": 9400 + }, + { + "epoch": 2.4956856498075135, + "grad_norm": 0.43988423286033923, + "learning_rate": 3.171407277120888e-06, + "loss": 0.5627, + "step": 9401 + }, + { + "epoch": 2.4959511482808976, + "grad_norm": 0.4437273711155727, + "learning_rate": 3.171070968867637e-06, + "loss": 0.5807, + "step": 9402 + }, + { + "epoch": 2.4962166467542812, + "grad_norm": 0.4418132440806291, + "learning_rate": 3.1707346475268587e-06, + "loss": 0.5699, + "step": 9403 + }, + { + "epoch": 2.496482145227665, + "grad_norm": 0.4244327837451305, + "learning_rate": 3.170398313105111e-06, + "loss": 0.5431, + "step": 9404 + }, + { + "epoch": 2.496747643701049, + "grad_norm": 0.43547062279590987, + "learning_rate": 3.1700619656089565e-06, + "loss": 0.5819, + "step": 9405 + }, + { + "epoch": 2.4970131421744326, + "grad_norm": 0.43619303395179776, + "learning_rate": 3.169725605044952e-06, + "loss": 0.5644, + "step": 9406 + }, + { + "epoch": 2.4972786406478162, + "grad_norm": 0.43781081050230675, + "learning_rate": 3.1693892314196588e-06, + "loss": 0.5543, + "step": 9407 + }, + { + "epoch": 2.4975441391212, + "grad_norm": 0.43690069654365465, + "learning_rate": 3.1690528447396375e-06, + "loss": 0.5752, + "step": 9408 + }, + { + "epoch": 2.497809637594584, + "grad_norm": 0.42997172892839525, + "learning_rate": 3.1687164450114465e-06, + "loss": 0.5699, + "step": 9409 + }, + { + "epoch": 2.4980751360679676, + "grad_norm": 0.4521442698318833, + "learning_rate": 3.168380032241648e-06, + "loss": 0.5338, + "step": 9410 + }, + { + "epoch": 2.4983406345413512, + "grad_norm": 0.43680720736348605, + "learning_rate": 3.1680436064368026e-06, + "loss": 0.5472, + "step": 9411 + }, + { + "epoch": 2.4986061330147353, + "grad_norm": 0.4368443393016064, + "learning_rate": 3.1677071676034708e-06, + "loss": 0.5768, + "step": 9412 + }, + { + "epoch": 2.498871631488119, + "grad_norm": 0.4418991616528072, + "learning_rate": 3.167370715748215e-06, + "loss": 0.5761, + "step": 9413 + }, + { + "epoch": 2.4991371299615026, + "grad_norm": 0.4398447504453518, + "learning_rate": 3.1670342508775954e-06, + "loss": 0.5784, + "step": 9414 + }, + { + "epoch": 2.4994026284348863, + "grad_norm": 0.4213672710563381, + "learning_rate": 3.1666977729981746e-06, + "loss": 0.5334, + "step": 9415 + }, + { + "epoch": 2.4996681269082703, + "grad_norm": 0.4322465775984401, + "learning_rate": 3.1663612821165156e-06, + "loss": 0.5459, + "step": 9416 + }, + { + "epoch": 2.499933625381654, + "grad_norm": 0.4262706490928654, + "learning_rate": 3.1660247782391802e-06, + "loss": 0.5732, + "step": 9417 + }, + { + "epoch": 2.5001991238550376, + "grad_norm": 0.4378124962515531, + "learning_rate": 3.16568826137273e-06, + "loss": 0.5788, + "step": 9418 + }, + { + "epoch": 2.5004646223284217, + "grad_norm": 0.43240347248433175, + "learning_rate": 3.165351731523729e-06, + "loss": 0.5735, + "step": 9419 + }, + { + "epoch": 2.5007301208018053, + "grad_norm": 0.43058080754786965, + "learning_rate": 3.1650151886987414e-06, + "loss": 0.5104, + "step": 9420 + }, + { + "epoch": 2.500995619275189, + "grad_norm": 0.44248267596784074, + "learning_rate": 3.1646786329043283e-06, + "loss": 0.5586, + "step": 9421 + }, + { + "epoch": 2.501261117748573, + "grad_norm": 0.4409641447977084, + "learning_rate": 3.1643420641470547e-06, + "loss": 0.5777, + "step": 9422 + }, + { + "epoch": 2.5015266162219567, + "grad_norm": 0.42398713429322904, + "learning_rate": 3.1640054824334842e-06, + "loss": 0.5365, + "step": 9423 + }, + { + "epoch": 2.5017921146953404, + "grad_norm": 0.4375260120482033, + "learning_rate": 3.163668887770181e-06, + "loss": 0.6066, + "step": 9424 + }, + { + "epoch": 2.5020576131687244, + "grad_norm": 0.42498811281789456, + "learning_rate": 3.16333228016371e-06, + "loss": 0.5733, + "step": 9425 + }, + { + "epoch": 2.502323111642108, + "grad_norm": 0.4281434155084899, + "learning_rate": 3.1629956596206347e-06, + "loss": 0.5286, + "step": 9426 + }, + { + "epoch": 2.5025886101154917, + "grad_norm": 0.4347583599855308, + "learning_rate": 3.162659026147521e-06, + "loss": 0.5728, + "step": 9427 + }, + { + "epoch": 2.502854108588876, + "grad_norm": 0.43073409845445587, + "learning_rate": 3.1623223797509347e-06, + "loss": 0.5608, + "step": 9428 + }, + { + "epoch": 2.5031196070622594, + "grad_norm": 0.4622175140503573, + "learning_rate": 3.161985720437439e-06, + "loss": 0.5839, + "step": 9429 + }, + { + "epoch": 2.503385105535643, + "grad_norm": 0.44353609136544103, + "learning_rate": 3.1616490482136014e-06, + "loss": 0.5402, + "step": 9430 + }, + { + "epoch": 2.503650604009027, + "grad_norm": 0.4214474514734924, + "learning_rate": 3.1613123630859876e-06, + "loss": 0.5502, + "step": 9431 + }, + { + "epoch": 2.503916102482411, + "grad_norm": 0.43788547739664513, + "learning_rate": 3.160975665061163e-06, + "loss": 0.5534, + "step": 9432 + }, + { + "epoch": 2.5041816009557945, + "grad_norm": 0.43323573813811256, + "learning_rate": 3.160638954145695e-06, + "loss": 0.6018, + "step": 9433 + }, + { + "epoch": 2.5044470994291785, + "grad_norm": 0.43083876935160986, + "learning_rate": 3.16030223034615e-06, + "loss": 0.5546, + "step": 9434 + }, + { + "epoch": 2.504712597902562, + "grad_norm": 0.42605749357146955, + "learning_rate": 3.1599654936690945e-06, + "loss": 0.543, + "step": 9435 + }, + { + "epoch": 2.504978096375946, + "grad_norm": 0.42765070690264945, + "learning_rate": 3.159628744121096e-06, + "loss": 0.5754, + "step": 9436 + }, + { + "epoch": 2.5052435948493295, + "grad_norm": 0.40949004682864903, + "learning_rate": 3.1592919817087226e-06, + "loss": 0.5184, + "step": 9437 + }, + { + "epoch": 2.5055090933227135, + "grad_norm": 0.4275121339098621, + "learning_rate": 3.1589552064385404e-06, + "loss": 0.5523, + "step": 9438 + }, + { + "epoch": 2.505774591796097, + "grad_norm": 0.437249294851448, + "learning_rate": 3.1586184183171188e-06, + "loss": 0.5438, + "step": 9439 + }, + { + "epoch": 2.506040090269481, + "grad_norm": 0.4401593618561723, + "learning_rate": 3.1582816173510257e-06, + "loss": 0.5576, + "step": 9440 + }, + { + "epoch": 2.5063055887428645, + "grad_norm": 0.43138843271765864, + "learning_rate": 3.1579448035468287e-06, + "loss": 0.6055, + "step": 9441 + }, + { + "epoch": 2.5065710872162486, + "grad_norm": 0.4353020564972434, + "learning_rate": 3.157607976911097e-06, + "loss": 0.5519, + "step": 9442 + }, + { + "epoch": 2.506836585689632, + "grad_norm": 0.44013769531834684, + "learning_rate": 3.1572711374504e-06, + "loss": 0.5522, + "step": 9443 + }, + { + "epoch": 2.507102084163016, + "grad_norm": 0.4323945301662739, + "learning_rate": 3.156934285171307e-06, + "loss": 0.5383, + "step": 9444 + }, + { + "epoch": 2.5073675826364, + "grad_norm": 0.4319426722078496, + "learning_rate": 3.1565974200803863e-06, + "loss": 0.5636, + "step": 9445 + }, + { + "epoch": 2.5076330811097836, + "grad_norm": 0.4263642212609335, + "learning_rate": 3.1562605421842085e-06, + "loss": 0.5248, + "step": 9446 + }, + { + "epoch": 2.507898579583167, + "grad_norm": 0.4315088290521208, + "learning_rate": 3.1559236514893433e-06, + "loss": 0.5453, + "step": 9447 + }, + { + "epoch": 2.5081640780565513, + "grad_norm": 0.44455244906088387, + "learning_rate": 3.1555867480023616e-06, + "loss": 0.5821, + "step": 9448 + }, + { + "epoch": 2.508429576529935, + "grad_norm": 0.5844649920620739, + "learning_rate": 3.1552498317298323e-06, + "loss": 0.5283, + "step": 9449 + }, + { + "epoch": 2.5086950750033186, + "grad_norm": 0.4465978020049837, + "learning_rate": 3.154912902678327e-06, + "loss": 0.5624, + "step": 9450 + }, + { + "epoch": 2.5089605734767026, + "grad_norm": 0.42463831427826604, + "learning_rate": 3.154575960854417e-06, + "loss": 0.5456, + "step": 9451 + }, + { + "epoch": 2.5092260719500863, + "grad_norm": 0.434014154650915, + "learning_rate": 3.154239006264672e-06, + "loss": 0.5618, + "step": 9452 + }, + { + "epoch": 2.50949157042347, + "grad_norm": 0.4434477667537793, + "learning_rate": 3.1539020389156655e-06, + "loss": 0.5678, + "step": 9453 + }, + { + "epoch": 2.509757068896854, + "grad_norm": 0.4271692212882504, + "learning_rate": 3.153565058813969e-06, + "loss": 0.5634, + "step": 9454 + }, + { + "epoch": 2.5100225673702377, + "grad_norm": 0.4574139663320451, + "learning_rate": 3.1532280659661522e-06, + "loss": 0.5247, + "step": 9455 + }, + { + "epoch": 2.5102880658436213, + "grad_norm": 0.4535509337897757, + "learning_rate": 3.1528910603787894e-06, + "loss": 0.5552, + "step": 9456 + }, + { + "epoch": 2.5105535643170054, + "grad_norm": 0.4466815256195863, + "learning_rate": 3.1525540420584525e-06, + "loss": 0.5486, + "step": 9457 + }, + { + "epoch": 2.510819062790389, + "grad_norm": 0.4447736236922766, + "learning_rate": 3.1522170110117133e-06, + "loss": 0.5434, + "step": 9458 + }, + { + "epoch": 2.5110845612637727, + "grad_norm": 0.4438877722799763, + "learning_rate": 3.1518799672451463e-06, + "loss": 0.5428, + "step": 9459 + }, + { + "epoch": 2.5113500597371567, + "grad_norm": 0.4449830037930682, + "learning_rate": 3.1515429107653233e-06, + "loss": 0.5532, + "step": 9460 + }, + { + "epoch": 2.5116155582105404, + "grad_norm": 0.45219101639328685, + "learning_rate": 3.151205841578818e-06, + "loss": 0.5782, + "step": 9461 + }, + { + "epoch": 2.511881056683924, + "grad_norm": 0.45266383036782876, + "learning_rate": 3.150868759692205e-06, + "loss": 0.5565, + "step": 9462 + }, + { + "epoch": 2.512146555157308, + "grad_norm": 0.44445814861076, + "learning_rate": 3.1505316651120584e-06, + "loss": 0.6087, + "step": 9463 + }, + { + "epoch": 2.5124120536306918, + "grad_norm": 0.44306589737683216, + "learning_rate": 3.15019455784495e-06, + "loss": 0.5846, + "step": 9464 + }, + { + "epoch": 2.5126775521040754, + "grad_norm": 0.4208172398550784, + "learning_rate": 3.1498574378974566e-06, + "loss": 0.5516, + "step": 9465 + }, + { + "epoch": 2.512943050577459, + "grad_norm": 0.434473317379948, + "learning_rate": 3.1495203052761526e-06, + "loss": 0.5713, + "step": 9466 + }, + { + "epoch": 2.513208549050843, + "grad_norm": 0.4388516385148562, + "learning_rate": 3.149183159987611e-06, + "loss": 0.5758, + "step": 9467 + }, + { + "epoch": 2.5134740475242268, + "grad_norm": 0.43571511140204006, + "learning_rate": 3.1488460020384087e-06, + "loss": 0.5638, + "step": 9468 + }, + { + "epoch": 2.5137395459976104, + "grad_norm": 0.43654398294349417, + "learning_rate": 3.1485088314351208e-06, + "loss": 0.5927, + "step": 9469 + }, + { + "epoch": 2.514005044470994, + "grad_norm": 0.43061235903936945, + "learning_rate": 3.1481716481843227e-06, + "loss": 0.5347, + "step": 9470 + }, + { + "epoch": 2.514270542944378, + "grad_norm": 0.4411758669881755, + "learning_rate": 3.1478344522925912e-06, + "loss": 0.5776, + "step": 9471 + }, + { + "epoch": 2.5145360414177618, + "grad_norm": 0.42823824659729354, + "learning_rate": 3.1474972437665015e-06, + "loss": 0.5126, + "step": 9472 + }, + { + "epoch": 2.5148015398911454, + "grad_norm": 0.4403791275761967, + "learning_rate": 3.14716002261263e-06, + "loss": 0.5709, + "step": 9473 + }, + { + "epoch": 2.5150670383645295, + "grad_norm": 0.44799490711890705, + "learning_rate": 3.1468227888375535e-06, + "loss": 0.5547, + "step": 9474 + }, + { + "epoch": 2.515332536837913, + "grad_norm": 0.4407152566077431, + "learning_rate": 3.146485542447849e-06, + "loss": 0.573, + "step": 9475 + }, + { + "epoch": 2.5155980353112968, + "grad_norm": 0.43329119398929544, + "learning_rate": 3.1461482834500936e-06, + "loss": 0.5587, + "step": 9476 + }, + { + "epoch": 2.515863533784681, + "grad_norm": 0.45528324970823947, + "learning_rate": 3.1458110118508643e-06, + "loss": 0.5737, + "step": 9477 + }, + { + "epoch": 2.5161290322580645, + "grad_norm": 0.4440823591559609, + "learning_rate": 3.145473727656739e-06, + "loss": 0.5683, + "step": 9478 + }, + { + "epoch": 2.516394530731448, + "grad_norm": 0.4373493703279611, + "learning_rate": 3.1451364308742955e-06, + "loss": 0.5739, + "step": 9479 + }, + { + "epoch": 2.5166600292048322, + "grad_norm": 0.4491901021562008, + "learning_rate": 3.1447991215101125e-06, + "loss": 0.5216, + "step": 9480 + }, + { + "epoch": 2.516925527678216, + "grad_norm": 0.4255890142292866, + "learning_rate": 3.1444617995707682e-06, + "loss": 0.5378, + "step": 9481 + }, + { + "epoch": 2.5171910261515995, + "grad_norm": 0.41890493629684467, + "learning_rate": 3.14412446506284e-06, + "loss": 0.5613, + "step": 9482 + }, + { + "epoch": 2.5174565246249836, + "grad_norm": 0.4313924028897693, + "learning_rate": 3.1437871179929086e-06, + "loss": 0.539, + "step": 9483 + }, + { + "epoch": 2.5177220230983672, + "grad_norm": 0.45136518996633596, + "learning_rate": 3.1434497583675514e-06, + "loss": 0.5838, + "step": 9484 + }, + { + "epoch": 2.517987521571751, + "grad_norm": 0.442987377724069, + "learning_rate": 3.1431123861933487e-06, + "loss": 0.5979, + "step": 9485 + }, + { + "epoch": 2.518253020045135, + "grad_norm": 0.45101425045741417, + "learning_rate": 3.1427750014768805e-06, + "loss": 0.5419, + "step": 9486 + }, + { + "epoch": 2.5185185185185186, + "grad_norm": 0.4351321527608984, + "learning_rate": 3.1424376042247253e-06, + "loss": 0.562, + "step": 9487 + }, + { + "epoch": 2.5187840169919022, + "grad_norm": 0.43242431047249213, + "learning_rate": 3.1421001944434633e-06, + "loss": 0.5544, + "step": 9488 + }, + { + "epoch": 2.5190495154652863, + "grad_norm": 0.4452696267555076, + "learning_rate": 3.141762772139676e-06, + "loss": 0.5489, + "step": 9489 + }, + { + "epoch": 2.51931501393867, + "grad_norm": 0.44612255944187135, + "learning_rate": 3.1414253373199434e-06, + "loss": 0.5976, + "step": 9490 + }, + { + "epoch": 2.5195805124120536, + "grad_norm": 0.47281284484367747, + "learning_rate": 3.141087889990846e-06, + "loss": 0.5886, + "step": 9491 + }, + { + "epoch": 2.5198460108854372, + "grad_norm": 0.4552362733647655, + "learning_rate": 3.1407504301589657e-06, + "loss": 0.5292, + "step": 9492 + }, + { + "epoch": 2.5201115093588213, + "grad_norm": 0.42790738361548813, + "learning_rate": 3.140412957830883e-06, + "loss": 0.5721, + "step": 9493 + }, + { + "epoch": 2.520377007832205, + "grad_norm": 0.4338820548613723, + "learning_rate": 3.1400754730131795e-06, + "loss": 0.5639, + "step": 9494 + }, + { + "epoch": 2.5206425063055886, + "grad_norm": 0.44211137226032915, + "learning_rate": 3.139737975712437e-06, + "loss": 0.5598, + "step": 9495 + }, + { + "epoch": 2.5209080047789723, + "grad_norm": 0.43897999381338676, + "learning_rate": 3.1394004659352373e-06, + "loss": 0.5646, + "step": 9496 + }, + { + "epoch": 2.5211735032523563, + "grad_norm": 0.43186772495692166, + "learning_rate": 3.139062943688163e-06, + "loss": 0.5808, + "step": 9497 + }, + { + "epoch": 2.52143900172574, + "grad_norm": 0.4225044981361485, + "learning_rate": 3.138725408977797e-06, + "loss": 0.5479, + "step": 9498 + }, + { + "epoch": 2.5217045001991236, + "grad_norm": 0.45391021063449577, + "learning_rate": 3.1383878618107213e-06, + "loss": 0.5823, + "step": 9499 + }, + { + "epoch": 2.5219699986725077, + "grad_norm": 0.4455262752039536, + "learning_rate": 3.13805030219352e-06, + "loss": 0.5808, + "step": 9500 + }, + { + "epoch": 2.5222354971458913, + "grad_norm": 0.44299883766118314, + "learning_rate": 3.137712730132775e-06, + "loss": 0.5747, + "step": 9501 + }, + { + "epoch": 2.522500995619275, + "grad_norm": 0.43684824160283026, + "learning_rate": 3.1373751456350703e-06, + "loss": 0.5588, + "step": 9502 + }, + { + "epoch": 2.522766494092659, + "grad_norm": 0.42922587695963926, + "learning_rate": 3.1370375487069905e-06, + "loss": 0.5619, + "step": 9503 + }, + { + "epoch": 2.5230319925660427, + "grad_norm": 0.43547936776945184, + "learning_rate": 3.136699939355118e-06, + "loss": 0.5809, + "step": 9504 + }, + { + "epoch": 2.5232974910394264, + "grad_norm": 0.42667776314168326, + "learning_rate": 3.1363623175860377e-06, + "loss": 0.5832, + "step": 9505 + }, + { + "epoch": 2.5235629895128104, + "grad_norm": 0.42527543983493854, + "learning_rate": 3.136024683406334e-06, + "loss": 0.533, + "step": 9506 + }, + { + "epoch": 2.523828487986194, + "grad_norm": 0.42309528273790514, + "learning_rate": 3.135687036822592e-06, + "loss": 0.5795, + "step": 9507 + }, + { + "epoch": 2.5240939864595777, + "grad_norm": 0.4497635986445369, + "learning_rate": 3.135349377841396e-06, + "loss": 0.6098, + "step": 9508 + }, + { + "epoch": 2.524359484932962, + "grad_norm": 0.42315734533823407, + "learning_rate": 3.1350117064693324e-06, + "loss": 0.5744, + "step": 9509 + }, + { + "epoch": 2.5246249834063454, + "grad_norm": 0.4355568249485254, + "learning_rate": 3.134674022712985e-06, + "loss": 0.5661, + "step": 9510 + }, + { + "epoch": 2.524890481879729, + "grad_norm": 0.43121043889104366, + "learning_rate": 3.1343363265789407e-06, + "loss": 0.564, + "step": 9511 + }, + { + "epoch": 2.525155980353113, + "grad_norm": 0.43134446245835184, + "learning_rate": 3.133998618073785e-06, + "loss": 0.5948, + "step": 9512 + }, + { + "epoch": 2.525421478826497, + "grad_norm": 0.4423339513765274, + "learning_rate": 3.1336608972041037e-06, + "loss": 0.6002, + "step": 9513 + }, + { + "epoch": 2.5256869772998805, + "grad_norm": 0.42365528100900635, + "learning_rate": 3.133323163976483e-06, + "loss": 0.5897, + "step": 9514 + }, + { + "epoch": 2.5259524757732645, + "grad_norm": 0.4389206580562018, + "learning_rate": 3.1329854183975106e-06, + "loss": 0.5714, + "step": 9515 + }, + { + "epoch": 2.526217974246648, + "grad_norm": 0.448376762853967, + "learning_rate": 3.132647660473772e-06, + "loss": 0.565, + "step": 9516 + }, + { + "epoch": 2.526483472720032, + "grad_norm": 0.4275117684605819, + "learning_rate": 3.1323098902118554e-06, + "loss": 0.5199, + "step": 9517 + }, + { + "epoch": 2.526748971193416, + "grad_norm": 0.43336623722199247, + "learning_rate": 3.1319721076183484e-06, + "loss": 0.5889, + "step": 9518 + }, + { + "epoch": 2.5270144696667995, + "grad_norm": 0.4322185586098048, + "learning_rate": 3.131634312699837e-06, + "loss": 0.5763, + "step": 9519 + }, + { + "epoch": 2.527279968140183, + "grad_norm": 0.42550780270464755, + "learning_rate": 3.1312965054629106e-06, + "loss": 0.5696, + "step": 9520 + }, + { + "epoch": 2.527545466613567, + "grad_norm": 0.43623261231775873, + "learning_rate": 3.1309586859141566e-06, + "loss": 0.5637, + "step": 9521 + }, + { + "epoch": 2.527810965086951, + "grad_norm": 0.4332095673225028, + "learning_rate": 3.1306208540601637e-06, + "loss": 0.5423, + "step": 9522 + }, + { + "epoch": 2.5280764635603346, + "grad_norm": 0.4203605509327906, + "learning_rate": 3.13028300990752e-06, + "loss": 0.5639, + "step": 9523 + }, + { + "epoch": 2.528341962033718, + "grad_norm": 0.43504759431508144, + "learning_rate": 3.1299451534628134e-06, + "loss": 0.5969, + "step": 9524 + }, + { + "epoch": 2.528607460507102, + "grad_norm": 0.42970801059919694, + "learning_rate": 3.129607284732634e-06, + "loss": 0.5752, + "step": 9525 + }, + { + "epoch": 2.528872958980486, + "grad_norm": 0.4287573954391148, + "learning_rate": 3.1292694037235723e-06, + "loss": 0.5392, + "step": 9526 + }, + { + "epoch": 2.5291384574538696, + "grad_norm": 0.41973472472414747, + "learning_rate": 3.128931510442216e-06, + "loss": 0.5369, + "step": 9527 + }, + { + "epoch": 2.529403955927253, + "grad_norm": 0.4361919066579121, + "learning_rate": 3.1285936048951547e-06, + "loss": 0.5419, + "step": 9528 + }, + { + "epoch": 2.5296694544006373, + "grad_norm": 0.4443904929303206, + "learning_rate": 3.12825568708898e-06, + "loss": 0.5876, + "step": 9529 + }, + { + "epoch": 2.529934952874021, + "grad_norm": 0.4402880808060107, + "learning_rate": 3.1279177570302802e-06, + "loss": 0.5771, + "step": 9530 + }, + { + "epoch": 2.5302004513474046, + "grad_norm": 0.45374940769583544, + "learning_rate": 3.1275798147256474e-06, + "loss": 0.5294, + "step": 9531 + }, + { + "epoch": 2.5304659498207887, + "grad_norm": 0.4363678735918915, + "learning_rate": 3.127241860181672e-06, + "loss": 0.5189, + "step": 9532 + }, + { + "epoch": 2.5307314482941723, + "grad_norm": 0.4344168100653965, + "learning_rate": 3.126903893404944e-06, + "loss": 0.5584, + "step": 9533 + }, + { + "epoch": 2.530996946767556, + "grad_norm": 0.4305850554702492, + "learning_rate": 3.1265659144020544e-06, + "loss": 0.5296, + "step": 9534 + }, + { + "epoch": 2.53126244524094, + "grad_norm": 0.44083573100602924, + "learning_rate": 3.126227923179597e-06, + "loss": 0.5621, + "step": 9535 + }, + { + "epoch": 2.5315279437143237, + "grad_norm": 0.4269563173026622, + "learning_rate": 3.1258899197441608e-06, + "loss": 0.5384, + "step": 9536 + }, + { + "epoch": 2.5317934421877073, + "grad_norm": 0.4441549610656123, + "learning_rate": 3.1255519041023387e-06, + "loss": 0.5822, + "step": 9537 + }, + { + "epoch": 2.5320589406610914, + "grad_norm": 0.4297305506469278, + "learning_rate": 3.125213876260723e-06, + "loss": 0.5348, + "step": 9538 + }, + { + "epoch": 2.532324439134475, + "grad_norm": 0.43566509214109717, + "learning_rate": 3.124875836225906e-06, + "loss": 0.5192, + "step": 9539 + }, + { + "epoch": 2.5325899376078587, + "grad_norm": 0.4416059876334432, + "learning_rate": 3.1245377840044805e-06, + "loss": 0.6153, + "step": 9540 + }, + { + "epoch": 2.5328554360812428, + "grad_norm": 0.42428828956074627, + "learning_rate": 3.1241997196030388e-06, + "loss": 0.5423, + "step": 9541 + }, + { + "epoch": 2.5331209345546264, + "grad_norm": 0.42596054185212223, + "learning_rate": 3.123861643028174e-06, + "loss": 0.5344, + "step": 9542 + }, + { + "epoch": 2.53338643302801, + "grad_norm": 0.43986906323238645, + "learning_rate": 3.12352355428648e-06, + "loss": 0.5739, + "step": 9543 + }, + { + "epoch": 2.533651931501394, + "grad_norm": 0.4317776710873684, + "learning_rate": 3.12318545338455e-06, + "loss": 0.568, + "step": 9544 + }, + { + "epoch": 2.5339174299747778, + "grad_norm": 0.42654226536784584, + "learning_rate": 3.1228473403289776e-06, + "loss": 0.5798, + "step": 9545 + }, + { + "epoch": 2.5341829284481614, + "grad_norm": 0.44675013196536045, + "learning_rate": 3.122509215126358e-06, + "loss": 0.5517, + "step": 9546 + }, + { + "epoch": 2.534448426921545, + "grad_norm": 0.42996163224144596, + "learning_rate": 3.1221710777832835e-06, + "loss": 0.5558, + "step": 9547 + }, + { + "epoch": 2.534713925394929, + "grad_norm": 0.4368409079749489, + "learning_rate": 3.1218329283063497e-06, + "loss": 0.5744, + "step": 9548 + }, + { + "epoch": 2.5349794238683128, + "grad_norm": 0.44339747108247923, + "learning_rate": 3.121494766702152e-06, + "loss": 0.5724, + "step": 9549 + }, + { + "epoch": 2.5352449223416964, + "grad_norm": 0.44277342219882204, + "learning_rate": 3.1211565929772845e-06, + "loss": 0.5916, + "step": 9550 + }, + { + "epoch": 2.53551042081508, + "grad_norm": 0.44487316385250864, + "learning_rate": 3.120818407138342e-06, + "loss": 0.582, + "step": 9551 + }, + { + "epoch": 2.535775919288464, + "grad_norm": 0.4362030697603754, + "learning_rate": 3.1204802091919204e-06, + "loss": 0.5725, + "step": 9552 + }, + { + "epoch": 2.5360414177618478, + "grad_norm": 0.435433119817375, + "learning_rate": 3.1201419991446165e-06, + "loss": 0.6164, + "step": 9553 + }, + { + "epoch": 2.5363069162352314, + "grad_norm": 0.446177343611205, + "learning_rate": 3.119803777003024e-06, + "loss": 0.563, + "step": 9554 + }, + { + "epoch": 2.5365724147086155, + "grad_norm": 0.4345535738139898, + "learning_rate": 3.119465542773742e-06, + "loss": 0.5578, + "step": 9555 + }, + { + "epoch": 2.536837913181999, + "grad_norm": 0.45093478053169606, + "learning_rate": 3.119127296463364e-06, + "loss": 0.5964, + "step": 9556 + }, + { + "epoch": 2.537103411655383, + "grad_norm": 0.4364144173436119, + "learning_rate": 3.118789038078488e-06, + "loss": 0.5248, + "step": 9557 + }, + { + "epoch": 2.537368910128767, + "grad_norm": 0.43196640147038123, + "learning_rate": 3.1184507676257114e-06, + "loss": 0.5519, + "step": 9558 + }, + { + "epoch": 2.5376344086021505, + "grad_norm": 0.43453490475121515, + "learning_rate": 3.1181124851116298e-06, + "loss": 0.5761, + "step": 9559 + }, + { + "epoch": 2.537899907075534, + "grad_norm": 0.43271268563202675, + "learning_rate": 3.1177741905428416e-06, + "loss": 0.5858, + "step": 9560 + }, + { + "epoch": 2.5381654055489182, + "grad_norm": 0.4351962821397591, + "learning_rate": 3.1174358839259432e-06, + "loss": 0.6051, + "step": 9561 + }, + { + "epoch": 2.538430904022302, + "grad_norm": 0.44725236624506076, + "learning_rate": 3.1170975652675347e-06, + "loss": 0.5602, + "step": 9562 + }, + { + "epoch": 2.5386964024956855, + "grad_norm": 0.44153102422449925, + "learning_rate": 3.116759234574212e-06, + "loss": 0.5664, + "step": 9563 + }, + { + "epoch": 2.5389619009690696, + "grad_norm": 0.4236565414140181, + "learning_rate": 3.1164208918525747e-06, + "loss": 0.569, + "step": 9564 + }, + { + "epoch": 2.5392273994424532, + "grad_norm": 0.4300754715942089, + "learning_rate": 3.11608253710922e-06, + "loss": 0.5369, + "step": 9565 + }, + { + "epoch": 2.539492897915837, + "grad_norm": 0.42609634091504456, + "learning_rate": 3.115744170350748e-06, + "loss": 0.5315, + "step": 9566 + }, + { + "epoch": 2.539758396389221, + "grad_norm": 0.42587900942237505, + "learning_rate": 3.115405791583757e-06, + "loss": 0.5801, + "step": 9567 + }, + { + "epoch": 2.5400238948626046, + "grad_norm": 0.431035010412275, + "learning_rate": 3.115067400814846e-06, + "loss": 0.5613, + "step": 9568 + }, + { + "epoch": 2.5402893933359882, + "grad_norm": 0.4339576942082634, + "learning_rate": 3.1147289980506153e-06, + "loss": 0.5588, + "step": 9569 + }, + { + "epoch": 2.5405548918093723, + "grad_norm": 0.4273639579508005, + "learning_rate": 3.1143905832976635e-06, + "loss": 0.5534, + "step": 9570 + }, + { + "epoch": 2.540820390282756, + "grad_norm": 0.44161662566454657, + "learning_rate": 3.114052156562591e-06, + "loss": 0.5517, + "step": 9571 + }, + { + "epoch": 2.5410858887561396, + "grad_norm": 0.4458514529256643, + "learning_rate": 3.1137137178519983e-06, + "loss": 0.5786, + "step": 9572 + }, + { + "epoch": 2.5413513872295237, + "grad_norm": 0.44340320434447833, + "learning_rate": 3.1133752671724854e-06, + "loss": 0.5465, + "step": 9573 + }, + { + "epoch": 2.5416168857029073, + "grad_norm": 0.4412991934226862, + "learning_rate": 3.1130368045306524e-06, + "loss": 0.584, + "step": 9574 + }, + { + "epoch": 2.541882384176291, + "grad_norm": 0.44850265188253113, + "learning_rate": 3.112698329933102e-06, + "loss": 0.5491, + "step": 9575 + }, + { + "epoch": 2.5421478826496746, + "grad_norm": 0.44833539809204115, + "learning_rate": 3.112359843386433e-06, + "loss": 0.5808, + "step": 9576 + }, + { + "epoch": 2.5424133811230587, + "grad_norm": 0.4544369119280941, + "learning_rate": 3.112021344897248e-06, + "loss": 0.5609, + "step": 9577 + }, + { + "epoch": 2.5426788795964423, + "grad_norm": 0.4313114021309775, + "learning_rate": 3.111682834472149e-06, + "loss": 0.5812, + "step": 9578 + }, + { + "epoch": 2.542944378069826, + "grad_norm": 0.4309077614383536, + "learning_rate": 3.1113443121177363e-06, + "loss": 0.5776, + "step": 9579 + }, + { + "epoch": 2.5432098765432096, + "grad_norm": 0.44483404090843465, + "learning_rate": 3.1110057778406116e-06, + "loss": 0.5782, + "step": 9580 + }, + { + "epoch": 2.5434753750165937, + "grad_norm": 0.4479458413040876, + "learning_rate": 3.11066723164738e-06, + "loss": 0.5972, + "step": 9581 + }, + { + "epoch": 2.5437408734899773, + "grad_norm": 0.42731519509149357, + "learning_rate": 3.1103286735446416e-06, + "loss": 0.556, + "step": 9582 + }, + { + "epoch": 2.544006371963361, + "grad_norm": 0.44485304815168575, + "learning_rate": 3.1099901035389997e-06, + "loss": 0.5329, + "step": 9583 + }, + { + "epoch": 2.544271870436745, + "grad_norm": 0.42894232338073646, + "learning_rate": 3.1096515216370576e-06, + "loss": 0.5779, + "step": 9584 + }, + { + "epoch": 2.5445373689101287, + "grad_norm": 0.4268356634060042, + "learning_rate": 3.1093129278454177e-06, + "loss": 0.5301, + "step": 9585 + }, + { + "epoch": 2.5448028673835124, + "grad_norm": 0.4511171018705595, + "learning_rate": 3.1089743221706835e-06, + "loss": 0.6065, + "step": 9586 + }, + { + "epoch": 2.5450683658568964, + "grad_norm": 0.4273845279605641, + "learning_rate": 3.1086357046194603e-06, + "loss": 0.5392, + "step": 9587 + }, + { + "epoch": 2.54533386433028, + "grad_norm": 0.43914860888158624, + "learning_rate": 3.1082970751983497e-06, + "loss": 0.5146, + "step": 9588 + }, + { + "epoch": 2.5455993628036637, + "grad_norm": 0.4324563904612212, + "learning_rate": 3.1079584339139566e-06, + "loss": 0.6039, + "step": 9589 + }, + { + "epoch": 2.545864861277048, + "grad_norm": 0.4271580225793852, + "learning_rate": 3.1076197807728857e-06, + "loss": 0.5515, + "step": 9590 + }, + { + "epoch": 2.5461303597504314, + "grad_norm": 0.46396719742564996, + "learning_rate": 3.107281115781741e-06, + "loss": 0.5709, + "step": 9591 + }, + { + "epoch": 2.546395858223815, + "grad_norm": 0.4349570813326812, + "learning_rate": 3.1069424389471292e-06, + "loss": 0.5636, + "step": 9592 + }, + { + "epoch": 2.546661356697199, + "grad_norm": 0.46137287062106475, + "learning_rate": 3.1066037502756523e-06, + "loss": 0.5609, + "step": 9593 + }, + { + "epoch": 2.546926855170583, + "grad_norm": 0.4500883025148973, + "learning_rate": 3.106265049773917e-06, + "loss": 0.5508, + "step": 9594 + }, + { + "epoch": 2.5471923536439665, + "grad_norm": 0.44443925103479776, + "learning_rate": 3.1059263374485305e-06, + "loss": 0.5659, + "step": 9595 + }, + { + "epoch": 2.5474578521173505, + "grad_norm": 0.4315078983705683, + "learning_rate": 3.105587613306095e-06, + "loss": 0.5667, + "step": 9596 + }, + { + "epoch": 2.547723350590734, + "grad_norm": 0.4846562068702339, + "learning_rate": 3.105248877353219e-06, + "loss": 0.5505, + "step": 9597 + }, + { + "epoch": 2.547988849064118, + "grad_norm": 0.4399541495163904, + "learning_rate": 3.104910129596508e-06, + "loss": 0.5369, + "step": 9598 + }, + { + "epoch": 2.548254347537502, + "grad_norm": 0.4236312804238317, + "learning_rate": 3.1045713700425685e-06, + "loss": 0.5206, + "step": 9599 + }, + { + "epoch": 2.5485198460108855, + "grad_norm": 0.4415132115063261, + "learning_rate": 3.1042325986980066e-06, + "loss": 0.5789, + "step": 9600 + }, + { + "epoch": 2.548785344484269, + "grad_norm": 0.43559110899813647, + "learning_rate": 3.1038938155694303e-06, + "loss": 0.5809, + "step": 9601 + }, + { + "epoch": 2.549050842957653, + "grad_norm": 0.44944716217112785, + "learning_rate": 3.103555020663445e-06, + "loss": 0.5458, + "step": 9602 + }, + { + "epoch": 2.549316341431037, + "grad_norm": 0.4462666502776378, + "learning_rate": 3.10321621398666e-06, + "loss": 0.5799, + "step": 9603 + }, + { + "epoch": 2.5495818399044206, + "grad_norm": 0.44363867021151626, + "learning_rate": 3.102877395545682e-06, + "loss": 0.5851, + "step": 9604 + }, + { + "epoch": 2.549847338377804, + "grad_norm": 0.42597884771753386, + "learning_rate": 3.102538565347118e-06, + "loss": 0.5575, + "step": 9605 + }, + { + "epoch": 2.550112836851188, + "grad_norm": 0.4290679298444189, + "learning_rate": 3.1021997233975766e-06, + "loss": 0.5285, + "step": 9606 + }, + { + "epoch": 2.550378335324572, + "grad_norm": 0.46242184259367003, + "learning_rate": 3.101860869703667e-06, + "loss": 0.5356, + "step": 9607 + }, + { + "epoch": 2.5506438337979556, + "grad_norm": 0.4294958834898439, + "learning_rate": 3.1015220042719955e-06, + "loss": 0.5771, + "step": 9608 + }, + { + "epoch": 2.550909332271339, + "grad_norm": 0.4406036185051546, + "learning_rate": 3.101183127109173e-06, + "loss": 0.5364, + "step": 9609 + }, + { + "epoch": 2.5511748307447233, + "grad_norm": 0.42664364356263557, + "learning_rate": 3.100844238221808e-06, + "loss": 0.5461, + "step": 9610 + }, + { + "epoch": 2.551440329218107, + "grad_norm": 0.4336074557591012, + "learning_rate": 3.100505337616509e-06, + "loss": 0.5513, + "step": 9611 + }, + { + "epoch": 2.5517058276914906, + "grad_norm": 0.42951183128196596, + "learning_rate": 3.1001664252998853e-06, + "loss": 0.5493, + "step": 9612 + }, + { + "epoch": 2.5519713261648747, + "grad_norm": 0.4418028266518351, + "learning_rate": 3.0998275012785473e-06, + "loss": 0.5733, + "step": 9613 + }, + { + "epoch": 2.5522368246382583, + "grad_norm": 0.4330427838676648, + "learning_rate": 3.099488565559104e-06, + "loss": 0.5734, + "step": 9614 + }, + { + "epoch": 2.552502323111642, + "grad_norm": 0.4443548976547577, + "learning_rate": 3.0991496181481654e-06, + "loss": 0.5868, + "step": 9615 + }, + { + "epoch": 2.552767821585026, + "grad_norm": 0.4411619204202222, + "learning_rate": 3.0988106590523425e-06, + "loss": 0.5515, + "step": 9616 + }, + { + "epoch": 2.5530333200584097, + "grad_norm": 0.4400094390422781, + "learning_rate": 3.0984716882782464e-06, + "loss": 0.5529, + "step": 9617 + }, + { + "epoch": 2.5532988185317933, + "grad_norm": 0.4287085676505781, + "learning_rate": 3.098132705832487e-06, + "loss": 0.578, + "step": 9618 + }, + { + "epoch": 2.5535643170051774, + "grad_norm": 0.43531895284167754, + "learning_rate": 3.0977937117216746e-06, + "loss": 0.5688, + "step": 9619 + }, + { + "epoch": 2.553829815478561, + "grad_norm": 0.4433710813700474, + "learning_rate": 3.097454705952421e-06, + "loss": 0.5429, + "step": 9620 + }, + { + "epoch": 2.5540953139519447, + "grad_norm": 0.44479554804842375, + "learning_rate": 3.0971156885313392e-06, + "loss": 0.5728, + "step": 9621 + }, + { + "epoch": 2.5543608124253288, + "grad_norm": 0.4310681895763898, + "learning_rate": 3.0967766594650383e-06, + "loss": 0.5217, + "step": 9622 + }, + { + "epoch": 2.5546263108987124, + "grad_norm": 0.43433357132671396, + "learning_rate": 3.096437618760131e-06, + "loss": 0.5698, + "step": 9623 + }, + { + "epoch": 2.554891809372096, + "grad_norm": 0.4258182024620701, + "learning_rate": 3.0960985664232314e-06, + "loss": 0.5654, + "step": 9624 + }, + { + "epoch": 2.55515730784548, + "grad_norm": 0.44314283305276586, + "learning_rate": 3.095759502460949e-06, + "loss": 0.6027, + "step": 9625 + }, + { + "epoch": 2.5554228063188638, + "grad_norm": 0.4481693364534932, + "learning_rate": 3.0954204268798972e-06, + "loss": 0.5402, + "step": 9626 + }, + { + "epoch": 2.5556883047922474, + "grad_norm": 0.44993258145260806, + "learning_rate": 3.095081339686691e-06, + "loss": 0.565, + "step": 9627 + }, + { + "epoch": 2.5559538032656315, + "grad_norm": 0.4482393847770999, + "learning_rate": 3.09474224088794e-06, + "loss": 0.6091, + "step": 9628 + }, + { + "epoch": 2.556219301739015, + "grad_norm": 0.4367650615935612, + "learning_rate": 3.09440313049026e-06, + "loss": 0.5642, + "step": 9629 + }, + { + "epoch": 2.5564848002123988, + "grad_norm": 0.44771433011254264, + "learning_rate": 3.0940640085002636e-06, + "loss": 0.5565, + "step": 9630 + }, + { + "epoch": 2.5567502986857824, + "grad_norm": 0.4376664303792177, + "learning_rate": 3.093724874924564e-06, + "loss": 0.5437, + "step": 9631 + }, + { + "epoch": 2.5570157971591665, + "grad_norm": 0.42659774084869156, + "learning_rate": 3.093385729769776e-06, + "loss": 0.5374, + "step": 9632 + }, + { + "epoch": 2.55728129563255, + "grad_norm": 0.4300593549908244, + "learning_rate": 3.0930465730425136e-06, + "loss": 0.558, + "step": 9633 + }, + { + "epoch": 2.5575467941059338, + "grad_norm": 0.4330741643839839, + "learning_rate": 3.092707404749391e-06, + "loss": 0.5566, + "step": 9634 + }, + { + "epoch": 2.5578122925793174, + "grad_norm": 0.42953398249368685, + "learning_rate": 3.092368224897022e-06, + "loss": 0.5827, + "step": 9635 + }, + { + "epoch": 2.5580777910527015, + "grad_norm": 0.419536117584414, + "learning_rate": 3.0920290334920228e-06, + "loss": 0.5379, + "step": 9636 + }, + { + "epoch": 2.558343289526085, + "grad_norm": 0.44578708584775045, + "learning_rate": 3.091689830541008e-06, + "loss": 0.5544, + "step": 9637 + }, + { + "epoch": 2.558608787999469, + "grad_norm": 0.4276303993102809, + "learning_rate": 3.091350616050592e-06, + "loss": 0.5899, + "step": 9638 + }, + { + "epoch": 2.558874286472853, + "grad_norm": 0.43882022603036214, + "learning_rate": 3.0910113900273924e-06, + "loss": 0.5519, + "step": 9639 + }, + { + "epoch": 2.5591397849462365, + "grad_norm": 0.4291659665658252, + "learning_rate": 3.090672152478023e-06, + "loss": 0.5553, + "step": 9640 + }, + { + "epoch": 2.55940528341962, + "grad_norm": 0.44232041095698077, + "learning_rate": 3.090332903409101e-06, + "loss": 0.5798, + "step": 9641 + }, + { + "epoch": 2.5596707818930042, + "grad_norm": 0.44130519804366836, + "learning_rate": 3.0899936428272413e-06, + "loss": 0.5657, + "step": 9642 + }, + { + "epoch": 2.559936280366388, + "grad_norm": 0.451146993664638, + "learning_rate": 3.0896543707390613e-06, + "loss": 0.6109, + "step": 9643 + }, + { + "epoch": 2.5602017788397715, + "grad_norm": 0.44537391289833644, + "learning_rate": 3.089315087151177e-06, + "loss": 0.5875, + "step": 9644 + }, + { + "epoch": 2.5604672773131556, + "grad_norm": 0.43163412393199746, + "learning_rate": 3.0889757920702057e-06, + "loss": 0.5899, + "step": 9645 + }, + { + "epoch": 2.5607327757865392, + "grad_norm": 0.45092081440197557, + "learning_rate": 3.0886364855027645e-06, + "loss": 0.5402, + "step": 9646 + }, + { + "epoch": 2.560998274259923, + "grad_norm": 0.4283447167542925, + "learning_rate": 3.0882971674554714e-06, + "loss": 0.5752, + "step": 9647 + }, + { + "epoch": 2.561263772733307, + "grad_norm": 0.4282776561390347, + "learning_rate": 3.0879578379349416e-06, + "loss": 0.4969, + "step": 9648 + }, + { + "epoch": 2.5615292712066906, + "grad_norm": 0.42995979364080233, + "learning_rate": 3.0876184969477956e-06, + "loss": 0.5207, + "step": 9649 + }, + { + "epoch": 2.5617947696800742, + "grad_norm": 0.4437998775448447, + "learning_rate": 3.08727914450065e-06, + "loss": 0.5301, + "step": 9650 + }, + { + "epoch": 2.5620602681534583, + "grad_norm": 0.45048063157224244, + "learning_rate": 3.0869397806001228e-06, + "loss": 0.5408, + "step": 9651 + }, + { + "epoch": 2.562325766626842, + "grad_norm": 0.4439482520063843, + "learning_rate": 3.0866004052528333e-06, + "loss": 0.5216, + "step": 9652 + }, + { + "epoch": 2.5625912651002256, + "grad_norm": 0.4518546624959561, + "learning_rate": 3.0862610184653987e-06, + "loss": 0.5493, + "step": 9653 + }, + { + "epoch": 2.5628567635736097, + "grad_norm": 0.4403500420155033, + "learning_rate": 3.0859216202444393e-06, + "loss": 0.5355, + "step": 9654 + }, + { + "epoch": 2.5631222620469933, + "grad_norm": 0.42894716317848847, + "learning_rate": 3.085582210596574e-06, + "loss": 0.567, + "step": 9655 + }, + { + "epoch": 2.563387760520377, + "grad_norm": 0.43339364611784054, + "learning_rate": 3.085242789528422e-06, + "loss": 0.5355, + "step": 9656 + }, + { + "epoch": 2.5636532589937606, + "grad_norm": 0.43156591436058017, + "learning_rate": 3.0849033570466017e-06, + "loss": 0.5758, + "step": 9657 + }, + { + "epoch": 2.5639187574671447, + "grad_norm": 0.4543316643247976, + "learning_rate": 3.084563913157735e-06, + "loss": 0.6042, + "step": 9658 + }, + { + "epoch": 2.5641842559405283, + "grad_norm": 0.44382083957879875, + "learning_rate": 3.0842244578684406e-06, + "loss": 0.6009, + "step": 9659 + }, + { + "epoch": 2.564449754413912, + "grad_norm": 0.46253761886490485, + "learning_rate": 3.083884991185338e-06, + "loss": 0.6028, + "step": 9660 + }, + { + "epoch": 2.5647152528872956, + "grad_norm": 0.43500935516070693, + "learning_rate": 3.0835455131150487e-06, + "loss": 0.5711, + "step": 9661 + }, + { + "epoch": 2.5649807513606797, + "grad_norm": 0.4594670650746794, + "learning_rate": 3.0832060236641936e-06, + "loss": 0.573, + "step": 9662 + }, + { + "epoch": 2.5652462498340634, + "grad_norm": 0.47308726852966054, + "learning_rate": 3.0828665228393928e-06, + "loss": 0.5986, + "step": 9663 + }, + { + "epoch": 2.565511748307447, + "grad_norm": 0.443394762718553, + "learning_rate": 3.0825270106472683e-06, + "loss": 0.5673, + "step": 9664 + }, + { + "epoch": 2.565777246780831, + "grad_norm": 0.4332601791833904, + "learning_rate": 3.0821874870944403e-06, + "loss": 0.5723, + "step": 9665 + }, + { + "epoch": 2.5660427452542147, + "grad_norm": 0.4371094025214304, + "learning_rate": 3.0818479521875307e-06, + "loss": 0.5766, + "step": 9666 + }, + { + "epoch": 2.5663082437275984, + "grad_norm": 0.4567428164942645, + "learning_rate": 3.0815084059331617e-06, + "loss": 0.5722, + "step": 9667 + }, + { + "epoch": 2.5665737422009824, + "grad_norm": 0.4408415113896548, + "learning_rate": 3.0811688483379546e-06, + "loss": 0.5824, + "step": 9668 + }, + { + "epoch": 2.566839240674366, + "grad_norm": 0.4325895208979403, + "learning_rate": 3.0808292794085327e-06, + "loss": 0.5682, + "step": 9669 + }, + { + "epoch": 2.5671047391477497, + "grad_norm": 0.42824672270415926, + "learning_rate": 3.0804896991515177e-06, + "loss": 0.5609, + "step": 9670 + }, + { + "epoch": 2.567370237621134, + "grad_norm": 0.4385187801553421, + "learning_rate": 3.080150107573531e-06, + "loss": 0.5439, + "step": 9671 + }, + { + "epoch": 2.5676357360945175, + "grad_norm": 0.43241807837773755, + "learning_rate": 3.0798105046811976e-06, + "loss": 0.5753, + "step": 9672 + }, + { + "epoch": 2.567901234567901, + "grad_norm": 0.45119715017116857, + "learning_rate": 3.07947089048114e-06, + "loss": 0.5653, + "step": 9673 + }, + { + "epoch": 2.568166733041285, + "grad_norm": 0.44053305754524424, + "learning_rate": 3.079131264979981e-06, + "loss": 0.5715, + "step": 9674 + }, + { + "epoch": 2.568432231514669, + "grad_norm": 0.43547949527770335, + "learning_rate": 3.078791628184344e-06, + "loss": 0.5702, + "step": 9675 + }, + { + "epoch": 2.5686977299880525, + "grad_norm": 0.4559303012637625, + "learning_rate": 3.0784519801008546e-06, + "loss": 0.5677, + "step": 9676 + }, + { + "epoch": 2.5689632284614365, + "grad_norm": 0.44548355383752986, + "learning_rate": 3.078112320736134e-06, + "loss": 0.5979, + "step": 9677 + }, + { + "epoch": 2.56922872693482, + "grad_norm": 0.43308078305629577, + "learning_rate": 3.0777726500968075e-06, + "loss": 0.5991, + "step": 9678 + }, + { + "epoch": 2.569494225408204, + "grad_norm": 0.4369511103644772, + "learning_rate": 3.077432968189501e-06, + "loss": 0.579, + "step": 9679 + }, + { + "epoch": 2.569759723881588, + "grad_norm": 0.45597989379802933, + "learning_rate": 3.0770932750208366e-06, + "loss": 0.5812, + "step": 9680 + }, + { + "epoch": 2.5700252223549716, + "grad_norm": 0.41739150056029206, + "learning_rate": 3.07675357059744e-06, + "loss": 0.5326, + "step": 9681 + }, + { + "epoch": 2.570290720828355, + "grad_norm": 0.43937655926575614, + "learning_rate": 3.0764138549259374e-06, + "loss": 0.5512, + "step": 9682 + }, + { + "epoch": 2.5705562193017393, + "grad_norm": 0.4503578024386901, + "learning_rate": 3.0760741280129535e-06, + "loss": 0.5601, + "step": 9683 + }, + { + "epoch": 2.570821717775123, + "grad_norm": 0.43065582273140096, + "learning_rate": 3.075734389865113e-06, + "loss": 0.5866, + "step": 9684 + }, + { + "epoch": 2.5710872162485066, + "grad_norm": 0.43642136230903655, + "learning_rate": 3.0753946404890432e-06, + "loss": 0.6166, + "step": 9685 + }, + { + "epoch": 2.57135271472189, + "grad_norm": 0.4387966750656058, + "learning_rate": 3.0750548798913687e-06, + "loss": 0.5479, + "step": 9686 + }, + { + "epoch": 2.5716182131952743, + "grad_norm": 0.4426165277122058, + "learning_rate": 3.0747151080787158e-06, + "loss": 0.5327, + "step": 9687 + }, + { + "epoch": 2.571883711668658, + "grad_norm": 0.4308996680887213, + "learning_rate": 3.0743753250577112e-06, + "loss": 0.5462, + "step": 9688 + }, + { + "epoch": 2.5721492101420416, + "grad_norm": 0.4329813988437253, + "learning_rate": 3.074035530834982e-06, + "loss": 0.5112, + "step": 9689 + }, + { + "epoch": 2.572414708615425, + "grad_norm": 0.42706876476087124, + "learning_rate": 3.0736957254171535e-06, + "loss": 0.5396, + "step": 9690 + }, + { + "epoch": 2.5726802070888093, + "grad_norm": 0.44080151752434044, + "learning_rate": 3.0733559088108543e-06, + "loss": 0.5276, + "step": 9691 + }, + { + "epoch": 2.572945705562193, + "grad_norm": 0.4294161573220226, + "learning_rate": 3.0730160810227105e-06, + "loss": 0.5576, + "step": 9692 + }, + { + "epoch": 2.5732112040355766, + "grad_norm": 0.4376772241817617, + "learning_rate": 3.0726762420593514e-06, + "loss": 0.5465, + "step": 9693 + }, + { + "epoch": 2.5734767025089607, + "grad_norm": 0.4398951307319967, + "learning_rate": 3.0723363919274026e-06, + "loss": 0.5903, + "step": 9694 + }, + { + "epoch": 2.5737422009823443, + "grad_norm": 0.435528661039307, + "learning_rate": 3.0719965306334924e-06, + "loss": 0.5722, + "step": 9695 + }, + { + "epoch": 2.574007699455728, + "grad_norm": 0.4390306049659235, + "learning_rate": 3.0716566581842507e-06, + "loss": 0.5426, + "step": 9696 + }, + { + "epoch": 2.574273197929112, + "grad_norm": 0.43547529734439866, + "learning_rate": 3.0713167745863033e-06, + "loss": 0.5172, + "step": 9697 + }, + { + "epoch": 2.5745386964024957, + "grad_norm": 0.43690288710044933, + "learning_rate": 3.0709768798462804e-06, + "loss": 0.5917, + "step": 9698 + }, + { + "epoch": 2.5748041948758793, + "grad_norm": 0.4297154448698437, + "learning_rate": 3.07063697397081e-06, + "loss": 0.5535, + "step": 9699 + }, + { + "epoch": 2.5750696933492634, + "grad_norm": 0.4229441284386931, + "learning_rate": 3.0702970569665213e-06, + "loss": 0.543, + "step": 9700 + }, + { + "epoch": 2.575335191822647, + "grad_norm": 0.44940251564345524, + "learning_rate": 3.069957128840044e-06, + "loss": 0.5527, + "step": 9701 + }, + { + "epoch": 2.5756006902960307, + "grad_norm": 0.45742016478394915, + "learning_rate": 3.0696171895980076e-06, + "loss": 0.5737, + "step": 9702 + }, + { + "epoch": 2.5758661887694148, + "grad_norm": 0.445479816965706, + "learning_rate": 3.069277239247041e-06, + "loss": 0.5392, + "step": 9703 + }, + { + "epoch": 2.5761316872427984, + "grad_norm": 0.440622712134827, + "learning_rate": 3.0689372777937745e-06, + "loss": 0.5898, + "step": 9704 + }, + { + "epoch": 2.576397185716182, + "grad_norm": 0.43273044340065625, + "learning_rate": 3.0685973052448383e-06, + "loss": 0.5489, + "step": 9705 + }, + { + "epoch": 2.576662684189566, + "grad_norm": 0.44018325564738436, + "learning_rate": 3.068257321606862e-06, + "loss": 0.6019, + "step": 9706 + }, + { + "epoch": 2.5769281826629498, + "grad_norm": 0.43263978548441684, + "learning_rate": 3.0679173268864766e-06, + "loss": 0.5657, + "step": 9707 + }, + { + "epoch": 2.5771936811363334, + "grad_norm": 0.44134116260728956, + "learning_rate": 3.0675773210903126e-06, + "loss": 0.5426, + "step": 9708 + }, + { + "epoch": 2.5774591796097175, + "grad_norm": 0.44403198567368973, + "learning_rate": 3.0672373042250014e-06, + "loss": 0.5841, + "step": 9709 + }, + { + "epoch": 2.577724678083101, + "grad_norm": 0.426394921965446, + "learning_rate": 3.066897276297174e-06, + "loss": 0.5746, + "step": 9710 + }, + { + "epoch": 2.5779901765564848, + "grad_norm": 0.4371630480201046, + "learning_rate": 3.066557237313462e-06, + "loss": 0.568, + "step": 9711 + }, + { + "epoch": 2.578255675029869, + "grad_norm": 0.4413669250540348, + "learning_rate": 3.066217187280497e-06, + "loss": 0.5748, + "step": 9712 + }, + { + "epoch": 2.5785211735032525, + "grad_norm": 0.440982396462472, + "learning_rate": 3.0658771262049104e-06, + "loss": 0.5373, + "step": 9713 + }, + { + "epoch": 2.578786671976636, + "grad_norm": 0.442487642415627, + "learning_rate": 3.0655370540933334e-06, + "loss": 0.6338, + "step": 9714 + }, + { + "epoch": 2.5790521704500198, + "grad_norm": 0.44948528578801356, + "learning_rate": 3.0651969709523997e-06, + "loss": 0.5582, + "step": 9715 + }, + { + "epoch": 2.5793176689234034, + "grad_norm": 0.43941225770394476, + "learning_rate": 3.064856876788741e-06, + "loss": 0.5928, + "step": 9716 + }, + { + "epoch": 2.5795831673967875, + "grad_norm": 0.43369359329392015, + "learning_rate": 3.06451677160899e-06, + "loss": 0.5789, + "step": 9717 + }, + { + "epoch": 2.579848665870171, + "grad_norm": 0.4442134752303508, + "learning_rate": 3.0641766554197806e-06, + "loss": 0.5864, + "step": 9718 + }, + { + "epoch": 2.580114164343555, + "grad_norm": 0.43899568244259546, + "learning_rate": 3.063836528227745e-06, + "loss": 0.5991, + "step": 9719 + }, + { + "epoch": 2.580379662816939, + "grad_norm": 0.4230559718199393, + "learning_rate": 3.063496390039516e-06, + "loss": 0.5607, + "step": 9720 + }, + { + "epoch": 2.5806451612903225, + "grad_norm": 0.4252969025278546, + "learning_rate": 3.0631562408617275e-06, + "loss": 0.591, + "step": 9721 + }, + { + "epoch": 2.580910659763706, + "grad_norm": 0.4560028926384697, + "learning_rate": 3.062816080701015e-06, + "loss": 0.5999, + "step": 9722 + }, + { + "epoch": 2.5811761582370902, + "grad_norm": 0.43833207953848735, + "learning_rate": 3.0624759095640092e-06, + "loss": 0.5603, + "step": 9723 + }, + { + "epoch": 2.581441656710474, + "grad_norm": 0.4363813445482862, + "learning_rate": 3.062135727457347e-06, + "loss": 0.5724, + "step": 9724 + }, + { + "epoch": 2.5817071551838575, + "grad_norm": 0.4326799520006999, + "learning_rate": 3.0617955343876617e-06, + "loss": 0.5709, + "step": 9725 + }, + { + "epoch": 2.5819726536572416, + "grad_norm": 0.4270160638351923, + "learning_rate": 3.0614553303615876e-06, + "loss": 0.5714, + "step": 9726 + }, + { + "epoch": 2.5822381521306252, + "grad_norm": 0.4422786158529615, + "learning_rate": 3.0611151153857592e-06, + "loss": 0.5579, + "step": 9727 + }, + { + "epoch": 2.582503650604009, + "grad_norm": 0.45552364147327573, + "learning_rate": 3.0607748894668133e-06, + "loss": 0.5972, + "step": 9728 + }, + { + "epoch": 2.582769149077393, + "grad_norm": 0.4452913847497938, + "learning_rate": 3.0604346526113836e-06, + "loss": 0.5267, + "step": 9729 + }, + { + "epoch": 2.5830346475507766, + "grad_norm": 0.43300755692225423, + "learning_rate": 3.060094404826106e-06, + "loss": 0.5571, + "step": 9730 + }, + { + "epoch": 2.5833001460241602, + "grad_norm": 0.4310298096754106, + "learning_rate": 3.0597541461176167e-06, + "loss": 0.5457, + "step": 9731 + }, + { + "epoch": 2.5835656444975443, + "grad_norm": 0.4445485083975476, + "learning_rate": 3.0594138764925507e-06, + "loss": 0.5489, + "step": 9732 + }, + { + "epoch": 2.583831142970928, + "grad_norm": 0.44137021353093164, + "learning_rate": 3.059073595957544e-06, + "loss": 0.5753, + "step": 9733 + }, + { + "epoch": 2.5840966414443116, + "grad_norm": 0.43509391341790743, + "learning_rate": 3.0587333045192343e-06, + "loss": 0.6055, + "step": 9734 + }, + { + "epoch": 2.5843621399176957, + "grad_norm": 0.4208525086294821, + "learning_rate": 3.058393002184256e-06, + "loss": 0.5489, + "step": 9735 + }, + { + "epoch": 2.5846276383910793, + "grad_norm": 0.4381786107640345, + "learning_rate": 3.0580526889592478e-06, + "loss": 0.5714, + "step": 9736 + }, + { + "epoch": 2.584893136864463, + "grad_norm": 0.44078837796859505, + "learning_rate": 3.057712364850845e-06, + "loss": 0.5843, + "step": 9737 + }, + { + "epoch": 2.585158635337847, + "grad_norm": 0.4356203587026822, + "learning_rate": 3.057372029865686e-06, + "loss": 0.5687, + "step": 9738 + }, + { + "epoch": 2.5854241338112307, + "grad_norm": 0.4287284696172287, + "learning_rate": 3.057031684010408e-06, + "loss": 0.5641, + "step": 9739 + }, + { + "epoch": 2.5856896322846143, + "grad_norm": 0.4347329717106706, + "learning_rate": 3.056691327291648e-06, + "loss": 0.5646, + "step": 9740 + }, + { + "epoch": 2.585955130757998, + "grad_norm": 0.4484382559259174, + "learning_rate": 3.056350959716044e-06, + "loss": 0.5585, + "step": 9741 + }, + { + "epoch": 2.586220629231382, + "grad_norm": 0.44551854589227574, + "learning_rate": 3.056010581290235e-06, + "loss": 0.5428, + "step": 9742 + }, + { + "epoch": 2.5864861277047657, + "grad_norm": 0.4485386527591479, + "learning_rate": 3.0556701920208574e-06, + "loss": 0.5826, + "step": 9743 + }, + { + "epoch": 2.5867516261781494, + "grad_norm": 0.43917854755312696, + "learning_rate": 3.0553297919145508e-06, + "loss": 0.5751, + "step": 9744 + }, + { + "epoch": 2.587017124651533, + "grad_norm": 0.4532500033425838, + "learning_rate": 3.0549893809779533e-06, + "loss": 0.5448, + "step": 9745 + }, + { + "epoch": 2.587282623124917, + "grad_norm": 0.4331426040609163, + "learning_rate": 3.054648959217704e-06, + "loss": 0.5561, + "step": 9746 + }, + { + "epoch": 2.5875481215983007, + "grad_norm": 0.42564462222038085, + "learning_rate": 3.054308526640442e-06, + "loss": 0.523, + "step": 9747 + }, + { + "epoch": 2.5878136200716844, + "grad_norm": 0.4373161538748567, + "learning_rate": 3.0539680832528074e-06, + "loss": 0.5852, + "step": 9748 + }, + { + "epoch": 2.5880791185450684, + "grad_norm": 0.43965705139518685, + "learning_rate": 3.053627629061438e-06, + "loss": 0.5554, + "step": 9749 + }, + { + "epoch": 2.588344617018452, + "grad_norm": 0.44356494700891186, + "learning_rate": 3.0532871640729745e-06, + "loss": 0.5399, + "step": 9750 + }, + { + "epoch": 2.5886101154918357, + "grad_norm": 0.44333843467667555, + "learning_rate": 3.052946688294057e-06, + "loss": 0.569, + "step": 9751 + }, + { + "epoch": 2.58887561396522, + "grad_norm": 0.4454987314272198, + "learning_rate": 3.052606201731325e-06, + "loss": 0.5295, + "step": 9752 + }, + { + "epoch": 2.5891411124386035, + "grad_norm": 0.43299005361834114, + "learning_rate": 3.052265704391419e-06, + "loss": 0.5116, + "step": 9753 + }, + { + "epoch": 2.589406610911987, + "grad_norm": 0.44992390350494116, + "learning_rate": 3.0519251962809793e-06, + "loss": 0.5933, + "step": 9754 + }, + { + "epoch": 2.589672109385371, + "grad_norm": 0.4475046198646384, + "learning_rate": 3.051584677406647e-06, + "loss": 0.5824, + "step": 9755 + }, + { + "epoch": 2.589937607858755, + "grad_norm": 0.45401987100159574, + "learning_rate": 3.0512441477750634e-06, + "loss": 0.563, + "step": 9756 + }, + { + "epoch": 2.5902031063321385, + "grad_norm": 0.44654322221378523, + "learning_rate": 3.0509036073928686e-06, + "loss": 0.5541, + "step": 9757 + }, + { + "epoch": 2.5904686048055225, + "grad_norm": 0.4563393408195813, + "learning_rate": 3.0505630562667056e-06, + "loss": 0.5402, + "step": 9758 + }, + { + "epoch": 2.590734103278906, + "grad_norm": 0.44925229949481094, + "learning_rate": 3.0502224944032136e-06, + "loss": 0.5672, + "step": 9759 + }, + { + "epoch": 2.59099960175229, + "grad_norm": 0.43030383881738954, + "learning_rate": 3.049881921809037e-06, + "loss": 0.5601, + "step": 9760 + }, + { + "epoch": 2.591265100225674, + "grad_norm": 0.44225812487070487, + "learning_rate": 3.0495413384908162e-06, + "loss": 0.5931, + "step": 9761 + }, + { + "epoch": 2.5915305986990576, + "grad_norm": 0.41567714851754306, + "learning_rate": 3.0492007444551935e-06, + "loss": 0.5593, + "step": 9762 + }, + { + "epoch": 2.591796097172441, + "grad_norm": 0.42934805119035097, + "learning_rate": 3.048860139708812e-06, + "loss": 0.585, + "step": 9763 + }, + { + "epoch": 2.5920615956458253, + "grad_norm": 0.44443860220951853, + "learning_rate": 3.048519524258314e-06, + "loss": 0.5669, + "step": 9764 + }, + { + "epoch": 2.592327094119209, + "grad_norm": 0.4370753854806174, + "learning_rate": 3.0481788981103423e-06, + "loss": 0.5085, + "step": 9765 + }, + { + "epoch": 2.5925925925925926, + "grad_norm": 0.4858166900980427, + "learning_rate": 3.0478382612715398e-06, + "loss": 0.5702, + "step": 9766 + }, + { + "epoch": 2.5928580910659766, + "grad_norm": 0.43261493530951145, + "learning_rate": 3.047497613748549e-06, + "loss": 0.5465, + "step": 9767 + }, + { + "epoch": 2.5931235895393603, + "grad_norm": 0.42241666580234893, + "learning_rate": 3.047156955548015e-06, + "loss": 0.5495, + "step": 9768 + }, + { + "epoch": 2.593389088012744, + "grad_norm": 0.4306741881923886, + "learning_rate": 3.046816286676581e-06, + "loss": 0.5557, + "step": 9769 + }, + { + "epoch": 2.5936545864861276, + "grad_norm": 0.4313434566745016, + "learning_rate": 3.04647560714089e-06, + "loss": 0.5683, + "step": 9770 + }, + { + "epoch": 2.593920084959511, + "grad_norm": 0.4396761462753513, + "learning_rate": 3.046134916947587e-06, + "loss": 0.5858, + "step": 9771 + }, + { + "epoch": 2.5941855834328953, + "grad_norm": 0.42664954869505867, + "learning_rate": 3.0457942161033155e-06, + "loss": 0.5811, + "step": 9772 + }, + { + "epoch": 2.594451081906279, + "grad_norm": 0.44056989475907066, + "learning_rate": 3.0454535046147204e-06, + "loss": 0.5482, + "step": 9773 + }, + { + "epoch": 2.5947165803796626, + "grad_norm": 0.4340837263558321, + "learning_rate": 3.0451127824884473e-06, + "loss": 0.5506, + "step": 9774 + }, + { + "epoch": 2.5949820788530467, + "grad_norm": 0.4373848539519573, + "learning_rate": 3.04477204973114e-06, + "loss": 0.5762, + "step": 9775 + }, + { + "epoch": 2.5952475773264303, + "grad_norm": 0.44704426977221406, + "learning_rate": 3.0444313063494433e-06, + "loss": 0.5532, + "step": 9776 + }, + { + "epoch": 2.595513075799814, + "grad_norm": 0.43485027470875753, + "learning_rate": 3.044090552350004e-06, + "loss": 0.5371, + "step": 9777 + }, + { + "epoch": 2.595778574273198, + "grad_norm": 0.4376157385527146, + "learning_rate": 3.043749787739466e-06, + "loss": 0.5828, + "step": 9778 + }, + { + "epoch": 2.5960440727465817, + "grad_norm": 0.4499818976277327, + "learning_rate": 3.0434090125244765e-06, + "loss": 0.5527, + "step": 9779 + }, + { + "epoch": 2.5963095712199653, + "grad_norm": 0.4281069380411903, + "learning_rate": 3.0430682267116808e-06, + "loss": 0.5657, + "step": 9780 + }, + { + "epoch": 2.5965750696933494, + "grad_norm": 0.42514294574719663, + "learning_rate": 3.0427274303077244e-06, + "loss": 0.5621, + "step": 9781 + }, + { + "epoch": 2.596840568166733, + "grad_norm": 0.4447212077327684, + "learning_rate": 3.0423866233192545e-06, + "loss": 0.5631, + "step": 9782 + }, + { + "epoch": 2.5971060666401167, + "grad_norm": 0.4460502603504364, + "learning_rate": 3.0420458057529173e-06, + "loss": 0.5577, + "step": 9783 + }, + { + "epoch": 2.5973715651135008, + "grad_norm": 0.4428713739205099, + "learning_rate": 3.0417049776153595e-06, + "loss": 0.5864, + "step": 9784 + }, + { + "epoch": 2.5976370635868844, + "grad_norm": 0.4451558243609404, + "learning_rate": 3.0413641389132294e-06, + "loss": 0.5553, + "step": 9785 + }, + { + "epoch": 2.597902562060268, + "grad_norm": 0.43309212325519936, + "learning_rate": 3.0410232896531724e-06, + "loss": 0.5461, + "step": 9786 + }, + { + "epoch": 2.598168060533652, + "grad_norm": 0.44156694273279407, + "learning_rate": 3.0406824298418356e-06, + "loss": 0.5932, + "step": 9787 + }, + { + "epoch": 2.5984335590070358, + "grad_norm": 0.4360336364953369, + "learning_rate": 3.040341559485869e-06, + "loss": 0.5198, + "step": 9788 + }, + { + "epoch": 2.5986990574804194, + "grad_norm": 0.4279731277995889, + "learning_rate": 3.040000678591919e-06, + "loss": 0.5787, + "step": 9789 + }, + { + "epoch": 2.5989645559538035, + "grad_norm": 0.4561929037477712, + "learning_rate": 3.039659787166633e-06, + "loss": 0.5824, + "step": 9790 + }, + { + "epoch": 2.599230054427187, + "grad_norm": 0.4506964886677526, + "learning_rate": 3.03931888521666e-06, + "loss": 0.581, + "step": 9791 + }, + { + "epoch": 2.5994955529005708, + "grad_norm": 0.4435214542838103, + "learning_rate": 3.0389779727486477e-06, + "loss": 0.5998, + "step": 9792 + }, + { + "epoch": 2.599761051373955, + "grad_norm": 0.45674711032145454, + "learning_rate": 3.038637049769246e-06, + "loss": 0.5544, + "step": 9793 + }, + { + "epoch": 2.6000265498473385, + "grad_norm": 0.4393354078245991, + "learning_rate": 3.0382961162851033e-06, + "loss": 0.548, + "step": 9794 + }, + { + "epoch": 2.600292048320722, + "grad_norm": 0.44694471978942, + "learning_rate": 3.0379551723028685e-06, + "loss": 0.549, + "step": 9795 + }, + { + "epoch": 2.600557546794106, + "grad_norm": 0.44017410469128426, + "learning_rate": 3.0376142178291895e-06, + "loss": 0.5687, + "step": 9796 + }, + { + "epoch": 2.60082304526749, + "grad_norm": 0.42482843852284996, + "learning_rate": 3.037273252870718e-06, + "loss": 0.5865, + "step": 9797 + }, + { + "epoch": 2.6010885437408735, + "grad_norm": 0.42901528053754867, + "learning_rate": 3.036932277434102e-06, + "loss": 0.5269, + "step": 9798 + }, + { + "epoch": 2.601354042214257, + "grad_norm": 0.44591400173005213, + "learning_rate": 3.036591291525992e-06, + "loss": 0.5873, + "step": 9799 + }, + { + "epoch": 2.601619540687641, + "grad_norm": 0.4385822464012712, + "learning_rate": 3.036250295153038e-06, + "loss": 0.5375, + "step": 9800 + }, + { + "epoch": 2.601885039161025, + "grad_norm": 0.4402534968600166, + "learning_rate": 3.0359092883218906e-06, + "loss": 0.5771, + "step": 9801 + }, + { + "epoch": 2.6021505376344085, + "grad_norm": 0.43833730244716207, + "learning_rate": 3.0355682710392e-06, + "loss": 0.585, + "step": 9802 + }, + { + "epoch": 2.602416036107792, + "grad_norm": 0.43479937801590046, + "learning_rate": 3.0352272433116167e-06, + "loss": 0.5517, + "step": 9803 + }, + { + "epoch": 2.6026815345811762, + "grad_norm": 0.41718724280708447, + "learning_rate": 3.0348862051457915e-06, + "loss": 0.545, + "step": 9804 + }, + { + "epoch": 2.60294703305456, + "grad_norm": 0.4371491257352857, + "learning_rate": 3.0345451565483757e-06, + "loss": 0.5796, + "step": 9805 + }, + { + "epoch": 2.6032125315279435, + "grad_norm": 0.44529406113816433, + "learning_rate": 3.034204097526021e-06, + "loss": 0.5476, + "step": 9806 + }, + { + "epoch": 2.6034780300013276, + "grad_norm": 0.4456608461963751, + "learning_rate": 3.033863028085378e-06, + "loss": 0.5608, + "step": 9807 + }, + { + "epoch": 2.6037435284747112, + "grad_norm": 0.4442525025244097, + "learning_rate": 3.0335219482330986e-06, + "loss": 0.5009, + "step": 9808 + }, + { + "epoch": 2.604009026948095, + "grad_norm": 0.43778616921276803, + "learning_rate": 3.0331808579758353e-06, + "loss": 0.5675, + "step": 9809 + }, + { + "epoch": 2.604274525421479, + "grad_norm": 0.43374274097977733, + "learning_rate": 3.0328397573202396e-06, + "loss": 0.5877, + "step": 9810 + }, + { + "epoch": 2.6045400238948626, + "grad_norm": 0.4412803474152078, + "learning_rate": 3.0324986462729646e-06, + "loss": 0.5445, + "step": 9811 + }, + { + "epoch": 2.6048055223682463, + "grad_norm": 0.4473201748434038, + "learning_rate": 3.0321575248406616e-06, + "loss": 0.5457, + "step": 9812 + }, + { + "epoch": 2.6050710208416303, + "grad_norm": 0.4383248526898061, + "learning_rate": 3.0318163930299837e-06, + "loss": 0.5869, + "step": 9813 + }, + { + "epoch": 2.605336519315014, + "grad_norm": 0.45294481003544385, + "learning_rate": 3.031475250847585e-06, + "loss": 0.5766, + "step": 9814 + }, + { + "epoch": 2.6056020177883976, + "grad_norm": 0.4382048548341572, + "learning_rate": 3.031134098300117e-06, + "loss": 0.5716, + "step": 9815 + }, + { + "epoch": 2.6058675162617817, + "grad_norm": 0.44171566858782185, + "learning_rate": 3.0307929353942334e-06, + "loss": 0.5787, + "step": 9816 + }, + { + "epoch": 2.6061330147351653, + "grad_norm": 0.4505579968909563, + "learning_rate": 3.0304517621365885e-06, + "loss": 0.598, + "step": 9817 + }, + { + "epoch": 2.606398513208549, + "grad_norm": 0.44601775398021676, + "learning_rate": 3.0301105785338345e-06, + "loss": 0.5794, + "step": 9818 + }, + { + "epoch": 2.606664011681933, + "grad_norm": 0.43829884704682603, + "learning_rate": 3.0297693845926275e-06, + "loss": 0.5793, + "step": 9819 + }, + { + "epoch": 2.6069295101553167, + "grad_norm": 0.42658114366263344, + "learning_rate": 3.02942818031962e-06, + "loss": 0.5501, + "step": 9820 + }, + { + "epoch": 2.6071950086287004, + "grad_norm": 0.43301045301465824, + "learning_rate": 3.0290869657214666e-06, + "loss": 0.5797, + "step": 9821 + }, + { + "epoch": 2.6074605071020844, + "grad_norm": 0.4402593412152837, + "learning_rate": 3.0287457408048217e-06, + "loss": 0.5794, + "step": 9822 + }, + { + "epoch": 2.607726005575468, + "grad_norm": 0.42525719379358584, + "learning_rate": 3.028404505576341e-06, + "loss": 0.5524, + "step": 9823 + }, + { + "epoch": 2.6079915040488517, + "grad_norm": 0.42923808583665674, + "learning_rate": 3.0280632600426777e-06, + "loss": 0.5587, + "step": 9824 + }, + { + "epoch": 2.6082570025222354, + "grad_norm": 0.42587532680181983, + "learning_rate": 3.027722004210489e-06, + "loss": 0.5202, + "step": 9825 + }, + { + "epoch": 2.608522500995619, + "grad_norm": 0.4363099031173716, + "learning_rate": 3.0273807380864285e-06, + "loss": 0.5541, + "step": 9826 + }, + { + "epoch": 2.608787999469003, + "grad_norm": 0.4372428248122534, + "learning_rate": 3.0270394616771527e-06, + "loss": 0.5341, + "step": 9827 + }, + { + "epoch": 2.6090534979423867, + "grad_norm": 0.42776404571600163, + "learning_rate": 3.026698174989316e-06, + "loss": 0.5203, + "step": 9828 + }, + { + "epoch": 2.6093189964157704, + "grad_norm": 0.44432985489600485, + "learning_rate": 3.026356878029576e-06, + "loss": 0.6067, + "step": 9829 + }, + { + "epoch": 2.6095844948891544, + "grad_norm": 0.4390615134331211, + "learning_rate": 3.026015570804588e-06, + "loss": 0.5918, + "step": 9830 + }, + { + "epoch": 2.609849993362538, + "grad_norm": 0.43020925799764337, + "learning_rate": 3.0256742533210087e-06, + "loss": 0.5758, + "step": 9831 + }, + { + "epoch": 2.6101154918359217, + "grad_norm": 0.43965596014813296, + "learning_rate": 3.0253329255854946e-06, + "loss": 0.5493, + "step": 9832 + }, + { + "epoch": 2.610380990309306, + "grad_norm": 0.44824104122635183, + "learning_rate": 3.024991587604702e-06, + "loss": 0.568, + "step": 9833 + }, + { + "epoch": 2.6106464887826895, + "grad_norm": 0.4342521722757152, + "learning_rate": 3.0246502393852882e-06, + "loss": 0.5497, + "step": 9834 + }, + { + "epoch": 2.610911987256073, + "grad_norm": 0.42379865502264136, + "learning_rate": 3.0243088809339093e-06, + "loss": 0.5263, + "step": 9835 + }, + { + "epoch": 2.611177485729457, + "grad_norm": 0.4282984012286767, + "learning_rate": 3.0239675122572245e-06, + "loss": 0.5882, + "step": 9836 + }, + { + "epoch": 2.611442984202841, + "grad_norm": 0.44381237227313886, + "learning_rate": 3.0236261333618897e-06, + "loss": 0.603, + "step": 9837 + }, + { + "epoch": 2.6117084826762245, + "grad_norm": 0.4298498609582854, + "learning_rate": 3.023284744254563e-06, + "loss": 0.5518, + "step": 9838 + }, + { + "epoch": 2.6119739811496085, + "grad_norm": 0.43593392010590215, + "learning_rate": 3.022943344941903e-06, + "loss": 0.5583, + "step": 9839 + }, + { + "epoch": 2.612239479622992, + "grad_norm": 0.4371588110601602, + "learning_rate": 3.022601935430568e-06, + "loss": 0.5702, + "step": 9840 + }, + { + "epoch": 2.612504978096376, + "grad_norm": 0.44425753489465514, + "learning_rate": 3.022260515727215e-06, + "loss": 0.525, + "step": 9841 + }, + { + "epoch": 2.61277047656976, + "grad_norm": 0.4436515239552187, + "learning_rate": 3.0219190858385032e-06, + "loss": 0.5727, + "step": 9842 + }, + { + "epoch": 2.6130359750431436, + "grad_norm": 0.4353254847410362, + "learning_rate": 3.0215776457710923e-06, + "loss": 0.5659, + "step": 9843 + }, + { + "epoch": 2.613301473516527, + "grad_norm": 0.43021916093285906, + "learning_rate": 3.0212361955316386e-06, + "loss": 0.5708, + "step": 9844 + }, + { + "epoch": 2.6135669719899113, + "grad_norm": 0.4364667903436583, + "learning_rate": 3.0208947351268036e-06, + "loss": 0.5653, + "step": 9845 + }, + { + "epoch": 2.613832470463295, + "grad_norm": 0.44241317567799704, + "learning_rate": 3.0205532645632462e-06, + "loss": 0.5937, + "step": 9846 + }, + { + "epoch": 2.6140979689366786, + "grad_norm": 0.4434230087935956, + "learning_rate": 3.0202117838476254e-06, + "loss": 0.559, + "step": 9847 + }, + { + "epoch": 2.6143634674100626, + "grad_norm": 0.44538061898001013, + "learning_rate": 3.019870292986601e-06, + "loss": 0.5901, + "step": 9848 + }, + { + "epoch": 2.6146289658834463, + "grad_norm": 0.4299221055882074, + "learning_rate": 3.019528791986833e-06, + "loss": 0.5311, + "step": 9849 + }, + { + "epoch": 2.61489446435683, + "grad_norm": 0.4356575337537641, + "learning_rate": 3.019187280854982e-06, + "loss": 0.5732, + "step": 9850 + }, + { + "epoch": 2.6151599628302136, + "grad_norm": 0.4294933456052979, + "learning_rate": 3.018845759597708e-06, + "loss": 0.5517, + "step": 9851 + }, + { + "epoch": 2.6154254613035977, + "grad_norm": 0.43726103254155024, + "learning_rate": 3.0185042282216707e-06, + "loss": 0.5451, + "step": 9852 + }, + { + "epoch": 2.6156909597769813, + "grad_norm": 0.4357481060127005, + "learning_rate": 3.018162686733532e-06, + "loss": 0.5684, + "step": 9853 + }, + { + "epoch": 2.615956458250365, + "grad_norm": 0.4550477948741028, + "learning_rate": 3.0178211351399513e-06, + "loss": 0.5524, + "step": 9854 + }, + { + "epoch": 2.6162219567237486, + "grad_norm": 0.43276500174969923, + "learning_rate": 3.017479573447591e-06, + "loss": 0.5609, + "step": 9855 + }, + { + "epoch": 2.6164874551971327, + "grad_norm": 0.4301267661100456, + "learning_rate": 3.017138001663112e-06, + "loss": 0.5535, + "step": 9856 + }, + { + "epoch": 2.6167529536705163, + "grad_norm": 0.4433054623039358, + "learning_rate": 3.016796419793176e-06, + "loss": 0.5814, + "step": 9857 + }, + { + "epoch": 2.6170184521439, + "grad_norm": 0.4455263422028563, + "learning_rate": 3.016454827844445e-06, + "loss": 0.5655, + "step": 9858 + }, + { + "epoch": 2.617283950617284, + "grad_norm": 0.44251737858832735, + "learning_rate": 3.0161132258235793e-06, + "loss": 0.569, + "step": 9859 + }, + { + "epoch": 2.6175494490906677, + "grad_norm": 0.44778610292819304, + "learning_rate": 3.015771613737243e-06, + "loss": 0.5762, + "step": 9860 + }, + { + "epoch": 2.6178149475640513, + "grad_norm": 0.4474395789662619, + "learning_rate": 3.0154299915920972e-06, + "loss": 0.5658, + "step": 9861 + }, + { + "epoch": 2.6180804460374354, + "grad_norm": 0.44436171688739035, + "learning_rate": 3.0150883593948037e-06, + "loss": 0.5503, + "step": 9862 + }, + { + "epoch": 2.618345944510819, + "grad_norm": 0.4383942255423122, + "learning_rate": 3.0147467171520272e-06, + "loss": 0.5803, + "step": 9863 + }, + { + "epoch": 2.6186114429842027, + "grad_norm": 0.4477474850048624, + "learning_rate": 3.014405064870428e-06, + "loss": 0.5573, + "step": 9864 + }, + { + "epoch": 2.6188769414575868, + "grad_norm": 0.4538311015711708, + "learning_rate": 3.014063402556672e-06, + "loss": 0.6119, + "step": 9865 + }, + { + "epoch": 2.6191424399309704, + "grad_norm": 0.434192142562641, + "learning_rate": 3.0137217302174205e-06, + "loss": 0.6023, + "step": 9866 + }, + { + "epoch": 2.619407938404354, + "grad_norm": 0.443036074110176, + "learning_rate": 3.0133800478593373e-06, + "loss": 0.6013, + "step": 9867 + }, + { + "epoch": 2.619673436877738, + "grad_norm": 0.4420928334380666, + "learning_rate": 3.013038355489086e-06, + "loss": 0.5735, + "step": 9868 + }, + { + "epoch": 2.6199389353511218, + "grad_norm": 0.4540012376124846, + "learning_rate": 3.0126966531133313e-06, + "loss": 0.5827, + "step": 9869 + }, + { + "epoch": 2.6202044338245054, + "grad_norm": 0.42553971704458887, + "learning_rate": 3.0123549407387357e-06, + "loss": 0.6071, + "step": 9870 + }, + { + "epoch": 2.6204699322978895, + "grad_norm": 0.43745212080732393, + "learning_rate": 3.0120132183719648e-06, + "loss": 0.5476, + "step": 9871 + }, + { + "epoch": 2.620735430771273, + "grad_norm": 0.44160750158320355, + "learning_rate": 3.011671486019683e-06, + "loss": 0.5545, + "step": 9872 + }, + { + "epoch": 2.6210009292446568, + "grad_norm": 0.43534641999773344, + "learning_rate": 3.0113297436885534e-06, + "loss": 0.5464, + "step": 9873 + }, + { + "epoch": 2.621266427718041, + "grad_norm": 0.42720012879130215, + "learning_rate": 3.010987991385242e-06, + "loss": 0.5709, + "step": 9874 + }, + { + "epoch": 2.6215319261914245, + "grad_norm": 0.42017140661247165, + "learning_rate": 3.0106462291164144e-06, + "loss": 0.5332, + "step": 9875 + }, + { + "epoch": 2.621797424664808, + "grad_norm": 0.43014527360916427, + "learning_rate": 3.0103044568887348e-06, + "loss": 0.5485, + "step": 9876 + }, + { + "epoch": 2.6220629231381922, + "grad_norm": 0.4502492781589114, + "learning_rate": 3.0099626747088682e-06, + "loss": 0.5359, + "step": 9877 + }, + { + "epoch": 2.622328421611576, + "grad_norm": 0.43447302548629424, + "learning_rate": 3.0096208825834817e-06, + "loss": 0.5713, + "step": 9878 + }, + { + "epoch": 2.6225939200849595, + "grad_norm": 0.43927347409848205, + "learning_rate": 3.00927908051924e-06, + "loss": 0.6004, + "step": 9879 + }, + { + "epoch": 2.622859418558343, + "grad_norm": 0.4501172378341211, + "learning_rate": 3.0089372685228093e-06, + "loss": 0.5481, + "step": 9880 + }, + { + "epoch": 2.6231249170317272, + "grad_norm": 0.4402316360607621, + "learning_rate": 3.008595446600856e-06, + "loss": 0.5646, + "step": 9881 + }, + { + "epoch": 2.623390415505111, + "grad_norm": 0.4431075994849239, + "learning_rate": 3.0082536147600454e-06, + "loss": 0.571, + "step": 9882 + }, + { + "epoch": 2.6236559139784945, + "grad_norm": 0.4293022625828644, + "learning_rate": 3.007911773007045e-06, + "loss": 0.5746, + "step": 9883 + }, + { + "epoch": 2.623921412451878, + "grad_norm": 0.44256524396095787, + "learning_rate": 3.007569921348522e-06, + "loss": 0.5678, + "step": 9884 + }, + { + "epoch": 2.6241869109252622, + "grad_norm": 0.4427629570602814, + "learning_rate": 3.0072280597911423e-06, + "loss": 0.5637, + "step": 9885 + }, + { + "epoch": 2.624452409398646, + "grad_norm": 0.46103695799888533, + "learning_rate": 3.0068861883415743e-06, + "loss": 0.5798, + "step": 9886 + }, + { + "epoch": 2.6247179078720295, + "grad_norm": 0.4471885163500397, + "learning_rate": 3.0065443070064843e-06, + "loss": 0.551, + "step": 9887 + }, + { + "epoch": 2.6249834063454136, + "grad_norm": 0.43172985904282923, + "learning_rate": 3.0062024157925396e-06, + "loss": 0.5429, + "step": 9888 + }, + { + "epoch": 2.6252489048187972, + "grad_norm": 0.4344381378988047, + "learning_rate": 3.005860514706409e-06, + "loss": 0.5442, + "step": 9889 + }, + { + "epoch": 2.625514403292181, + "grad_norm": 0.44152196416238965, + "learning_rate": 3.005518603754759e-06, + "loss": 0.539, + "step": 9890 + }, + { + "epoch": 2.625779901765565, + "grad_norm": 0.43606418948176584, + "learning_rate": 3.005176682944259e-06, + "loss": 0.5984, + "step": 9891 + }, + { + "epoch": 2.6260454002389486, + "grad_norm": 0.4551431622615924, + "learning_rate": 3.004834752281577e-06, + "loss": 0.5525, + "step": 9892 + }, + { + "epoch": 2.6263108987123323, + "grad_norm": 0.4345127376862315, + "learning_rate": 3.0044928117733805e-06, + "loss": 0.5559, + "step": 9893 + }, + { + "epoch": 2.6265763971857163, + "grad_norm": 0.43073686310602943, + "learning_rate": 3.004150861426339e-06, + "loss": 0.545, + "step": 9894 + }, + { + "epoch": 2.6268418956591, + "grad_norm": 0.4561239349515093, + "learning_rate": 3.0038089012471226e-06, + "loss": 0.6117, + "step": 9895 + }, + { + "epoch": 2.6271073941324836, + "grad_norm": 0.4543052337296326, + "learning_rate": 3.0034669312423985e-06, + "loss": 0.5521, + "step": 9896 + }, + { + "epoch": 2.6273728926058677, + "grad_norm": 0.4344483984864306, + "learning_rate": 3.003124951418836e-06, + "loss": 0.5473, + "step": 9897 + }, + { + "epoch": 2.6276383910792513, + "grad_norm": 0.437393101574257, + "learning_rate": 3.0027829617831063e-06, + "loss": 0.54, + "step": 9898 + }, + { + "epoch": 2.627903889552635, + "grad_norm": 0.42975660289869794, + "learning_rate": 3.0024409623418765e-06, + "loss": 0.5375, + "step": 9899 + }, + { + "epoch": 2.628169388026019, + "grad_norm": 0.45856132816048817, + "learning_rate": 3.002098953101818e-06, + "loss": 0.5619, + "step": 9900 + }, + { + "epoch": 2.6284348864994027, + "grad_norm": 0.4252337371629321, + "learning_rate": 3.0017569340696008e-06, + "loss": 0.5104, + "step": 9901 + }, + { + "epoch": 2.6287003849727864, + "grad_norm": 0.45033190648925175, + "learning_rate": 3.001414905251895e-06, + "loss": 0.5522, + "step": 9902 + }, + { + "epoch": 2.6289658834461704, + "grad_norm": 0.4403072129725849, + "learning_rate": 3.00107286665537e-06, + "loss": 0.5194, + "step": 9903 + }, + { + "epoch": 2.629231381919554, + "grad_norm": 0.4384666294140011, + "learning_rate": 3.0007308182866983e-06, + "loss": 0.5611, + "step": 9904 + }, + { + "epoch": 2.6294968803929377, + "grad_norm": 0.4381451154375576, + "learning_rate": 3.0003887601525494e-06, + "loss": 0.5735, + "step": 9905 + }, + { + "epoch": 2.6297623788663214, + "grad_norm": 0.4362011688325651, + "learning_rate": 3.000046692259595e-06, + "loss": 0.5649, + "step": 9906 + }, + { + "epoch": 2.6300278773397054, + "grad_norm": 0.45800339121839356, + "learning_rate": 2.999704614614505e-06, + "loss": 0.5585, + "step": 9907 + }, + { + "epoch": 2.630293375813089, + "grad_norm": 0.4407241866263677, + "learning_rate": 2.999362527223952e-06, + "loss": 0.5302, + "step": 9908 + }, + { + "epoch": 2.6305588742864727, + "grad_norm": 0.43478161686918587, + "learning_rate": 2.999020430094606e-06, + "loss": 0.5705, + "step": 9909 + }, + { + "epoch": 2.6308243727598564, + "grad_norm": 0.4353805070951268, + "learning_rate": 2.9986783232331407e-06, + "loss": 0.5492, + "step": 9910 + }, + { + "epoch": 2.6310898712332405, + "grad_norm": 0.4484856287692562, + "learning_rate": 2.9983362066462273e-06, + "loss": 0.5795, + "step": 9911 + }, + { + "epoch": 2.631355369706624, + "grad_norm": 0.4485160621100812, + "learning_rate": 2.997994080340538e-06, + "loss": 0.5957, + "step": 9912 + }, + { + "epoch": 2.6316208681800077, + "grad_norm": 0.4440385071848229, + "learning_rate": 2.9976519443227436e-06, + "loss": 0.5788, + "step": 9913 + }, + { + "epoch": 2.631886366653392, + "grad_norm": 0.4427672527353162, + "learning_rate": 2.9973097985995187e-06, + "loss": 0.5546, + "step": 9914 + }, + { + "epoch": 2.6321518651267755, + "grad_norm": 0.4283156233381252, + "learning_rate": 2.996967643177535e-06, + "loss": 0.5464, + "step": 9915 + }, + { + "epoch": 2.632417363600159, + "grad_norm": 0.4425679994489716, + "learning_rate": 2.9966254780634658e-06, + "loss": 0.5757, + "step": 9916 + }, + { + "epoch": 2.632682862073543, + "grad_norm": 0.4346918168905528, + "learning_rate": 2.996283303263983e-06, + "loss": 0.5564, + "step": 9917 + }, + { + "epoch": 2.632948360546927, + "grad_norm": 0.4349281578625761, + "learning_rate": 2.995941118785762e-06, + "loss": 0.5219, + "step": 9918 + }, + { + "epoch": 2.6332138590203105, + "grad_norm": 0.4341829099844109, + "learning_rate": 2.9955989246354734e-06, + "loss": 0.5892, + "step": 9919 + }, + { + "epoch": 2.6334793574936946, + "grad_norm": 0.4121911594040556, + "learning_rate": 2.9952567208197924e-06, + "loss": 0.5156, + "step": 9920 + }, + { + "epoch": 2.633744855967078, + "grad_norm": 0.43825855334069336, + "learning_rate": 2.9949145073453935e-06, + "loss": 0.5512, + "step": 9921 + }, + { + "epoch": 2.634010354440462, + "grad_norm": 0.4407661964039348, + "learning_rate": 2.9945722842189495e-06, + "loss": 0.5996, + "step": 9922 + }, + { + "epoch": 2.634275852913846, + "grad_norm": 0.4497807192797199, + "learning_rate": 2.9942300514471355e-06, + "loss": 0.5732, + "step": 9923 + }, + { + "epoch": 2.6345413513872296, + "grad_norm": 0.43875712577908155, + "learning_rate": 2.9938878090366253e-06, + "loss": 0.597, + "step": 9924 + }, + { + "epoch": 2.634806849860613, + "grad_norm": 0.44104759708540336, + "learning_rate": 2.9935455569940934e-06, + "loss": 0.5857, + "step": 9925 + }, + { + "epoch": 2.6350723483339973, + "grad_norm": 0.44576688880128146, + "learning_rate": 2.9932032953262146e-06, + "loss": 0.5535, + "step": 9926 + }, + { + "epoch": 2.635337846807381, + "grad_norm": 0.4306513806485075, + "learning_rate": 2.992861024039665e-06, + "loss": 0.5685, + "step": 9927 + }, + { + "epoch": 2.6356033452807646, + "grad_norm": 0.4595332475907878, + "learning_rate": 2.9925187431411174e-06, + "loss": 0.5604, + "step": 9928 + }, + { + "epoch": 2.6358688437541486, + "grad_norm": 0.429074626196928, + "learning_rate": 2.9921764526372488e-06, + "loss": 0.5458, + "step": 9929 + }, + { + "epoch": 2.6361343422275323, + "grad_norm": 0.45375906708898867, + "learning_rate": 2.9918341525347345e-06, + "loss": 0.5887, + "step": 9930 + }, + { + "epoch": 2.636399840700916, + "grad_norm": 0.43517054361955426, + "learning_rate": 2.9914918428402494e-06, + "loss": 0.4769, + "step": 9931 + }, + { + "epoch": 2.6366653391743, + "grad_norm": 0.4518620669062071, + "learning_rate": 2.991149523560471e-06, + "loss": 0.5568, + "step": 9932 + }, + { + "epoch": 2.6369308376476837, + "grad_norm": 0.433411098323447, + "learning_rate": 2.990807194702074e-06, + "loss": 0.5756, + "step": 9933 + }, + { + "epoch": 2.6371963361210673, + "grad_norm": 0.44459924396507006, + "learning_rate": 2.990464856271735e-06, + "loss": 0.5721, + "step": 9934 + }, + { + "epoch": 2.637461834594451, + "grad_norm": 0.43190456418692275, + "learning_rate": 2.9901225082761305e-06, + "loss": 0.5586, + "step": 9935 + }, + { + "epoch": 2.637727333067835, + "grad_norm": 0.4451665915362643, + "learning_rate": 2.9897801507219367e-06, + "loss": 0.5634, + "step": 9936 + }, + { + "epoch": 2.6379928315412187, + "grad_norm": 0.4344907995497725, + "learning_rate": 2.9894377836158307e-06, + "loss": 0.5739, + "step": 9937 + }, + { + "epoch": 2.6382583300146023, + "grad_norm": 0.4261535256584437, + "learning_rate": 2.9890954069644897e-06, + "loss": 0.5434, + "step": 9938 + }, + { + "epoch": 2.638523828487986, + "grad_norm": 0.43095871116470313, + "learning_rate": 2.988753020774591e-06, + "loss": 0.5514, + "step": 9939 + }, + { + "epoch": 2.63878932696137, + "grad_norm": 0.43417960813487233, + "learning_rate": 2.988410625052811e-06, + "loss": 0.5786, + "step": 9940 + }, + { + "epoch": 2.6390548254347537, + "grad_norm": 0.45532746169128974, + "learning_rate": 2.9880682198058284e-06, + "loss": 0.5786, + "step": 9941 + }, + { + "epoch": 2.6393203239081373, + "grad_norm": 0.4370798719342273, + "learning_rate": 2.9877258050403214e-06, + "loss": 0.5647, + "step": 9942 + }, + { + "epoch": 2.6395858223815214, + "grad_norm": 0.4527565676041506, + "learning_rate": 2.987383380762966e-06, + "loss": 0.5384, + "step": 9943 + }, + { + "epoch": 2.639851320854905, + "grad_norm": 0.4591415829286425, + "learning_rate": 2.987040946980442e-06, + "loss": 0.5869, + "step": 9944 + }, + { + "epoch": 2.6401168193282887, + "grad_norm": 0.4330559800873046, + "learning_rate": 2.9866985036994268e-06, + "loss": 0.5833, + "step": 9945 + }, + { + "epoch": 2.6403823178016728, + "grad_norm": 0.44154724894431163, + "learning_rate": 2.986356050926599e-06, + "loss": 0.5853, + "step": 9946 + }, + { + "epoch": 2.6406478162750564, + "grad_norm": 0.43497758344182674, + "learning_rate": 2.986013588668637e-06, + "loss": 0.563, + "step": 9947 + }, + { + "epoch": 2.64091331474844, + "grad_norm": 0.4558119909403106, + "learning_rate": 2.9856711169322202e-06, + "loss": 0.5331, + "step": 9948 + }, + { + "epoch": 2.641178813221824, + "grad_norm": 0.4297536496844275, + "learning_rate": 2.9853286357240275e-06, + "loss": 0.5675, + "step": 9949 + }, + { + "epoch": 2.6414443116952078, + "grad_norm": 0.4471329238556466, + "learning_rate": 2.984986145050739e-06, + "loss": 0.5924, + "step": 9950 + }, + { + "epoch": 2.6417098101685914, + "grad_norm": 0.4327496016628824, + "learning_rate": 2.9846436449190326e-06, + "loss": 0.5652, + "step": 9951 + }, + { + "epoch": 2.6419753086419755, + "grad_norm": 0.4347367031128844, + "learning_rate": 2.9843011353355885e-06, + "loss": 0.588, + "step": 9952 + }, + { + "epoch": 2.642240807115359, + "grad_norm": 0.4290859707748789, + "learning_rate": 2.9839586163070867e-06, + "loss": 0.5634, + "step": 9953 + }, + { + "epoch": 2.6425063055887428, + "grad_norm": 0.4340151566882483, + "learning_rate": 2.9836160878402064e-06, + "loss": 0.5268, + "step": 9954 + }, + { + "epoch": 2.642771804062127, + "grad_norm": 0.4347428623442634, + "learning_rate": 2.9832735499416287e-06, + "loss": 0.5568, + "step": 9955 + }, + { + "epoch": 2.6430373025355105, + "grad_norm": 0.44629294872974323, + "learning_rate": 2.9829310026180336e-06, + "loss": 0.5755, + "step": 9956 + }, + { + "epoch": 2.643302801008894, + "grad_norm": 0.44350117372897757, + "learning_rate": 2.9825884458761014e-06, + "loss": 0.5902, + "step": 9957 + }, + { + "epoch": 2.6435682994822782, + "grad_norm": 0.4347294438140043, + "learning_rate": 2.982245879722513e-06, + "loss": 0.5229, + "step": 9958 + }, + { + "epoch": 2.643833797955662, + "grad_norm": 0.464309377329224, + "learning_rate": 2.981903304163949e-06, + "loss": 0.5763, + "step": 9959 + }, + { + "epoch": 2.6440992964290455, + "grad_norm": 0.43347576524715087, + "learning_rate": 2.9815607192070905e-06, + "loss": 0.5316, + "step": 9960 + }, + { + "epoch": 2.644364794902429, + "grad_norm": 0.422618593162229, + "learning_rate": 2.9812181248586195e-06, + "loss": 0.5362, + "step": 9961 + }, + { + "epoch": 2.6446302933758132, + "grad_norm": 0.47056180163125516, + "learning_rate": 2.9808755211252166e-06, + "loss": 0.5674, + "step": 9962 + }, + { + "epoch": 2.644895791849197, + "grad_norm": 0.44377808454261547, + "learning_rate": 2.9805329080135636e-06, + "loss": 0.5699, + "step": 9963 + }, + { + "epoch": 2.6451612903225805, + "grad_norm": 0.4433120148103743, + "learning_rate": 2.9801902855303427e-06, + "loss": 0.5698, + "step": 9964 + }, + { + "epoch": 2.645426788795964, + "grad_norm": 0.43503419818805555, + "learning_rate": 2.9798476536822344e-06, + "loss": 0.5714, + "step": 9965 + }, + { + "epoch": 2.6456922872693482, + "grad_norm": 0.4238848177573442, + "learning_rate": 2.979505012475923e-06, + "loss": 0.5192, + "step": 9966 + }, + { + "epoch": 2.645957785742732, + "grad_norm": 0.4245364360524084, + "learning_rate": 2.9791623619180896e-06, + "loss": 0.5587, + "step": 9967 + }, + { + "epoch": 2.6462232842161155, + "grad_norm": 0.43412496439907994, + "learning_rate": 2.978819702015417e-06, + "loss": 0.5449, + "step": 9968 + }, + { + "epoch": 2.6464887826894996, + "grad_norm": 0.43815098599682767, + "learning_rate": 2.9784770327745875e-06, + "loss": 0.5539, + "step": 9969 + }, + { + "epoch": 2.6467542811628832, + "grad_norm": 0.42078824292443223, + "learning_rate": 2.978134354202285e-06, + "loss": 0.5345, + "step": 9970 + }, + { + "epoch": 2.647019779636267, + "grad_norm": 0.44171974494035376, + "learning_rate": 2.9777916663051916e-06, + "loss": 0.5511, + "step": 9971 + }, + { + "epoch": 2.647285278109651, + "grad_norm": 0.4367177314804473, + "learning_rate": 2.977448969089991e-06, + "loss": 0.5583, + "step": 9972 + }, + { + "epoch": 2.6475507765830346, + "grad_norm": 0.43275022214441033, + "learning_rate": 2.9771062625633666e-06, + "loss": 0.5498, + "step": 9973 + }, + { + "epoch": 2.6478162750564183, + "grad_norm": 0.4413254821738734, + "learning_rate": 2.9767635467320018e-06, + "loss": 0.5433, + "step": 9974 + }, + { + "epoch": 2.6480817735298023, + "grad_norm": 0.43812595901271423, + "learning_rate": 2.9764208216025797e-06, + "loss": 0.5359, + "step": 9975 + }, + { + "epoch": 2.648347272003186, + "grad_norm": 0.43561665666414756, + "learning_rate": 2.9760780871817856e-06, + "loss": 0.5391, + "step": 9976 + }, + { + "epoch": 2.6486127704765696, + "grad_norm": 0.4404188912623667, + "learning_rate": 2.9757353434763037e-06, + "loss": 0.5586, + "step": 9977 + }, + { + "epoch": 2.6488782689499537, + "grad_norm": 0.4265068887118984, + "learning_rate": 2.9753925904928173e-06, + "loss": 0.55, + "step": 9978 + }, + { + "epoch": 2.6491437674233373, + "grad_norm": 0.4334957993371011, + "learning_rate": 2.975049828238012e-06, + "loss": 0.5738, + "step": 9979 + }, + { + "epoch": 2.649409265896721, + "grad_norm": 0.42963775415310124, + "learning_rate": 2.974707056718571e-06, + "loss": 0.5375, + "step": 9980 + }, + { + "epoch": 2.649674764370105, + "grad_norm": 0.44812608556831035, + "learning_rate": 2.974364275941181e-06, + "loss": 0.5501, + "step": 9981 + }, + { + "epoch": 2.6499402628434887, + "grad_norm": 0.42247982002093054, + "learning_rate": 2.974021485912526e-06, + "loss": 0.5337, + "step": 9982 + }, + { + "epoch": 2.6502057613168724, + "grad_norm": 0.446713280995582, + "learning_rate": 2.973678686639291e-06, + "loss": 0.5725, + "step": 9983 + }, + { + "epoch": 2.6504712597902564, + "grad_norm": 0.43818268042763436, + "learning_rate": 2.973335878128162e-06, + "loss": 0.5717, + "step": 9984 + }, + { + "epoch": 2.65073675826364, + "grad_norm": 0.428673354337855, + "learning_rate": 2.9729930603858243e-06, + "loss": 0.5328, + "step": 9985 + }, + { + "epoch": 2.6510022567370237, + "grad_norm": 0.4308792338084989, + "learning_rate": 2.972650233418964e-06, + "loss": 0.545, + "step": 9986 + }, + { + "epoch": 2.651267755210408, + "grad_norm": 0.42699121503704074, + "learning_rate": 2.972307397234267e-06, + "loss": 0.5471, + "step": 9987 + }, + { + "epoch": 2.6515332536837914, + "grad_norm": 0.43465959699226836, + "learning_rate": 2.9719645518384194e-06, + "loss": 0.5461, + "step": 9988 + }, + { + "epoch": 2.651798752157175, + "grad_norm": 0.44647737583421293, + "learning_rate": 2.9716216972381068e-06, + "loss": 0.536, + "step": 9989 + }, + { + "epoch": 2.6520642506305587, + "grad_norm": 0.43951533700001766, + "learning_rate": 2.971278833440018e-06, + "loss": 0.5792, + "step": 9990 + }, + { + "epoch": 2.652329749103943, + "grad_norm": 0.4388284095506071, + "learning_rate": 2.9709359604508365e-06, + "loss": 0.594, + "step": 9991 + }, + { + "epoch": 2.6525952475773265, + "grad_norm": 0.4476194358008474, + "learning_rate": 2.970593078277251e-06, + "loss": 0.5772, + "step": 9992 + }, + { + "epoch": 2.65286074605071, + "grad_norm": 0.4346177530092667, + "learning_rate": 2.970250186925948e-06, + "loss": 0.5122, + "step": 9993 + }, + { + "epoch": 2.6531262445240937, + "grad_norm": 0.4259194812524258, + "learning_rate": 2.9699072864036155e-06, + "loss": 0.5305, + "step": 9994 + }, + { + "epoch": 2.653391742997478, + "grad_norm": 0.43401860691983807, + "learning_rate": 2.9695643767169404e-06, + "loss": 0.5495, + "step": 9995 + }, + { + "epoch": 2.6536572414708615, + "grad_norm": 0.438111486381578, + "learning_rate": 2.9692214578726105e-06, + "loss": 0.5741, + "step": 9996 + }, + { + "epoch": 2.653922739944245, + "grad_norm": 0.4447872054942858, + "learning_rate": 2.9688785298773127e-06, + "loss": 0.5716, + "step": 9997 + }, + { + "epoch": 2.654188238417629, + "grad_norm": 0.44147823236688527, + "learning_rate": 2.9685355927377356e-06, + "loss": 0.5669, + "step": 9998 + }, + { + "epoch": 2.654453736891013, + "grad_norm": 0.4543819093558324, + "learning_rate": 2.9681926464605683e-06, + "loss": 0.5789, + "step": 9999 + }, + { + "epoch": 2.6547192353643965, + "grad_norm": 0.44005221699009817, + "learning_rate": 2.967849691052497e-06, + "loss": 0.5728, + "step": 10000 + }, + { + "epoch": 2.6549847338377806, + "grad_norm": 0.4433984396748064, + "learning_rate": 2.9675067265202116e-06, + "loss": 0.5734, + "step": 10001 + }, + { + "epoch": 2.655250232311164, + "grad_norm": 0.4435068602968665, + "learning_rate": 2.9671637528704e-06, + "loss": 0.5844, + "step": 10002 + }, + { + "epoch": 2.655515730784548, + "grad_norm": 0.44560535059295125, + "learning_rate": 2.9668207701097516e-06, + "loss": 0.5588, + "step": 10003 + }, + { + "epoch": 2.655781229257932, + "grad_norm": 0.435557242600705, + "learning_rate": 2.966477778244956e-06, + "loss": 0.5109, + "step": 10004 + }, + { + "epoch": 2.6560467277313156, + "grad_norm": 0.441992460502229, + "learning_rate": 2.9661347772827003e-06, + "loss": 0.567, + "step": 10005 + }, + { + "epoch": 2.656312226204699, + "grad_norm": 0.43085623711822096, + "learning_rate": 2.965791767229676e-06, + "loss": 0.5371, + "step": 10006 + }, + { + "epoch": 2.6565777246780833, + "grad_norm": 0.4301600965929788, + "learning_rate": 2.965448748092572e-06, + "loss": 0.5421, + "step": 10007 + }, + { + "epoch": 2.656843223151467, + "grad_norm": 0.4343535198366537, + "learning_rate": 2.965105719878077e-06, + "loss": 0.5956, + "step": 10008 + }, + { + "epoch": 2.6571087216248506, + "grad_norm": 0.43443423761620603, + "learning_rate": 2.9647626825928823e-06, + "loss": 0.5669, + "step": 10009 + }, + { + "epoch": 2.6573742200982347, + "grad_norm": 0.4391693690322364, + "learning_rate": 2.9644196362436774e-06, + "loss": 0.5928, + "step": 10010 + }, + { + "epoch": 2.6576397185716183, + "grad_norm": 0.4385278233766275, + "learning_rate": 2.9640765808371513e-06, + "loss": 0.5803, + "step": 10011 + }, + { + "epoch": 2.657905217045002, + "grad_norm": 0.4487504103378222, + "learning_rate": 2.9637335163799965e-06, + "loss": 0.5444, + "step": 10012 + }, + { + "epoch": 2.658170715518386, + "grad_norm": 0.43886768781999347, + "learning_rate": 2.9633904428789027e-06, + "loss": 0.5436, + "step": 10013 + }, + { + "epoch": 2.6584362139917697, + "grad_norm": 0.44898417845346733, + "learning_rate": 2.9630473603405607e-06, + "loss": 0.5136, + "step": 10014 + }, + { + "epoch": 2.6587017124651533, + "grad_norm": 0.4325919937780632, + "learning_rate": 2.9627042687716613e-06, + "loss": 0.5362, + "step": 10015 + }, + { + "epoch": 2.658967210938537, + "grad_norm": 0.44319090233236563, + "learning_rate": 2.9623611681788967e-06, + "loss": 0.5911, + "step": 10016 + }, + { + "epoch": 2.659232709411921, + "grad_norm": 0.44158058769615516, + "learning_rate": 2.962018058568956e-06, + "loss": 0.5371, + "step": 10017 + }, + { + "epoch": 2.6594982078853047, + "grad_norm": 0.4738186887563765, + "learning_rate": 2.9616749399485323e-06, + "loss": 0.5562, + "step": 10018 + }, + { + "epoch": 2.6597637063586883, + "grad_norm": 0.44282116896878837, + "learning_rate": 2.961331812324317e-06, + "loss": 0.5845, + "step": 10019 + }, + { + "epoch": 2.660029204832072, + "grad_norm": 0.44391927108306606, + "learning_rate": 2.960988675703002e-06, + "loss": 0.5856, + "step": 10020 + }, + { + "epoch": 2.660294703305456, + "grad_norm": 0.43767251259841244, + "learning_rate": 2.9606455300912782e-06, + "loss": 0.5998, + "step": 10021 + }, + { + "epoch": 2.6605602017788397, + "grad_norm": 0.45773285112184725, + "learning_rate": 2.96030237549584e-06, + "loss": 0.5222, + "step": 10022 + }, + { + "epoch": 2.6608257002522233, + "grad_norm": 0.4418414236532392, + "learning_rate": 2.9599592119233777e-06, + "loss": 0.572, + "step": 10023 + }, + { + "epoch": 2.6610911987256074, + "grad_norm": 0.4440806721483077, + "learning_rate": 2.959616039380585e-06, + "loss": 0.5841, + "step": 10024 + }, + { + "epoch": 2.661356697198991, + "grad_norm": 0.4393298063617331, + "learning_rate": 2.9592728578741544e-06, + "loss": 0.5641, + "step": 10025 + }, + { + "epoch": 2.6616221956723747, + "grad_norm": 0.43115786132548123, + "learning_rate": 2.9589296674107783e-06, + "loss": 0.5792, + "step": 10026 + }, + { + "epoch": 2.6618876941457588, + "grad_norm": 0.438327381260109, + "learning_rate": 2.9585864679971497e-06, + "loss": 0.5261, + "step": 10027 + }, + { + "epoch": 2.6621531926191424, + "grad_norm": 0.4371737634947093, + "learning_rate": 2.9582432596399624e-06, + "loss": 0.5566, + "step": 10028 + }, + { + "epoch": 2.662418691092526, + "grad_norm": 0.42994922358797977, + "learning_rate": 2.9579000423459097e-06, + "loss": 0.5421, + "step": 10029 + }, + { + "epoch": 2.66268418956591, + "grad_norm": 0.4280656448124467, + "learning_rate": 2.957556816121685e-06, + "loss": 0.5735, + "step": 10030 + }, + { + "epoch": 2.6629496880392938, + "grad_norm": 0.435304777121573, + "learning_rate": 2.9572135809739815e-06, + "loss": 0.5599, + "step": 10031 + }, + { + "epoch": 2.6632151865126774, + "grad_norm": 0.44844620330297785, + "learning_rate": 2.9568703369094944e-06, + "loss": 0.5264, + "step": 10032 + }, + { + "epoch": 2.6634806849860615, + "grad_norm": 0.43924250591074826, + "learning_rate": 2.956527083934917e-06, + "loss": 0.5825, + "step": 10033 + }, + { + "epoch": 2.663746183459445, + "grad_norm": 0.4279164463099742, + "learning_rate": 2.956183822056944e-06, + "loss": 0.5629, + "step": 10034 + }, + { + "epoch": 2.664011681932829, + "grad_norm": 0.44262640683113946, + "learning_rate": 2.9558405512822686e-06, + "loss": 0.5473, + "step": 10035 + }, + { + "epoch": 2.664277180406213, + "grad_norm": 0.42727783753543735, + "learning_rate": 2.9554972716175872e-06, + "loss": 0.5513, + "step": 10036 + }, + { + "epoch": 2.6645426788795965, + "grad_norm": 0.4460294998299989, + "learning_rate": 2.9551539830695936e-06, + "loss": 0.5693, + "step": 10037 + }, + { + "epoch": 2.66480817735298, + "grad_norm": 0.43635902921422853, + "learning_rate": 2.9548106856449827e-06, + "loss": 0.532, + "step": 10038 + }, + { + "epoch": 2.6650736758263642, + "grad_norm": 0.4300929186771159, + "learning_rate": 2.9544673793504495e-06, + "loss": 0.5355, + "step": 10039 + }, + { + "epoch": 2.665339174299748, + "grad_norm": 0.4498479112839448, + "learning_rate": 2.95412406419269e-06, + "loss": 0.5393, + "step": 10040 + }, + { + "epoch": 2.6656046727731315, + "grad_norm": 0.4481747122242704, + "learning_rate": 2.9537807401783995e-06, + "loss": 0.49, + "step": 10041 + }, + { + "epoch": 2.6658701712465156, + "grad_norm": 0.42922945483704783, + "learning_rate": 2.953437407314274e-06, + "loss": 0.5457, + "step": 10042 + }, + { + "epoch": 2.6661356697198992, + "grad_norm": 0.45067682881804483, + "learning_rate": 2.9530940656070085e-06, + "loss": 0.5621, + "step": 10043 + }, + { + "epoch": 2.666401168193283, + "grad_norm": 0.4423377907769823, + "learning_rate": 2.9527507150632987e-06, + "loss": 0.5418, + "step": 10044 + }, + { + "epoch": 2.6666666666666665, + "grad_norm": 0.4576771676696478, + "learning_rate": 2.9524073556898427e-06, + "loss": 0.5999, + "step": 10045 + }, + { + "epoch": 2.6669321651400506, + "grad_norm": 0.41119082720464906, + "learning_rate": 2.9520639874933344e-06, + "loss": 0.5165, + "step": 10046 + }, + { + "epoch": 2.6671976636134342, + "grad_norm": 0.4407026403123097, + "learning_rate": 2.951720610480472e-06, + "loss": 0.545, + "step": 10047 + }, + { + "epoch": 2.667463162086818, + "grad_norm": 0.43943649703725896, + "learning_rate": 2.9513772246579513e-06, + "loss": 0.5383, + "step": 10048 + }, + { + "epoch": 2.6677286605602015, + "grad_norm": 0.4452665571103192, + "learning_rate": 2.9510338300324696e-06, + "loss": 0.53, + "step": 10049 + }, + { + "epoch": 2.6679941590335856, + "grad_norm": 0.45010291812331304, + "learning_rate": 2.950690426610724e-06, + "loss": 0.5865, + "step": 10050 + }, + { + "epoch": 2.6682596575069693, + "grad_norm": 0.43953229645860004, + "learning_rate": 2.9503470143994125e-06, + "loss": 0.5756, + "step": 10051 + }, + { + "epoch": 2.668525155980353, + "grad_norm": 0.43205631043090725, + "learning_rate": 2.9500035934052307e-06, + "loss": 0.546, + "step": 10052 + }, + { + "epoch": 2.668790654453737, + "grad_norm": 0.44896614006803726, + "learning_rate": 2.949660163634878e-06, + "loss": 0.5665, + "step": 10053 + }, + { + "epoch": 2.6690561529271206, + "grad_norm": 0.4448018299809349, + "learning_rate": 2.9493167250950492e-06, + "loss": 0.568, + "step": 10054 + }, + { + "epoch": 2.6693216514005043, + "grad_norm": 0.4397218621913262, + "learning_rate": 2.9489732777924455e-06, + "loss": 0.5492, + "step": 10055 + }, + { + "epoch": 2.6695871498738883, + "grad_norm": 0.4396828568097959, + "learning_rate": 2.948629821733764e-06, + "loss": 0.5266, + "step": 10056 + }, + { + "epoch": 2.669852648347272, + "grad_norm": 0.4350291166258978, + "learning_rate": 2.9482863569257014e-06, + "loss": 0.5103, + "step": 10057 + }, + { + "epoch": 2.6701181468206556, + "grad_norm": 0.4306601019035005, + "learning_rate": 2.9479428833749573e-06, + "loss": 0.5381, + "step": 10058 + }, + { + "epoch": 2.6703836452940397, + "grad_norm": 0.4562449666651449, + "learning_rate": 2.9475994010882314e-06, + "loss": 0.5645, + "step": 10059 + }, + { + "epoch": 2.6706491437674234, + "grad_norm": 0.4349976173282006, + "learning_rate": 2.9472559100722202e-06, + "loss": 0.5502, + "step": 10060 + }, + { + "epoch": 2.670914642240807, + "grad_norm": 0.4425134905784369, + "learning_rate": 2.946912410333624e-06, + "loss": 0.5698, + "step": 10061 + }, + { + "epoch": 2.671180140714191, + "grad_norm": 0.45366306352877084, + "learning_rate": 2.9465689018791423e-06, + "loss": 0.5346, + "step": 10062 + }, + { + "epoch": 2.6714456391875747, + "grad_norm": 0.4394334199558341, + "learning_rate": 2.9462253847154727e-06, + "loss": 0.5256, + "step": 10063 + }, + { + "epoch": 2.6717111376609584, + "grad_norm": 0.4420974366829381, + "learning_rate": 2.9458818588493154e-06, + "loss": 0.5544, + "step": 10064 + }, + { + "epoch": 2.6719766361343424, + "grad_norm": 0.4644453017402358, + "learning_rate": 2.945538324287371e-06, + "loss": 0.5545, + "step": 10065 + }, + { + "epoch": 2.672242134607726, + "grad_norm": 0.4452663875066514, + "learning_rate": 2.945194781036338e-06, + "loss": 0.5679, + "step": 10066 + }, + { + "epoch": 2.6725076330811097, + "grad_norm": 0.4415150797145218, + "learning_rate": 2.9448512291029157e-06, + "loss": 0.5903, + "step": 10067 + }, + { + "epoch": 2.672773131554494, + "grad_norm": 0.4408875375158782, + "learning_rate": 2.944507668493807e-06, + "loss": 0.5524, + "step": 10068 + }, + { + "epoch": 2.6730386300278774, + "grad_norm": 0.4487720661841312, + "learning_rate": 2.9441640992157097e-06, + "loss": 0.5607, + "step": 10069 + }, + { + "epoch": 2.673304128501261, + "grad_norm": 0.4436844731695859, + "learning_rate": 2.9438205212753246e-06, + "loss": 0.5467, + "step": 10070 + }, + { + "epoch": 2.6735696269746447, + "grad_norm": 0.4476656941026381, + "learning_rate": 2.9434769346793533e-06, + "loss": 0.5551, + "step": 10071 + }, + { + "epoch": 2.673835125448029, + "grad_norm": 0.4350859738000993, + "learning_rate": 2.943133339434495e-06, + "loss": 0.5613, + "step": 10072 + }, + { + "epoch": 2.6741006239214125, + "grad_norm": 0.44007298510078674, + "learning_rate": 2.942789735547453e-06, + "loss": 0.5389, + "step": 10073 + }, + { + "epoch": 2.674366122394796, + "grad_norm": 0.46238194214742406, + "learning_rate": 2.942446123024926e-06, + "loss": 0.5482, + "step": 10074 + }, + { + "epoch": 2.6746316208681797, + "grad_norm": 0.4296175389621784, + "learning_rate": 2.9421025018736165e-06, + "loss": 0.5609, + "step": 10075 + }, + { + "epoch": 2.674897119341564, + "grad_norm": 0.43174159564041675, + "learning_rate": 2.941758872100226e-06, + "loss": 0.5265, + "step": 10076 + }, + { + "epoch": 2.6751626178149475, + "grad_norm": 0.424370489314813, + "learning_rate": 2.9414152337114556e-06, + "loss": 0.5421, + "step": 10077 + }, + { + "epoch": 2.675428116288331, + "grad_norm": 0.44090883390840724, + "learning_rate": 2.941071586714007e-06, + "loss": 0.58, + "step": 10078 + }, + { + "epoch": 2.675693614761715, + "grad_norm": 0.43899599596927275, + "learning_rate": 2.9407279311145835e-06, + "loss": 0.5541, + "step": 10079 + }, + { + "epoch": 2.675959113235099, + "grad_norm": 0.4495020794366583, + "learning_rate": 2.9403842669198855e-06, + "loss": 0.5765, + "step": 10080 + }, + { + "epoch": 2.6762246117084825, + "grad_norm": 0.43188001085835753, + "learning_rate": 2.9400405941366166e-06, + "loss": 0.5382, + "step": 10081 + }, + { + "epoch": 2.6764901101818666, + "grad_norm": 0.4250489084525347, + "learning_rate": 2.9396969127714787e-06, + "loss": 0.5347, + "step": 10082 + }, + { + "epoch": 2.67675560865525, + "grad_norm": 0.4292068204209661, + "learning_rate": 2.939353222831174e-06, + "loss": 0.5336, + "step": 10083 + }, + { + "epoch": 2.677021107128634, + "grad_norm": 0.4412137586257352, + "learning_rate": 2.939009524322406e-06, + "loss": 0.5385, + "step": 10084 + }, + { + "epoch": 2.677286605602018, + "grad_norm": 0.44353663025248385, + "learning_rate": 2.9386658172518766e-06, + "loss": 0.5456, + "step": 10085 + }, + { + "epoch": 2.6775521040754016, + "grad_norm": 0.4372829510332513, + "learning_rate": 2.938322101626291e-06, + "loss": 0.5663, + "step": 10086 + }, + { + "epoch": 2.677817602548785, + "grad_norm": 0.4398102795718385, + "learning_rate": 2.93797837745235e-06, + "loss": 0.5176, + "step": 10087 + }, + { + "epoch": 2.6780831010221693, + "grad_norm": 0.433796774199086, + "learning_rate": 2.9376346447367598e-06, + "loss": 0.5778, + "step": 10088 + }, + { + "epoch": 2.678348599495553, + "grad_norm": 0.4409864604726168, + "learning_rate": 2.9372909034862214e-06, + "loss": 0.5854, + "step": 10089 + }, + { + "epoch": 2.6786140979689366, + "grad_norm": 0.4384980528467308, + "learning_rate": 2.9369471537074394e-06, + "loss": 0.5878, + "step": 10090 + }, + { + "epoch": 2.6788795964423207, + "grad_norm": 0.444255643847712, + "learning_rate": 2.9366033954071193e-06, + "loss": 0.5684, + "step": 10091 + }, + { + "epoch": 2.6791450949157043, + "grad_norm": 0.43641165987046193, + "learning_rate": 2.936259628591963e-06, + "loss": 0.5162, + "step": 10092 + }, + { + "epoch": 2.679410593389088, + "grad_norm": 0.44951006958696466, + "learning_rate": 2.935915853268676e-06, + "loss": 0.5661, + "step": 10093 + }, + { + "epoch": 2.679676091862472, + "grad_norm": 0.4313204473907715, + "learning_rate": 2.9355720694439627e-06, + "loss": 0.582, + "step": 10094 + }, + { + "epoch": 2.6799415903358557, + "grad_norm": 0.43933106602371624, + "learning_rate": 2.9352282771245277e-06, + "loss": 0.5557, + "step": 10095 + }, + { + "epoch": 2.6802070888092393, + "grad_norm": 0.43892682172970526, + "learning_rate": 2.9348844763170754e-06, + "loss": 0.6017, + "step": 10096 + }, + { + "epoch": 2.6804725872826234, + "grad_norm": 0.435635051963991, + "learning_rate": 2.9345406670283115e-06, + "loss": 0.6104, + "step": 10097 + }, + { + "epoch": 2.680738085756007, + "grad_norm": 0.45893619179706113, + "learning_rate": 2.9341968492649397e-06, + "loss": 0.5648, + "step": 10098 + }, + { + "epoch": 2.6810035842293907, + "grad_norm": 0.43116329511951146, + "learning_rate": 2.933853023033667e-06, + "loss": 0.5956, + "step": 10099 + }, + { + "epoch": 2.6812690827027743, + "grad_norm": 0.4406655863781548, + "learning_rate": 2.9335091883411982e-06, + "loss": 0.5694, + "step": 10100 + }, + { + "epoch": 2.6815345811761584, + "grad_norm": 0.4468323208880422, + "learning_rate": 2.9331653451942387e-06, + "loss": 0.5938, + "step": 10101 + }, + { + "epoch": 2.681800079649542, + "grad_norm": 0.4427978559765397, + "learning_rate": 2.9328214935994944e-06, + "loss": 0.5688, + "step": 10102 + }, + { + "epoch": 2.6820655781229257, + "grad_norm": 0.44571194839184297, + "learning_rate": 2.9324776335636707e-06, + "loss": 0.5708, + "step": 10103 + }, + { + "epoch": 2.6823310765963093, + "grad_norm": 0.44819799115975945, + "learning_rate": 2.9321337650934746e-06, + "loss": 0.5419, + "step": 10104 + }, + { + "epoch": 2.6825965750696934, + "grad_norm": 0.43864071128824067, + "learning_rate": 2.9317898881956123e-06, + "loss": 0.5473, + "step": 10105 + }, + { + "epoch": 2.682862073543077, + "grad_norm": 0.44819354612458345, + "learning_rate": 2.93144600287679e-06, + "loss": 0.5489, + "step": 10106 + }, + { + "epoch": 2.6831275720164607, + "grad_norm": 0.43484419959974174, + "learning_rate": 2.9311021091437137e-06, + "loss": 0.5816, + "step": 10107 + }, + { + "epoch": 2.6833930704898448, + "grad_norm": 0.42980750784080285, + "learning_rate": 2.9307582070030918e-06, + "loss": 0.5638, + "step": 10108 + }, + { + "epoch": 2.6836585689632284, + "grad_norm": 0.44690628571829843, + "learning_rate": 2.930414296461629e-06, + "loss": 0.5301, + "step": 10109 + }, + { + "epoch": 2.683924067436612, + "grad_norm": 0.43977425815622667, + "learning_rate": 2.9300703775260343e-06, + "loss": 0.5495, + "step": 10110 + }, + { + "epoch": 2.684189565909996, + "grad_norm": 0.43808437722194576, + "learning_rate": 2.9297264502030144e-06, + "loss": 0.5323, + "step": 10111 + }, + { + "epoch": 2.6844550643833798, + "grad_norm": 0.42324486002843337, + "learning_rate": 2.929382514499276e-06, + "loss": 0.5259, + "step": 10112 + }, + { + "epoch": 2.6847205628567634, + "grad_norm": 0.43050158565945634, + "learning_rate": 2.929038570421526e-06, + "loss": 0.5949, + "step": 10113 + }, + { + "epoch": 2.6849860613301475, + "grad_norm": 0.4549232931473547, + "learning_rate": 2.928694617976476e-06, + "loss": 0.5611, + "step": 10114 + }, + { + "epoch": 2.685251559803531, + "grad_norm": 0.45372931034691066, + "learning_rate": 2.92835065717083e-06, + "loss": 0.5259, + "step": 10115 + }, + { + "epoch": 2.685517058276915, + "grad_norm": 0.4276089160597099, + "learning_rate": 2.928006688011297e-06, + "loss": 0.5267, + "step": 10116 + }, + { + "epoch": 2.685782556750299, + "grad_norm": 0.45178187627395144, + "learning_rate": 2.927662710504587e-06, + "loss": 0.5887, + "step": 10117 + }, + { + "epoch": 2.6860480552236825, + "grad_norm": 0.4605202743122104, + "learning_rate": 2.9273187246574065e-06, + "loss": 0.5588, + "step": 10118 + }, + { + "epoch": 2.686313553697066, + "grad_norm": 0.45633522413441485, + "learning_rate": 2.9269747304764638e-06, + "loss": 0.6033, + "step": 10119 + }, + { + "epoch": 2.6865790521704502, + "grad_norm": 0.44421248833581395, + "learning_rate": 2.9266307279684698e-06, + "loss": 0.5737, + "step": 10120 + }, + { + "epoch": 2.686844550643834, + "grad_norm": 0.43618977261361214, + "learning_rate": 2.926286717140131e-06, + "loss": 0.5643, + "step": 10121 + }, + { + "epoch": 2.6871100491172175, + "grad_norm": 0.45493469690811333, + "learning_rate": 2.9259426979981576e-06, + "loss": 0.5985, + "step": 10122 + }, + { + "epoch": 2.6873755475906016, + "grad_norm": 0.4430360324782482, + "learning_rate": 2.9255986705492586e-06, + "loss": 0.5825, + "step": 10123 + }, + { + "epoch": 2.6876410460639852, + "grad_norm": 0.46115456136334293, + "learning_rate": 2.925254634800144e-06, + "loss": 0.533, + "step": 10124 + }, + { + "epoch": 2.687906544537369, + "grad_norm": 0.4569653467832881, + "learning_rate": 2.9249105907575226e-06, + "loss": 0.5621, + "step": 10125 + }, + { + "epoch": 2.688172043010753, + "grad_norm": 0.44302748387267804, + "learning_rate": 2.9245665384281043e-06, + "loss": 0.5374, + "step": 10126 + }, + { + "epoch": 2.6884375414841366, + "grad_norm": 0.435841955460323, + "learning_rate": 2.9242224778185985e-06, + "loss": 0.5525, + "step": 10127 + }, + { + "epoch": 2.6887030399575202, + "grad_norm": 0.4392626183352283, + "learning_rate": 2.9238784089357166e-06, + "loss": 0.5554, + "step": 10128 + }, + { + "epoch": 2.688968538430904, + "grad_norm": 0.4424011386025097, + "learning_rate": 2.9235343317861675e-06, + "loss": 0.5617, + "step": 10129 + }, + { + "epoch": 2.6892340369042875, + "grad_norm": 0.4411197501777723, + "learning_rate": 2.9231902463766614e-06, + "loss": 0.5555, + "step": 10130 + }, + { + "epoch": 2.6894995353776716, + "grad_norm": 0.444098152256741, + "learning_rate": 2.9228461527139098e-06, + "loss": 0.537, + "step": 10131 + }, + { + "epoch": 2.6897650338510553, + "grad_norm": 0.4445601872178917, + "learning_rate": 2.9225020508046233e-06, + "loss": 0.5836, + "step": 10132 + }, + { + "epoch": 2.690030532324439, + "grad_norm": 0.4405321199538382, + "learning_rate": 2.9221579406555112e-06, + "loss": 0.5753, + "step": 10133 + }, + { + "epoch": 2.690296030797823, + "grad_norm": 0.43234118829709495, + "learning_rate": 2.9218138222732872e-06, + "loss": 0.5522, + "step": 10134 + }, + { + "epoch": 2.6905615292712066, + "grad_norm": 0.4608281057479714, + "learning_rate": 2.9214696956646597e-06, + "loss": 0.5735, + "step": 10135 + }, + { + "epoch": 2.6908270277445903, + "grad_norm": 0.4321020388490961, + "learning_rate": 2.9211255608363415e-06, + "loss": 0.5323, + "step": 10136 + }, + { + "epoch": 2.6910925262179743, + "grad_norm": 0.44074106921283657, + "learning_rate": 2.9207814177950445e-06, + "loss": 0.5682, + "step": 10137 + }, + { + "epoch": 2.691358024691358, + "grad_norm": 0.44815829488913456, + "learning_rate": 2.920437266547479e-06, + "loss": 0.5923, + "step": 10138 + }, + { + "epoch": 2.6916235231647416, + "grad_norm": 0.45861230577868295, + "learning_rate": 2.9200931071003568e-06, + "loss": 0.5733, + "step": 10139 + }, + { + "epoch": 2.6918890216381257, + "grad_norm": 0.4534081695307583, + "learning_rate": 2.919748939460391e-06, + "loss": 0.6176, + "step": 10140 + }, + { + "epoch": 2.6921545201115094, + "grad_norm": 0.43691484881192744, + "learning_rate": 2.919404763634293e-06, + "loss": 0.5898, + "step": 10141 + }, + { + "epoch": 2.692420018584893, + "grad_norm": 0.42518703142546543, + "learning_rate": 2.919060579628776e-06, + "loss": 0.5041, + "step": 10142 + }, + { + "epoch": 2.692685517058277, + "grad_norm": 0.42544922469166396, + "learning_rate": 2.9187163874505507e-06, + "loss": 0.5289, + "step": 10143 + }, + { + "epoch": 2.6929510155316607, + "grad_norm": 0.44754361744708193, + "learning_rate": 2.9183721871063313e-06, + "loss": 0.5961, + "step": 10144 + }, + { + "epoch": 2.6932165140050444, + "grad_norm": 0.45116552148572175, + "learning_rate": 2.9180279786028294e-06, + "loss": 0.562, + "step": 10145 + }, + { + "epoch": 2.6934820124784284, + "grad_norm": 0.43796753904122016, + "learning_rate": 2.917683761946759e-06, + "loss": 0.5667, + "step": 10146 + }, + { + "epoch": 2.693747510951812, + "grad_norm": 0.441296744345199, + "learning_rate": 2.9173395371448322e-06, + "loss": 0.5666, + "step": 10147 + }, + { + "epoch": 2.6940130094251957, + "grad_norm": 0.43889977861376195, + "learning_rate": 2.9169953042037623e-06, + "loss": 0.5532, + "step": 10148 + }, + { + "epoch": 2.69427850789858, + "grad_norm": 0.43204457008165237, + "learning_rate": 2.9166510631302635e-06, + "loss": 0.5665, + "step": 10149 + }, + { + "epoch": 2.6945440063719635, + "grad_norm": 0.4287091008325839, + "learning_rate": 2.9163068139310484e-06, + "loss": 0.5744, + "step": 10150 + }, + { + "epoch": 2.694809504845347, + "grad_norm": 0.4439695107237071, + "learning_rate": 2.915962556612832e-06, + "loss": 0.5741, + "step": 10151 + }, + { + "epoch": 2.695075003318731, + "grad_norm": 0.4366243909985819, + "learning_rate": 2.915618291182326e-06, + "loss": 0.5695, + "step": 10152 + }, + { + "epoch": 2.695340501792115, + "grad_norm": 0.44477894506137927, + "learning_rate": 2.9152740176462463e-06, + "loss": 0.5965, + "step": 10153 + }, + { + "epoch": 2.6956060002654985, + "grad_norm": 0.4487188796862963, + "learning_rate": 2.914929736011307e-06, + "loss": 0.5541, + "step": 10154 + }, + { + "epoch": 2.695871498738882, + "grad_norm": 0.4437546711296801, + "learning_rate": 2.9145854462842214e-06, + "loss": 0.5646, + "step": 10155 + }, + { + "epoch": 2.696136997212266, + "grad_norm": 0.43036566495529827, + "learning_rate": 2.9142411484717048e-06, + "loss": 0.5758, + "step": 10156 + }, + { + "epoch": 2.69640249568565, + "grad_norm": 0.4365573923260459, + "learning_rate": 2.9138968425804716e-06, + "loss": 0.5576, + "step": 10157 + }, + { + "epoch": 2.6966679941590335, + "grad_norm": 0.44101860474461374, + "learning_rate": 2.9135525286172356e-06, + "loss": 0.5852, + "step": 10158 + }, + { + "epoch": 2.696933492632417, + "grad_norm": 0.4453933256532241, + "learning_rate": 2.9132082065887135e-06, + "loss": 0.566, + "step": 10159 + }, + { + "epoch": 2.697198991105801, + "grad_norm": 0.4434611364850128, + "learning_rate": 2.91286387650162e-06, + "loss": 0.604, + "step": 10160 + }, + { + "epoch": 2.697464489579185, + "grad_norm": 0.4392404423358599, + "learning_rate": 2.912519538362669e-06, + "loss": 0.5447, + "step": 10161 + }, + { + "epoch": 2.6977299880525685, + "grad_norm": 0.43303605357684627, + "learning_rate": 2.912175192178578e-06, + "loss": 0.5556, + "step": 10162 + }, + { + "epoch": 2.6979954865259526, + "grad_norm": 0.45837599748872915, + "learning_rate": 2.9118308379560615e-06, + "loss": 0.6133, + "step": 10163 + }, + { + "epoch": 2.698260984999336, + "grad_norm": 0.4526662166515674, + "learning_rate": 2.911486475701835e-06, + "loss": 0.5314, + "step": 10164 + }, + { + "epoch": 2.69852648347272, + "grad_norm": 0.43883768370901466, + "learning_rate": 2.9111421054226145e-06, + "loss": 0.5388, + "step": 10165 + }, + { + "epoch": 2.698791981946104, + "grad_norm": 0.44445997235757106, + "learning_rate": 2.9107977271251174e-06, + "loss": 0.548, + "step": 10166 + }, + { + "epoch": 2.6990574804194876, + "grad_norm": 0.4550284157712892, + "learning_rate": 2.910453340816058e-06, + "loss": 0.5913, + "step": 10167 + }, + { + "epoch": 2.699322978892871, + "grad_norm": 0.43658869262833233, + "learning_rate": 2.9101089465021533e-06, + "loss": 0.6039, + "step": 10168 + }, + { + "epoch": 2.6995884773662553, + "grad_norm": 0.44586788070347355, + "learning_rate": 2.90976454419012e-06, + "loss": 0.5529, + "step": 10169 + }, + { + "epoch": 2.699853975839639, + "grad_norm": 0.4266545587764369, + "learning_rate": 2.909420133886675e-06, + "loss": 0.5516, + "step": 10170 + }, + { + "epoch": 2.7001194743130226, + "grad_norm": 0.4431055348578981, + "learning_rate": 2.909075715598535e-06, + "loss": 0.5655, + "step": 10171 + }, + { + "epoch": 2.7003849727864067, + "grad_norm": 0.4439226564662244, + "learning_rate": 2.908731289332417e-06, + "loss": 0.5804, + "step": 10172 + }, + { + "epoch": 2.7006504712597903, + "grad_norm": 0.43741596479613637, + "learning_rate": 2.9083868550950378e-06, + "loss": 0.5411, + "step": 10173 + }, + { + "epoch": 2.700915969733174, + "grad_norm": 0.4567293071196986, + "learning_rate": 2.9080424128931163e-06, + "loss": 0.579, + "step": 10174 + }, + { + "epoch": 2.701181468206558, + "grad_norm": 0.43807146698167687, + "learning_rate": 2.907697962733367e-06, + "loss": 0.5478, + "step": 10175 + }, + { + "epoch": 2.7014469666799417, + "grad_norm": 0.4442205453998902, + "learning_rate": 2.9073535046225097e-06, + "loss": 0.5779, + "step": 10176 + }, + { + "epoch": 2.7017124651533253, + "grad_norm": 0.43460845056961783, + "learning_rate": 2.9070090385672625e-06, + "loss": 0.551, + "step": 10177 + }, + { + "epoch": 2.7019779636267094, + "grad_norm": 0.43089441786303767, + "learning_rate": 2.906664564574341e-06, + "loss": 0.5607, + "step": 10178 + }, + { + "epoch": 2.702243462100093, + "grad_norm": 0.43806138550830276, + "learning_rate": 2.9063200826504656e-06, + "loss": 0.5474, + "step": 10179 + }, + { + "epoch": 2.7025089605734767, + "grad_norm": 0.4307302636907822, + "learning_rate": 2.9059755928023547e-06, + "loss": 0.5746, + "step": 10180 + }, + { + "epoch": 2.7027744590468608, + "grad_norm": 0.4362951242567226, + "learning_rate": 2.9056310950367244e-06, + "loss": 0.532, + "step": 10181 + }, + { + "epoch": 2.7030399575202444, + "grad_norm": 0.4335504550219663, + "learning_rate": 2.905286589360295e-06, + "loss": 0.6246, + "step": 10182 + }, + { + "epoch": 2.703305455993628, + "grad_norm": 0.44551132411467825, + "learning_rate": 2.904942075779785e-06, + "loss": 0.5836, + "step": 10183 + }, + { + "epoch": 2.7035709544670117, + "grad_norm": 0.43076782821831744, + "learning_rate": 2.9045975543019134e-06, + "loss": 0.5446, + "step": 10184 + }, + { + "epoch": 2.7038364529403953, + "grad_norm": 0.4486741657587676, + "learning_rate": 2.904253024933398e-06, + "loss": 0.5476, + "step": 10185 + }, + { + "epoch": 2.7041019514137794, + "grad_norm": 0.4342935319116983, + "learning_rate": 2.9039084876809593e-06, + "loss": 0.5816, + "step": 10186 + }, + { + "epoch": 2.704367449887163, + "grad_norm": 0.4394810326791022, + "learning_rate": 2.903563942551316e-06, + "loss": 0.5811, + "step": 10187 + }, + { + "epoch": 2.7046329483605467, + "grad_norm": 0.43517064720426085, + "learning_rate": 2.903219389551187e-06, + "loss": 0.5471, + "step": 10188 + }, + { + "epoch": 2.7048984468339308, + "grad_norm": 0.4428454491543437, + "learning_rate": 2.9028748286872943e-06, + "loss": 0.5293, + "step": 10189 + }, + { + "epoch": 2.7051639453073144, + "grad_norm": 0.453359727330235, + "learning_rate": 2.9025302599663552e-06, + "loss": 0.5859, + "step": 10190 + }, + { + "epoch": 2.705429443780698, + "grad_norm": 0.4247083903059448, + "learning_rate": 2.9021856833950902e-06, + "loss": 0.5217, + "step": 10191 + }, + { + "epoch": 2.705694942254082, + "grad_norm": 0.43505388503812364, + "learning_rate": 2.9018410989802205e-06, + "loss": 0.5584, + "step": 10192 + }, + { + "epoch": 2.7059604407274658, + "grad_norm": 0.44285716758921784, + "learning_rate": 2.9014965067284643e-06, + "loss": 0.5638, + "step": 10193 + }, + { + "epoch": 2.7062259392008494, + "grad_norm": 0.45040368042488255, + "learning_rate": 2.9011519066465437e-06, + "loss": 0.554, + "step": 10194 + }, + { + "epoch": 2.7064914376742335, + "grad_norm": 0.44506375560710565, + "learning_rate": 2.900807298741179e-06, + "loss": 0.5979, + "step": 10195 + }, + { + "epoch": 2.706756936147617, + "grad_norm": 0.4400339333408414, + "learning_rate": 2.9004626830190903e-06, + "loss": 0.5794, + "step": 10196 + }, + { + "epoch": 2.707022434621001, + "grad_norm": 0.4459688244165978, + "learning_rate": 2.9001180594869994e-06, + "loss": 0.5701, + "step": 10197 + }, + { + "epoch": 2.707287933094385, + "grad_norm": 0.4436688354506341, + "learning_rate": 2.899773428151626e-06, + "loss": 0.5351, + "step": 10198 + }, + { + "epoch": 2.7075534315677685, + "grad_norm": 0.448335991666371, + "learning_rate": 2.8994287890196917e-06, + "loss": 0.612, + "step": 10199 + }, + { + "epoch": 2.707818930041152, + "grad_norm": 0.4385498421908971, + "learning_rate": 2.899084142097919e-06, + "loss": 0.5289, + "step": 10200 + }, + { + "epoch": 2.7080844285145362, + "grad_norm": 0.43042540619984915, + "learning_rate": 2.8987394873930276e-06, + "loss": 0.5506, + "step": 10201 + }, + { + "epoch": 2.70834992698792, + "grad_norm": 0.46417012457458945, + "learning_rate": 2.89839482491174e-06, + "loss": 0.5336, + "step": 10202 + }, + { + "epoch": 2.7086154254613035, + "grad_norm": 0.4321025152650331, + "learning_rate": 2.8980501546607785e-06, + "loss": 0.5557, + "step": 10203 + }, + { + "epoch": 2.7088809239346876, + "grad_norm": 0.4537749218447374, + "learning_rate": 2.897705476646863e-06, + "loss": 0.5487, + "step": 10204 + }, + { + "epoch": 2.7091464224080712, + "grad_norm": 0.4360817293315029, + "learning_rate": 2.8973607908767183e-06, + "loss": 0.5596, + "step": 10205 + }, + { + "epoch": 2.709411920881455, + "grad_norm": 0.4546763306757774, + "learning_rate": 2.897016097357065e-06, + "loss": 0.5776, + "step": 10206 + }, + { + "epoch": 2.709677419354839, + "grad_norm": 0.45476443627915464, + "learning_rate": 2.8966713960946256e-06, + "loss": 0.5843, + "step": 10207 + }, + { + "epoch": 2.7099429178282226, + "grad_norm": 0.4315199299370785, + "learning_rate": 2.8963266870961226e-06, + "loss": 0.5495, + "step": 10208 + }, + { + "epoch": 2.7102084163016062, + "grad_norm": 0.43459150804360874, + "learning_rate": 2.8959819703682797e-06, + "loss": 0.5439, + "step": 10209 + }, + { + "epoch": 2.71047391477499, + "grad_norm": 0.4412231899526384, + "learning_rate": 2.895637245917818e-06, + "loss": 0.5606, + "step": 10210 + }, + { + "epoch": 2.710739413248374, + "grad_norm": 0.4406979847113424, + "learning_rate": 2.895292513751461e-06, + "loss": 0.5633, + "step": 10211 + }, + { + "epoch": 2.7110049117217576, + "grad_norm": 0.43296962620667295, + "learning_rate": 2.8949477738759337e-06, + "loss": 0.5585, + "step": 10212 + }, + { + "epoch": 2.7112704101951413, + "grad_norm": 0.4501251361074169, + "learning_rate": 2.8946030262979565e-06, + "loss": 0.5629, + "step": 10213 + }, + { + "epoch": 2.711535908668525, + "grad_norm": 0.44126313119357236, + "learning_rate": 2.8942582710242534e-06, + "loss": 0.5576, + "step": 10214 + }, + { + "epoch": 2.711801407141909, + "grad_norm": 0.42749229612120254, + "learning_rate": 2.89391350806155e-06, + "loss": 0.5633, + "step": 10215 + }, + { + "epoch": 2.7120669056152926, + "grad_norm": 0.45957870531395834, + "learning_rate": 2.8935687374165684e-06, + "loss": 0.5579, + "step": 10216 + }, + { + "epoch": 2.7123324040886763, + "grad_norm": 0.4360893174722151, + "learning_rate": 2.8932239590960327e-06, + "loss": 0.5675, + "step": 10217 + }, + { + "epoch": 2.7125979025620603, + "grad_norm": 0.45370050701947046, + "learning_rate": 2.8928791731066673e-06, + "loss": 0.5757, + "step": 10218 + }, + { + "epoch": 2.712863401035444, + "grad_norm": 0.4403385448272651, + "learning_rate": 2.8925343794551956e-06, + "loss": 0.5485, + "step": 10219 + }, + { + "epoch": 2.7131288995088276, + "grad_norm": 0.4426351958102985, + "learning_rate": 2.8921895781483427e-06, + "loss": 0.5487, + "step": 10220 + }, + { + "epoch": 2.7133943979822117, + "grad_norm": 0.4391844449722344, + "learning_rate": 2.891844769192833e-06, + "loss": 0.5264, + "step": 10221 + }, + { + "epoch": 2.7136598964555954, + "grad_norm": 0.4304856481399934, + "learning_rate": 2.891499952595391e-06, + "loss": 0.5796, + "step": 10222 + }, + { + "epoch": 2.713925394928979, + "grad_norm": 0.4234577013632456, + "learning_rate": 2.891155128362741e-06, + "loss": 0.566, + "step": 10223 + }, + { + "epoch": 2.714190893402363, + "grad_norm": 0.4636773893498481, + "learning_rate": 2.890810296501608e-06, + "loss": 0.5482, + "step": 10224 + }, + { + "epoch": 2.7144563918757467, + "grad_norm": 0.4489502189194947, + "learning_rate": 2.8904654570187174e-06, + "loss": 0.5718, + "step": 10225 + }, + { + "epoch": 2.7147218903491304, + "grad_norm": 0.4297856085653684, + "learning_rate": 2.8901206099207953e-06, + "loss": 0.5713, + "step": 10226 + }, + { + "epoch": 2.7149873888225144, + "grad_norm": 0.4490390343748039, + "learning_rate": 2.889775755214565e-06, + "loss": 0.5253, + "step": 10227 + }, + { + "epoch": 2.715252887295898, + "grad_norm": 0.4432906699961794, + "learning_rate": 2.889430892906754e-06, + "loss": 0.5521, + "step": 10228 + }, + { + "epoch": 2.7155183857692817, + "grad_norm": 0.4269082452158409, + "learning_rate": 2.8890860230040867e-06, + "loss": 0.533, + "step": 10229 + }, + { + "epoch": 2.715783884242666, + "grad_norm": 0.45730782926041164, + "learning_rate": 2.8887411455132893e-06, + "loss": 0.5937, + "step": 10230 + }, + { + "epoch": 2.7160493827160495, + "grad_norm": 0.4333852814691258, + "learning_rate": 2.8883962604410875e-06, + "loss": 0.5924, + "step": 10231 + }, + { + "epoch": 2.716314881189433, + "grad_norm": 0.44351573462855143, + "learning_rate": 2.888051367794208e-06, + "loss": 0.5515, + "step": 10232 + }, + { + "epoch": 2.716580379662817, + "grad_norm": 0.4252288714134628, + "learning_rate": 2.8877064675793766e-06, + "loss": 0.5139, + "step": 10233 + }, + { + "epoch": 2.716845878136201, + "grad_norm": 0.44673288305101033, + "learning_rate": 2.8873615598033194e-06, + "loss": 0.5586, + "step": 10234 + }, + { + "epoch": 2.7171113766095845, + "grad_norm": 0.4285174552763303, + "learning_rate": 2.887016644472764e-06, + "loss": 0.5898, + "step": 10235 + }, + { + "epoch": 2.7173768750829685, + "grad_norm": 0.45347068066869795, + "learning_rate": 2.8866717215944363e-06, + "loss": 0.5461, + "step": 10236 + }, + { + "epoch": 2.717642373556352, + "grad_norm": 0.4440321627109502, + "learning_rate": 2.886326791175063e-06, + "loss": 0.5788, + "step": 10237 + }, + { + "epoch": 2.717907872029736, + "grad_norm": 0.4193244441322592, + "learning_rate": 2.885981853221372e-06, + "loss": 0.5083, + "step": 10238 + }, + { + "epoch": 2.7181733705031195, + "grad_norm": 0.42589757698017205, + "learning_rate": 2.8856369077400896e-06, + "loss": 0.5295, + "step": 10239 + }, + { + "epoch": 2.718438868976503, + "grad_norm": 0.43029285291308367, + "learning_rate": 2.885291954737943e-06, + "loss": 0.5315, + "step": 10240 + }, + { + "epoch": 2.718704367449887, + "grad_norm": 0.4512118297488647, + "learning_rate": 2.88494699422166e-06, + "loss": 0.5388, + "step": 10241 + }, + { + "epoch": 2.718969865923271, + "grad_norm": 0.43228604883114413, + "learning_rate": 2.884602026197968e-06, + "loss": 0.5583, + "step": 10242 + }, + { + "epoch": 2.7192353643966545, + "grad_norm": 0.4575615646582, + "learning_rate": 2.8842570506735952e-06, + "loss": 0.5681, + "step": 10243 + }, + { + "epoch": 2.7195008628700386, + "grad_norm": 0.4592349652920735, + "learning_rate": 2.883912067655269e-06, + "loss": 0.5741, + "step": 10244 + }, + { + "epoch": 2.719766361343422, + "grad_norm": 0.4269147614749827, + "learning_rate": 2.8835670771497174e-06, + "loss": 0.5757, + "step": 10245 + }, + { + "epoch": 2.720031859816806, + "grad_norm": 0.4574142720896556, + "learning_rate": 2.8832220791636687e-06, + "loss": 0.524, + "step": 10246 + }, + { + "epoch": 2.72029735829019, + "grad_norm": 0.44551944203516236, + "learning_rate": 2.8828770737038515e-06, + "loss": 0.5829, + "step": 10247 + }, + { + "epoch": 2.7205628567635736, + "grad_norm": 0.4442857150696108, + "learning_rate": 2.8825320607769937e-06, + "loss": 0.6094, + "step": 10248 + }, + { + "epoch": 2.720828355236957, + "grad_norm": 0.4556480864316161, + "learning_rate": 2.8821870403898233e-06, + "loss": 0.5496, + "step": 10249 + }, + { + "epoch": 2.7210938537103413, + "grad_norm": 0.44113528389842227, + "learning_rate": 2.8818420125490705e-06, + "loss": 0.5764, + "step": 10250 + }, + { + "epoch": 2.721359352183725, + "grad_norm": 0.4622985401187027, + "learning_rate": 2.881496977261463e-06, + "loss": 0.5835, + "step": 10251 + }, + { + "epoch": 2.7216248506571086, + "grad_norm": 0.4486770539443488, + "learning_rate": 2.881151934533732e-06, + "loss": 0.563, + "step": 10252 + }, + { + "epoch": 2.7218903491304927, + "grad_norm": 0.438055948866357, + "learning_rate": 2.8808068843726034e-06, + "loss": 0.5887, + "step": 10253 + }, + { + "epoch": 2.7221558476038763, + "grad_norm": 0.4541672053501981, + "learning_rate": 2.880461826784809e-06, + "loss": 0.5817, + "step": 10254 + }, + { + "epoch": 2.72242134607726, + "grad_norm": 0.4451859760813674, + "learning_rate": 2.8801167617770776e-06, + "loss": 0.5459, + "step": 10255 + }, + { + "epoch": 2.722686844550644, + "grad_norm": 0.44037258932811785, + "learning_rate": 2.879771689356138e-06, + "loss": 0.5893, + "step": 10256 + }, + { + "epoch": 2.7229523430240277, + "grad_norm": 0.4353396261090973, + "learning_rate": 2.879426609528721e-06, + "loss": 0.5315, + "step": 10257 + }, + { + "epoch": 2.7232178414974113, + "grad_norm": 0.4454624505272185, + "learning_rate": 2.879081522301556e-06, + "loss": 0.5844, + "step": 10258 + }, + { + "epoch": 2.7234833399707954, + "grad_norm": 0.4517189569398687, + "learning_rate": 2.8787364276813734e-06, + "loss": 0.6006, + "step": 10259 + }, + { + "epoch": 2.723748838444179, + "grad_norm": 0.43210777897024255, + "learning_rate": 2.878391325674902e-06, + "loss": 0.5587, + "step": 10260 + }, + { + "epoch": 2.7240143369175627, + "grad_norm": 0.45917523874992766, + "learning_rate": 2.8780462162888752e-06, + "loss": 0.5582, + "step": 10261 + }, + { + "epoch": 2.7242798353909468, + "grad_norm": 0.45243517432940794, + "learning_rate": 2.87770109953002e-06, + "loss": 0.598, + "step": 10262 + }, + { + "epoch": 2.7245453338643304, + "grad_norm": 0.4439983830860151, + "learning_rate": 2.8773559754050688e-06, + "loss": 0.5735, + "step": 10263 + }, + { + "epoch": 2.724810832337714, + "grad_norm": 0.433536528137935, + "learning_rate": 2.8770108439207534e-06, + "loss": 0.5274, + "step": 10264 + }, + { + "epoch": 2.7250763308110977, + "grad_norm": 0.4420608250394168, + "learning_rate": 2.876665705083802e-06, + "loss": 0.5321, + "step": 10265 + }, + { + "epoch": 2.7253418292844818, + "grad_norm": 0.447493197408292, + "learning_rate": 2.876320558900948e-06, + "loss": 0.5407, + "step": 10266 + }, + { + "epoch": 2.7256073277578654, + "grad_norm": 0.44197996759121094, + "learning_rate": 2.875975405378922e-06, + "loss": 0.5383, + "step": 10267 + }, + { + "epoch": 2.725872826231249, + "grad_norm": 0.43150406968870186, + "learning_rate": 2.8756302445244536e-06, + "loss": 0.5662, + "step": 10268 + }, + { + "epoch": 2.7261383247046327, + "grad_norm": 0.4435022757307573, + "learning_rate": 2.875285076344277e-06, + "loss": 0.5875, + "step": 10269 + }, + { + "epoch": 2.7264038231780168, + "grad_norm": 0.4630022875977805, + "learning_rate": 2.8749399008451217e-06, + "loss": 0.5283, + "step": 10270 + }, + { + "epoch": 2.7266693216514004, + "grad_norm": 0.4487880725450732, + "learning_rate": 2.874594718033721e-06, + "loss": 0.5665, + "step": 10271 + }, + { + "epoch": 2.726934820124784, + "grad_norm": 0.44693622658289794, + "learning_rate": 2.8742495279168057e-06, + "loss": 0.5402, + "step": 10272 + }, + { + "epoch": 2.727200318598168, + "grad_norm": 0.43927945919667544, + "learning_rate": 2.873904330501109e-06, + "loss": 0.5889, + "step": 10273 + }, + { + "epoch": 2.727465817071552, + "grad_norm": 0.43940486252357186, + "learning_rate": 2.8735591257933613e-06, + "loss": 0.5655, + "step": 10274 + }, + { + "epoch": 2.7277313155449354, + "grad_norm": 0.44068802549051656, + "learning_rate": 2.8732139138002973e-06, + "loss": 0.5537, + "step": 10275 + }, + { + "epoch": 2.7279968140183195, + "grad_norm": 0.43916667465849263, + "learning_rate": 2.872868694528647e-06, + "loss": 0.5255, + "step": 10276 + }, + { + "epoch": 2.728262312491703, + "grad_norm": 0.43608657115105287, + "learning_rate": 2.8725234679851448e-06, + "loss": 0.5922, + "step": 10277 + }, + { + "epoch": 2.728527810965087, + "grad_norm": 0.46293574563631634, + "learning_rate": 2.8721782341765223e-06, + "loss": 0.5445, + "step": 10278 + }, + { + "epoch": 2.728793309438471, + "grad_norm": 0.43345665379478404, + "learning_rate": 2.8718329931095136e-06, + "loss": 0.5415, + "step": 10279 + }, + { + "epoch": 2.7290588079118545, + "grad_norm": 0.43498407272427503, + "learning_rate": 2.8714877447908505e-06, + "loss": 0.5556, + "step": 10280 + }, + { + "epoch": 2.729324306385238, + "grad_norm": 0.4350387166346781, + "learning_rate": 2.871142489227268e-06, + "loss": 0.6031, + "step": 10281 + }, + { + "epoch": 2.7295898048586222, + "grad_norm": 0.44126109668365776, + "learning_rate": 2.8707972264254973e-06, + "loss": 0.5574, + "step": 10282 + }, + { + "epoch": 2.729855303332006, + "grad_norm": 0.45918024015676406, + "learning_rate": 2.8704519563922733e-06, + "loss": 0.5688, + "step": 10283 + }, + { + "epoch": 2.7301208018053895, + "grad_norm": 0.42868988141374076, + "learning_rate": 2.870106679134329e-06, + "loss": 0.5835, + "step": 10284 + }, + { + "epoch": 2.7303863002787736, + "grad_norm": 0.4535947590739218, + "learning_rate": 2.869761394658398e-06, + "loss": 0.5463, + "step": 10285 + }, + { + "epoch": 2.7306517987521572, + "grad_norm": 0.43678625128534565, + "learning_rate": 2.869416102971215e-06, + "loss": 0.5281, + "step": 10286 + }, + { + "epoch": 2.730917297225541, + "grad_norm": 0.4457301754832794, + "learning_rate": 2.869070804079513e-06, + "loss": 0.5591, + "step": 10287 + }, + { + "epoch": 2.731182795698925, + "grad_norm": 0.43669980721056834, + "learning_rate": 2.8687254979900263e-06, + "loss": 0.5901, + "step": 10288 + }, + { + "epoch": 2.7314482941723086, + "grad_norm": 0.4447984074771697, + "learning_rate": 2.86838018470949e-06, + "loss": 0.5813, + "step": 10289 + }, + { + "epoch": 2.7317137926456923, + "grad_norm": 0.44536873773597246, + "learning_rate": 2.8680348642446385e-06, + "loss": 0.5798, + "step": 10290 + }, + { + "epoch": 2.7319792911190763, + "grad_norm": 0.44187847174583095, + "learning_rate": 2.8676895366022064e-06, + "loss": 0.5468, + "step": 10291 + }, + { + "epoch": 2.73224478959246, + "grad_norm": 0.45147062982548897, + "learning_rate": 2.867344201788927e-06, + "loss": 0.5766, + "step": 10292 + }, + { + "epoch": 2.7325102880658436, + "grad_norm": 0.43411976725545276, + "learning_rate": 2.866998859811537e-06, + "loss": 0.5434, + "step": 10293 + }, + { + "epoch": 2.7327757865392273, + "grad_norm": 0.4243805492589512, + "learning_rate": 2.8666535106767706e-06, + "loss": 0.5543, + "step": 10294 + }, + { + "epoch": 2.7330412850126113, + "grad_norm": 0.4485995572979169, + "learning_rate": 2.8663081543913624e-06, + "loss": 0.56, + "step": 10295 + }, + { + "epoch": 2.733306783485995, + "grad_norm": 0.4609874392191662, + "learning_rate": 2.8659627909620487e-06, + "loss": 0.5844, + "step": 10296 + }, + { + "epoch": 2.7335722819593786, + "grad_norm": 0.4489747516785865, + "learning_rate": 2.8656174203955646e-06, + "loss": 0.5594, + "step": 10297 + }, + { + "epoch": 2.7338377804327623, + "grad_norm": 0.4285498008743859, + "learning_rate": 2.865272042698646e-06, + "loss": 0.5648, + "step": 10298 + }, + { + "epoch": 2.7341032789061464, + "grad_norm": 0.437820489377927, + "learning_rate": 2.864926657878028e-06, + "loss": 0.557, + "step": 10299 + }, + { + "epoch": 2.73436877737953, + "grad_norm": 0.4426475389305618, + "learning_rate": 2.864581265940446e-06, + "loss": 0.5898, + "step": 10300 + }, + { + "epoch": 2.7346342758529136, + "grad_norm": 0.43671278903532534, + "learning_rate": 2.8642358668926375e-06, + "loss": 0.587, + "step": 10301 + }, + { + "epoch": 2.7348997743262977, + "grad_norm": 0.4406011189568316, + "learning_rate": 2.8638904607413376e-06, + "loss": 0.5416, + "step": 10302 + }, + { + "epoch": 2.7351652727996814, + "grad_norm": 0.4400776011375223, + "learning_rate": 2.8635450474932826e-06, + "loss": 0.5575, + "step": 10303 + }, + { + "epoch": 2.735430771273065, + "grad_norm": 0.4380610064877206, + "learning_rate": 2.8631996271552095e-06, + "loss": 0.5725, + "step": 10304 + }, + { + "epoch": 2.735696269746449, + "grad_norm": 0.4367289116352328, + "learning_rate": 2.8628541997338534e-06, + "loss": 0.554, + "step": 10305 + }, + { + "epoch": 2.7359617682198327, + "grad_norm": 0.422361387702185, + "learning_rate": 2.8625087652359517e-06, + "loss": 0.5441, + "step": 10306 + }, + { + "epoch": 2.7362272666932164, + "grad_norm": 0.4308649023992815, + "learning_rate": 2.8621633236682433e-06, + "loss": 0.5378, + "step": 10307 + }, + { + "epoch": 2.7364927651666004, + "grad_norm": 0.43502383684920753, + "learning_rate": 2.861817875037462e-06, + "loss": 0.5962, + "step": 10308 + }, + { + "epoch": 2.736758263639984, + "grad_norm": 0.4450716419718206, + "learning_rate": 2.861472419350347e-06, + "loss": 0.597, + "step": 10309 + }, + { + "epoch": 2.7370237621133677, + "grad_norm": 0.4473263353340631, + "learning_rate": 2.8611269566136347e-06, + "loss": 0.6067, + "step": 10310 + }, + { + "epoch": 2.737289260586752, + "grad_norm": 0.4400704880917872, + "learning_rate": 2.860781486834062e-06, + "loss": 0.567, + "step": 10311 + }, + { + "epoch": 2.7375547590601355, + "grad_norm": 0.4552158124413648, + "learning_rate": 2.860436010018367e-06, + "loss": 0.5683, + "step": 10312 + }, + { + "epoch": 2.737820257533519, + "grad_norm": 0.4401579972712152, + "learning_rate": 2.860090526173288e-06, + "loss": 0.5725, + "step": 10313 + }, + { + "epoch": 2.738085756006903, + "grad_norm": 0.44081188723914233, + "learning_rate": 2.8597450353055615e-06, + "loss": 0.5623, + "step": 10314 + }, + { + "epoch": 2.738351254480287, + "grad_norm": 0.4294027187144235, + "learning_rate": 2.859399537421926e-06, + "loss": 0.5429, + "step": 10315 + }, + { + "epoch": 2.7386167529536705, + "grad_norm": 0.4462035420980938, + "learning_rate": 2.85905403252912e-06, + "loss": 0.5515, + "step": 10316 + }, + { + "epoch": 2.7388822514270545, + "grad_norm": 0.4503717683168987, + "learning_rate": 2.858708520633881e-06, + "loss": 0.597, + "step": 10317 + }, + { + "epoch": 2.739147749900438, + "grad_norm": 0.4467780220647689, + "learning_rate": 2.8583630017429474e-06, + "loss": 0.5665, + "step": 10318 + }, + { + "epoch": 2.739413248373822, + "grad_norm": 0.4408404904903175, + "learning_rate": 2.8580174758630584e-06, + "loss": 0.5589, + "step": 10319 + }, + { + "epoch": 2.7396787468472055, + "grad_norm": 0.4570127388806598, + "learning_rate": 2.8576719430009514e-06, + "loss": 0.5719, + "step": 10320 + }, + { + "epoch": 2.7399442453205896, + "grad_norm": 0.44217595899356277, + "learning_rate": 2.8573264031633663e-06, + "loss": 0.5466, + "step": 10321 + }, + { + "epoch": 2.740209743793973, + "grad_norm": 0.4292987868333343, + "learning_rate": 2.8569808563570413e-06, + "loss": 0.5581, + "step": 10322 + }, + { + "epoch": 2.740475242267357, + "grad_norm": 0.43339613211084893, + "learning_rate": 2.856635302588715e-06, + "loss": 0.5548, + "step": 10323 + }, + { + "epoch": 2.7407407407407405, + "grad_norm": 0.426727202307013, + "learning_rate": 2.8562897418651277e-06, + "loss": 0.5539, + "step": 10324 + }, + { + "epoch": 2.7410062392141246, + "grad_norm": 0.43100006376263783, + "learning_rate": 2.8559441741930176e-06, + "loss": 0.55, + "step": 10325 + }, + { + "epoch": 2.741271737687508, + "grad_norm": 0.4437840074908016, + "learning_rate": 2.8555985995791252e-06, + "loss": 0.5617, + "step": 10326 + }, + { + "epoch": 2.741537236160892, + "grad_norm": 0.4361272163148602, + "learning_rate": 2.8552530180301896e-06, + "loss": 0.5363, + "step": 10327 + }, + { + "epoch": 2.741802734634276, + "grad_norm": 0.45490136042657886, + "learning_rate": 2.8549074295529496e-06, + "loss": 0.5485, + "step": 10328 + }, + { + "epoch": 2.7420682331076596, + "grad_norm": 0.4411175260710266, + "learning_rate": 2.854561834154146e-06, + "loss": 0.5547, + "step": 10329 + }, + { + "epoch": 2.742333731581043, + "grad_norm": 0.44883807460656977, + "learning_rate": 2.854216231840519e-06, + "loss": 0.5543, + "step": 10330 + }, + { + "epoch": 2.7425992300544273, + "grad_norm": 0.42735714815829845, + "learning_rate": 2.853870622618808e-06, + "loss": 0.5412, + "step": 10331 + }, + { + "epoch": 2.742864728527811, + "grad_norm": 0.4365619714477265, + "learning_rate": 2.8535250064957524e-06, + "loss": 0.5703, + "step": 10332 + }, + { + "epoch": 2.7431302270011946, + "grad_norm": 0.4364700836451843, + "learning_rate": 2.8531793834780945e-06, + "loss": 0.5507, + "step": 10333 + }, + { + "epoch": 2.7433957254745787, + "grad_norm": 0.4362064039030367, + "learning_rate": 2.8528337535725735e-06, + "loss": 0.5354, + "step": 10334 + }, + { + "epoch": 2.7436612239479623, + "grad_norm": 0.4494013599318121, + "learning_rate": 2.852488116785931e-06, + "loss": 0.5315, + "step": 10335 + }, + { + "epoch": 2.743926722421346, + "grad_norm": 0.4482954919432676, + "learning_rate": 2.8521424731249077e-06, + "loss": 0.5327, + "step": 10336 + }, + { + "epoch": 2.74419222089473, + "grad_norm": 0.43838764627046833, + "learning_rate": 2.8517968225962427e-06, + "loss": 0.5162, + "step": 10337 + }, + { + "epoch": 2.7444577193681137, + "grad_norm": 0.4364985111413365, + "learning_rate": 2.851451165206679e-06, + "loss": 0.55, + "step": 10338 + }, + { + "epoch": 2.7447232178414973, + "grad_norm": 0.4333031956021111, + "learning_rate": 2.851105500962957e-06, + "loss": 0.5401, + "step": 10339 + }, + { + "epoch": 2.7449887163148814, + "grad_norm": 0.4650515288523981, + "learning_rate": 2.850759829871818e-06, + "loss": 0.5702, + "step": 10340 + }, + { + "epoch": 2.745254214788265, + "grad_norm": 0.4542592947308028, + "learning_rate": 2.8504141519400037e-06, + "loss": 0.5608, + "step": 10341 + }, + { + "epoch": 2.7455197132616487, + "grad_norm": 0.44073447454774434, + "learning_rate": 2.8500684671742547e-06, + "loss": 0.5734, + "step": 10342 + }, + { + "epoch": 2.7457852117350328, + "grad_norm": 0.431434601367619, + "learning_rate": 2.8497227755813138e-06, + "loss": 0.5419, + "step": 10343 + }, + { + "epoch": 2.7460507102084164, + "grad_norm": 0.44235496145075687, + "learning_rate": 2.8493770771679234e-06, + "loss": 0.5573, + "step": 10344 + }, + { + "epoch": 2.7463162086818, + "grad_norm": 0.4421584125154759, + "learning_rate": 2.8490313719408238e-06, + "loss": 0.5545, + "step": 10345 + }, + { + "epoch": 2.746581707155184, + "grad_norm": 0.4429390801958356, + "learning_rate": 2.848685659906758e-06, + "loss": 0.5341, + "step": 10346 + }, + { + "epoch": 2.7468472056285678, + "grad_norm": 0.43919380812696407, + "learning_rate": 2.848339941072468e-06, + "loss": 0.5553, + "step": 10347 + }, + { + "epoch": 2.7471127041019514, + "grad_norm": 0.44228658446309593, + "learning_rate": 2.8479942154446965e-06, + "loss": 0.5798, + "step": 10348 + }, + { + "epoch": 2.747378202575335, + "grad_norm": 0.4243031030067016, + "learning_rate": 2.8476484830301856e-06, + "loss": 0.5617, + "step": 10349 + }, + { + "epoch": 2.747643701048719, + "grad_norm": 0.45169389106771335, + "learning_rate": 2.8473027438356786e-06, + "loss": 0.5773, + "step": 10350 + }, + { + "epoch": 2.7479091995221028, + "grad_norm": 0.41382595798301464, + "learning_rate": 2.8469569978679167e-06, + "loss": 0.5222, + "step": 10351 + }, + { + "epoch": 2.7481746979954864, + "grad_norm": 0.434215407756758, + "learning_rate": 2.846611245133644e-06, + "loss": 0.555, + "step": 10352 + }, + { + "epoch": 2.74844019646887, + "grad_norm": 0.41105680802092653, + "learning_rate": 2.8462654856396045e-06, + "loss": 0.5177, + "step": 10353 + }, + { + "epoch": 2.748705694942254, + "grad_norm": 0.4372703123715657, + "learning_rate": 2.845919719392539e-06, + "loss": 0.5877, + "step": 10354 + }, + { + "epoch": 2.748971193415638, + "grad_norm": 0.44502346600000375, + "learning_rate": 2.8455739463991927e-06, + "loss": 0.6182, + "step": 10355 + }, + { + "epoch": 2.7492366918890214, + "grad_norm": 0.4371636584793063, + "learning_rate": 2.8452281666663085e-06, + "loss": 0.5944, + "step": 10356 + }, + { + "epoch": 2.7495021903624055, + "grad_norm": 0.44417921068643, + "learning_rate": 2.844882380200629e-06, + "loss": 0.5446, + "step": 10357 + }, + { + "epoch": 2.749767688835789, + "grad_norm": 0.4495981338486847, + "learning_rate": 2.8445365870088992e-06, + "loss": 0.57, + "step": 10358 + }, + { + "epoch": 2.750033187309173, + "grad_norm": 0.43896790265342484, + "learning_rate": 2.844190787097863e-06, + "loss": 0.5422, + "step": 10359 + }, + { + "epoch": 2.750298685782557, + "grad_norm": 0.44425507257667096, + "learning_rate": 2.843844980474263e-06, + "loss": 0.5516, + "step": 10360 + }, + { + "epoch": 2.7505641842559405, + "grad_norm": 0.4572694847231948, + "learning_rate": 2.843499167144843e-06, + "loss": 0.5897, + "step": 10361 + }, + { + "epoch": 2.750829682729324, + "grad_norm": 0.4312272602493397, + "learning_rate": 2.8431533471163497e-06, + "loss": 0.5827, + "step": 10362 + }, + { + "epoch": 2.7510951812027082, + "grad_norm": 0.44783875949204516, + "learning_rate": 2.8428075203955253e-06, + "loss": 0.5484, + "step": 10363 + }, + { + "epoch": 2.751360679676092, + "grad_norm": 0.4528468735810305, + "learning_rate": 2.842461686989115e-06, + "loss": 0.5699, + "step": 10364 + }, + { + "epoch": 2.7516261781494755, + "grad_norm": 0.44944031830805287, + "learning_rate": 2.842115846903864e-06, + "loss": 0.548, + "step": 10365 + }, + { + "epoch": 2.7518916766228596, + "grad_norm": 0.44012100969131446, + "learning_rate": 2.8417700001465156e-06, + "loss": 0.5833, + "step": 10366 + }, + { + "epoch": 2.7521571750962432, + "grad_norm": 0.44636105350907634, + "learning_rate": 2.8414241467238156e-06, + "loss": 0.5655, + "step": 10367 + }, + { + "epoch": 2.752422673569627, + "grad_norm": 0.465260730003754, + "learning_rate": 2.8410782866425092e-06, + "loss": 0.5657, + "step": 10368 + }, + { + "epoch": 2.752688172043011, + "grad_norm": 0.4375795658230993, + "learning_rate": 2.84073241990934e-06, + "loss": 0.5595, + "step": 10369 + }, + { + "epoch": 2.7529536705163946, + "grad_norm": 0.4666940037495597, + "learning_rate": 2.840386546531055e-06, + "loss": 0.5586, + "step": 10370 + }, + { + "epoch": 2.7532191689897783, + "grad_norm": 0.4342168920567903, + "learning_rate": 2.8400406665143987e-06, + "loss": 0.5654, + "step": 10371 + }, + { + "epoch": 2.7534846674631623, + "grad_norm": 0.44029261239817913, + "learning_rate": 2.839694779866117e-06, + "loss": 0.6042, + "step": 10372 + }, + { + "epoch": 2.753750165936546, + "grad_norm": 0.4406684636472852, + "learning_rate": 2.839348886592956e-06, + "loss": 0.5534, + "step": 10373 + }, + { + "epoch": 2.7540156644099296, + "grad_norm": 0.45779469119606037, + "learning_rate": 2.83900298670166e-06, + "loss": 0.5925, + "step": 10374 + }, + { + "epoch": 2.7542811628833133, + "grad_norm": 0.48758790918103284, + "learning_rate": 2.838657080198976e-06, + "loss": 0.5194, + "step": 10375 + }, + { + "epoch": 2.7545466613566973, + "grad_norm": 0.43934849146512, + "learning_rate": 2.83831116709165e-06, + "loss": 0.5638, + "step": 10376 + }, + { + "epoch": 2.754812159830081, + "grad_norm": 0.4399822948444774, + "learning_rate": 2.837965247386428e-06, + "loss": 0.5654, + "step": 10377 + }, + { + "epoch": 2.7550776583034646, + "grad_norm": 0.44247494973297535, + "learning_rate": 2.8376193210900555e-06, + "loss": 0.563, + "step": 10378 + }, + { + "epoch": 2.7553431567768483, + "grad_norm": 0.43830208665937304, + "learning_rate": 2.83727338820928e-06, + "loss": 0.5277, + "step": 10379 + }, + { + "epoch": 2.7556086552502324, + "grad_norm": 0.4765741882696403, + "learning_rate": 2.8369274487508476e-06, + "loss": 0.5568, + "step": 10380 + }, + { + "epoch": 2.755874153723616, + "grad_norm": 0.4692609222245702, + "learning_rate": 2.836581502721505e-06, + "loss": 0.5665, + "step": 10381 + }, + { + "epoch": 2.7561396521969996, + "grad_norm": 0.44860205205317094, + "learning_rate": 2.8362355501279994e-06, + "loss": 0.5525, + "step": 10382 + }, + { + "epoch": 2.7564051506703837, + "grad_norm": 0.46521192222691604, + "learning_rate": 2.8358895909770766e-06, + "loss": 0.563, + "step": 10383 + }, + { + "epoch": 2.7566706491437674, + "grad_norm": 0.4474053102130928, + "learning_rate": 2.8355436252754847e-06, + "loss": 0.5842, + "step": 10384 + }, + { + "epoch": 2.756936147617151, + "grad_norm": 0.44445799731814095, + "learning_rate": 2.835197653029971e-06, + "loss": 0.5576, + "step": 10385 + }, + { + "epoch": 2.757201646090535, + "grad_norm": 0.45401756737886223, + "learning_rate": 2.834851674247282e-06, + "loss": 0.5407, + "step": 10386 + }, + { + "epoch": 2.7574671445639187, + "grad_norm": 0.43142535199135806, + "learning_rate": 2.8345056889341647e-06, + "loss": 0.5222, + "step": 10387 + }, + { + "epoch": 2.7577326430373024, + "grad_norm": 0.4214409156777529, + "learning_rate": 2.8341596970973683e-06, + "loss": 0.5426, + "step": 10388 + }, + { + "epoch": 2.7579981415106865, + "grad_norm": 0.4658747984371521, + "learning_rate": 2.8338136987436397e-06, + "loss": 0.5382, + "step": 10389 + }, + { + "epoch": 2.75826363998407, + "grad_norm": 0.44835710242458643, + "learning_rate": 2.8334676938797264e-06, + "loss": 0.566, + "step": 10390 + }, + { + "epoch": 2.7585291384574537, + "grad_norm": 0.43426629376815395, + "learning_rate": 2.833121682512377e-06, + "loss": 0.5949, + "step": 10391 + }, + { + "epoch": 2.758794636930838, + "grad_norm": 0.43755638084860565, + "learning_rate": 2.832775664648339e-06, + "loss": 0.537, + "step": 10392 + }, + { + "epoch": 2.7590601354042215, + "grad_norm": 0.43040285557481295, + "learning_rate": 2.8324296402943606e-06, + "loss": 0.5474, + "step": 10393 + }, + { + "epoch": 2.759325633877605, + "grad_norm": 0.44265473232882074, + "learning_rate": 2.832083609457191e-06, + "loss": 0.5799, + "step": 10394 + }, + { + "epoch": 2.759591132350989, + "grad_norm": 0.43971191906185214, + "learning_rate": 2.831737572143577e-06, + "loss": 0.5434, + "step": 10395 + }, + { + "epoch": 2.759856630824373, + "grad_norm": 0.4316051737507818, + "learning_rate": 2.8313915283602684e-06, + "loss": 0.5337, + "step": 10396 + }, + { + "epoch": 2.7601221292977565, + "grad_norm": 0.441735364058747, + "learning_rate": 2.831045478114013e-06, + "loss": 0.562, + "step": 10397 + }, + { + "epoch": 2.7603876277711406, + "grad_norm": 0.4334694278868336, + "learning_rate": 2.830699421411561e-06, + "loss": 0.5434, + "step": 10398 + }, + { + "epoch": 2.760653126244524, + "grad_norm": 0.45532016165336586, + "learning_rate": 2.8303533582596605e-06, + "loss": 0.5692, + "step": 10399 + }, + { + "epoch": 2.760918624717908, + "grad_norm": 0.4310481823721104, + "learning_rate": 2.8300072886650604e-06, + "loss": 0.5336, + "step": 10400 + }, + { + "epoch": 2.761184123191292, + "grad_norm": 0.458014931421607, + "learning_rate": 2.8296612126345104e-06, + "loss": 0.5351, + "step": 10401 + }, + { + "epoch": 2.7614496216646756, + "grad_norm": 0.4547647171633738, + "learning_rate": 2.82931513017476e-06, + "loss": 0.5587, + "step": 10402 + }, + { + "epoch": 2.761715120138059, + "grad_norm": 0.4296890502659497, + "learning_rate": 2.8289690412925573e-06, + "loss": 0.5666, + "step": 10403 + }, + { + "epoch": 2.761980618611443, + "grad_norm": 0.43374033439784093, + "learning_rate": 2.8286229459946534e-06, + "loss": 0.5645, + "step": 10404 + }, + { + "epoch": 2.762246117084827, + "grad_norm": 0.4288191046132695, + "learning_rate": 2.8282768442877976e-06, + "loss": 0.5945, + "step": 10405 + }, + { + "epoch": 2.7625116155582106, + "grad_norm": 0.4433726197619747, + "learning_rate": 2.8279307361787394e-06, + "loss": 0.5722, + "step": 10406 + }, + { + "epoch": 2.762777114031594, + "grad_norm": 0.4461913777228567, + "learning_rate": 2.8275846216742287e-06, + "loss": 0.5763, + "step": 10407 + }, + { + "epoch": 2.763042612504978, + "grad_norm": 0.4394180484470262, + "learning_rate": 2.8272385007810166e-06, + "loss": 0.5343, + "step": 10408 + }, + { + "epoch": 2.763308110978362, + "grad_norm": 0.43727444056490866, + "learning_rate": 2.8268923735058516e-06, + "loss": 0.5873, + "step": 10409 + }, + { + "epoch": 2.7635736094517456, + "grad_norm": 0.4423476244037865, + "learning_rate": 2.8265462398554847e-06, + "loss": 0.5446, + "step": 10410 + }, + { + "epoch": 2.763839107925129, + "grad_norm": 0.44610067424462474, + "learning_rate": 2.826200099836668e-06, + "loss": 0.5778, + "step": 10411 + }, + { + "epoch": 2.7641046063985133, + "grad_norm": 0.4630554212385094, + "learning_rate": 2.8258539534561497e-06, + "loss": 0.5164, + "step": 10412 + }, + { + "epoch": 2.764370104871897, + "grad_norm": 0.4252987547976094, + "learning_rate": 2.8255078007206817e-06, + "loss": 0.5446, + "step": 10413 + }, + { + "epoch": 2.7646356033452806, + "grad_norm": 0.43068527404955054, + "learning_rate": 2.8251616416370147e-06, + "loss": 0.5512, + "step": 10414 + }, + { + "epoch": 2.7649011018186647, + "grad_norm": 0.4516559538378109, + "learning_rate": 2.8248154762118995e-06, + "loss": 0.5083, + "step": 10415 + }, + { + "epoch": 2.7651666002920483, + "grad_norm": 0.4511639399421076, + "learning_rate": 2.824469304452087e-06, + "loss": 0.5812, + "step": 10416 + }, + { + "epoch": 2.765432098765432, + "grad_norm": 0.43816184492083227, + "learning_rate": 2.824123126364329e-06, + "loss": 0.5845, + "step": 10417 + }, + { + "epoch": 2.765697597238816, + "grad_norm": 0.4395001660844736, + "learning_rate": 2.8237769419553757e-06, + "loss": 0.5315, + "step": 10418 + }, + { + "epoch": 2.7659630957121997, + "grad_norm": 0.43088502168696324, + "learning_rate": 2.82343075123198e-06, + "loss": 0.6021, + "step": 10419 + }, + { + "epoch": 2.7662285941855833, + "grad_norm": 0.44579347053699364, + "learning_rate": 2.8230845542008927e-06, + "loss": 0.5777, + "step": 10420 + }, + { + "epoch": 2.7664940926589674, + "grad_norm": 0.45889474200732183, + "learning_rate": 2.8227383508688653e-06, + "loss": 0.5324, + "step": 10421 + }, + { + "epoch": 2.766759591132351, + "grad_norm": 0.446563758104612, + "learning_rate": 2.8223921412426504e-06, + "loss": 0.5544, + "step": 10422 + }, + { + "epoch": 2.7670250896057347, + "grad_norm": 0.43395202679761435, + "learning_rate": 2.822045925328999e-06, + "loss": 0.5348, + "step": 10423 + }, + { + "epoch": 2.7672905880791188, + "grad_norm": 0.4362175831318424, + "learning_rate": 2.8216997031346633e-06, + "loss": 0.5746, + "step": 10424 + }, + { + "epoch": 2.7675560865525024, + "grad_norm": 0.4535493488882397, + "learning_rate": 2.8213534746663958e-06, + "loss": 0.579, + "step": 10425 + }, + { + "epoch": 2.767821585025886, + "grad_norm": 0.44663352989242516, + "learning_rate": 2.8210072399309484e-06, + "loss": 0.5618, + "step": 10426 + }, + { + "epoch": 2.76808708349927, + "grad_norm": 0.43944910461314524, + "learning_rate": 2.820660998935074e-06, + "loss": 0.5409, + "step": 10427 + }, + { + "epoch": 2.7683525819726538, + "grad_norm": 0.43732145388218513, + "learning_rate": 2.820314751685526e-06, + "loss": 0.5753, + "step": 10428 + }, + { + "epoch": 2.7686180804460374, + "grad_norm": 0.4268623368929724, + "learning_rate": 2.819968498189055e-06, + "loss": 0.5521, + "step": 10429 + }, + { + "epoch": 2.768883578919421, + "grad_norm": 0.4341207782016763, + "learning_rate": 2.819622238452415e-06, + "loss": 0.5764, + "step": 10430 + }, + { + "epoch": 2.769149077392805, + "grad_norm": 0.444066710970643, + "learning_rate": 2.8192759724823598e-06, + "loss": 0.5484, + "step": 10431 + }, + { + "epoch": 2.7694145758661888, + "grad_norm": 0.44742294242189534, + "learning_rate": 2.8189297002856407e-06, + "loss": 0.5611, + "step": 10432 + }, + { + "epoch": 2.7696800743395724, + "grad_norm": 0.4251223472526734, + "learning_rate": 2.8185834218690117e-06, + "loss": 0.5575, + "step": 10433 + }, + { + "epoch": 2.769945572812956, + "grad_norm": 0.44098748647605973, + "learning_rate": 2.8182371372392253e-06, + "loss": 0.5462, + "step": 10434 + }, + { + "epoch": 2.77021107128634, + "grad_norm": 0.4349442280508103, + "learning_rate": 2.8178908464030363e-06, + "loss": 0.6108, + "step": 10435 + }, + { + "epoch": 2.770476569759724, + "grad_norm": 0.4482186329336556, + "learning_rate": 2.817544549367197e-06, + "loss": 0.5364, + "step": 10436 + }, + { + "epoch": 2.7707420682331074, + "grad_norm": 0.4455139050163501, + "learning_rate": 2.8171982461384616e-06, + "loss": 0.5541, + "step": 10437 + }, + { + "epoch": 2.7710075667064915, + "grad_norm": 0.4376667630388307, + "learning_rate": 2.816851936723584e-06, + "loss": 0.5764, + "step": 10438 + }, + { + "epoch": 2.771273065179875, + "grad_norm": 0.4440568799011048, + "learning_rate": 2.8165056211293186e-06, + "loss": 0.5824, + "step": 10439 + }, + { + "epoch": 2.771538563653259, + "grad_norm": 0.45588536453291484, + "learning_rate": 2.8161592993624175e-06, + "loss": 0.5034, + "step": 10440 + }, + { + "epoch": 2.771804062126643, + "grad_norm": 0.4097374512846095, + "learning_rate": 2.815812971429636e-06, + "loss": 0.5279, + "step": 10441 + }, + { + "epoch": 2.7720695606000265, + "grad_norm": 0.44841972684479187, + "learning_rate": 2.815466637337728e-06, + "loss": 0.5707, + "step": 10442 + }, + { + "epoch": 2.77233505907341, + "grad_norm": 0.4338551234463223, + "learning_rate": 2.8151202970934487e-06, + "loss": 0.5716, + "step": 10443 + }, + { + "epoch": 2.7726005575467942, + "grad_norm": 0.4365821609522977, + "learning_rate": 2.8147739507035522e-06, + "loss": 0.5778, + "step": 10444 + }, + { + "epoch": 2.772866056020178, + "grad_norm": 0.4405568759647794, + "learning_rate": 2.8144275981747934e-06, + "loss": 0.5236, + "step": 10445 + }, + { + "epoch": 2.7731315544935615, + "grad_norm": 0.4514311872677338, + "learning_rate": 2.814081239513926e-06, + "loss": 0.5475, + "step": 10446 + }, + { + "epoch": 2.7733970529669456, + "grad_norm": 0.4377383858435026, + "learning_rate": 2.813734874727705e-06, + "loss": 0.5383, + "step": 10447 + }, + { + "epoch": 2.7736625514403292, + "grad_norm": 0.4357072195342958, + "learning_rate": 2.813388503822887e-06, + "loss": 0.5416, + "step": 10448 + }, + { + "epoch": 2.773928049913713, + "grad_norm": 0.4428693730865895, + "learning_rate": 2.8130421268062247e-06, + "loss": 0.5884, + "step": 10449 + }, + { + "epoch": 2.774193548387097, + "grad_norm": 0.4351616711546554, + "learning_rate": 2.8126957436844747e-06, + "loss": 0.5402, + "step": 10450 + }, + { + "epoch": 2.7744590468604806, + "grad_norm": 0.43107814665507527, + "learning_rate": 2.8123493544643924e-06, + "loss": 0.552, + "step": 10451 + }, + { + "epoch": 2.7747245453338643, + "grad_norm": 0.44684950829612546, + "learning_rate": 2.812002959152733e-06, + "loss": 0.627, + "step": 10452 + }, + { + "epoch": 2.7749900438072483, + "grad_norm": 0.4441425613276137, + "learning_rate": 2.811656557756251e-06, + "loss": 0.5809, + "step": 10453 + }, + { + "epoch": 2.775255542280632, + "grad_norm": 0.44871380809170247, + "learning_rate": 2.8113101502817044e-06, + "loss": 0.5329, + "step": 10454 + }, + { + "epoch": 2.7755210407540156, + "grad_norm": 0.453442819464806, + "learning_rate": 2.810963736735847e-06, + "loss": 0.56, + "step": 10455 + }, + { + "epoch": 2.7757865392273997, + "grad_norm": 0.440766513265747, + "learning_rate": 2.8106173171254353e-06, + "loss": 0.5357, + "step": 10456 + }, + { + "epoch": 2.7760520377007833, + "grad_norm": 0.4361895433190712, + "learning_rate": 2.810270891457226e-06, + "loss": 0.5543, + "step": 10457 + }, + { + "epoch": 2.776317536174167, + "grad_norm": 0.4292111533195813, + "learning_rate": 2.809924459737974e-06, + "loss": 0.5532, + "step": 10458 + }, + { + "epoch": 2.7765830346475506, + "grad_norm": 0.4359804552692377, + "learning_rate": 2.809578021974436e-06, + "loss": 0.59, + "step": 10459 + }, + { + "epoch": 2.7768485331209347, + "grad_norm": 0.4516275071504658, + "learning_rate": 2.80923157817337e-06, + "loss": 0.5493, + "step": 10460 + }, + { + "epoch": 2.7771140315943184, + "grad_norm": 0.44249514003272455, + "learning_rate": 2.8088851283415298e-06, + "loss": 0.5522, + "step": 10461 + }, + { + "epoch": 2.777379530067702, + "grad_norm": 0.4461739276153777, + "learning_rate": 2.8085386724856734e-06, + "loss": 0.5427, + "step": 10462 + }, + { + "epoch": 2.7776450285410856, + "grad_norm": 0.44795781004318747, + "learning_rate": 2.8081922106125577e-06, + "loss": 0.5409, + "step": 10463 + }, + { + "epoch": 2.7779105270144697, + "grad_norm": 0.43388589812294953, + "learning_rate": 2.807845742728939e-06, + "loss": 0.5413, + "step": 10464 + }, + { + "epoch": 2.7781760254878534, + "grad_norm": 0.44392046527414447, + "learning_rate": 2.807499268841575e-06, + "loss": 0.5694, + "step": 10465 + }, + { + "epoch": 2.778441523961237, + "grad_norm": 0.4491370408988263, + "learning_rate": 2.8071527889572224e-06, + "loss": 0.5827, + "step": 10466 + }, + { + "epoch": 2.778707022434621, + "grad_norm": 0.4300037785589767, + "learning_rate": 2.8068063030826383e-06, + "loss": 0.5395, + "step": 10467 + }, + { + "epoch": 2.7789725209080047, + "grad_norm": 0.45269389369088164, + "learning_rate": 2.80645981122458e-06, + "loss": 0.5593, + "step": 10468 + }, + { + "epoch": 2.7792380193813884, + "grad_norm": 0.4688278417951856, + "learning_rate": 2.806113313389805e-06, + "loss": 0.5629, + "step": 10469 + }, + { + "epoch": 2.7795035178547725, + "grad_norm": 0.43709253594876935, + "learning_rate": 2.805766809585071e-06, + "loss": 0.522, + "step": 10470 + }, + { + "epoch": 2.779769016328156, + "grad_norm": 0.4340647150768764, + "learning_rate": 2.8054202998171347e-06, + "loss": 0.5422, + "step": 10471 + }, + { + "epoch": 2.7800345148015397, + "grad_norm": 0.44013145330380476, + "learning_rate": 2.8050737840927554e-06, + "loss": 0.5842, + "step": 10472 + }, + { + "epoch": 2.780300013274924, + "grad_norm": 0.43444892789772244, + "learning_rate": 2.8047272624186905e-06, + "loss": 0.5404, + "step": 10473 + }, + { + "epoch": 2.7805655117483075, + "grad_norm": 0.460427172920911, + "learning_rate": 2.804380734801699e-06, + "loss": 0.5172, + "step": 10474 + }, + { + "epoch": 2.780831010221691, + "grad_norm": 0.44657994425161107, + "learning_rate": 2.804034201248536e-06, + "loss": 0.58, + "step": 10475 + }, + { + "epoch": 2.781096508695075, + "grad_norm": 0.4228911374224898, + "learning_rate": 2.8036876617659625e-06, + "loss": 0.5423, + "step": 10476 + }, + { + "epoch": 2.781362007168459, + "grad_norm": 0.4409461251556048, + "learning_rate": 2.8033411163607365e-06, + "loss": 0.5705, + "step": 10477 + }, + { + "epoch": 2.7816275056418425, + "grad_norm": 0.44718543232049196, + "learning_rate": 2.8029945650396155e-06, + "loss": 0.5716, + "step": 10478 + }, + { + "epoch": 2.7818930041152266, + "grad_norm": 0.4409114102438968, + "learning_rate": 2.802648007809359e-06, + "loss": 0.5636, + "step": 10479 + }, + { + "epoch": 2.78215850258861, + "grad_norm": 0.44209091517713006, + "learning_rate": 2.8023014446767243e-06, + "loss": 0.5588, + "step": 10480 + }, + { + "epoch": 2.782424001061994, + "grad_norm": 0.45289837051153686, + "learning_rate": 2.8019548756484722e-06, + "loss": 0.5811, + "step": 10481 + }, + { + "epoch": 2.782689499535378, + "grad_norm": 0.4578715155628391, + "learning_rate": 2.80160830073136e-06, + "loss": 0.5644, + "step": 10482 + }, + { + "epoch": 2.7829549980087616, + "grad_norm": 0.456940218300286, + "learning_rate": 2.8012617199321487e-06, + "loss": 0.5292, + "step": 10483 + }, + { + "epoch": 2.783220496482145, + "grad_norm": 0.4629766024596027, + "learning_rate": 2.8009151332575953e-06, + "loss": 0.5764, + "step": 10484 + }, + { + "epoch": 2.783485994955529, + "grad_norm": 0.43351524769943706, + "learning_rate": 2.80056854071446e-06, + "loss": 0.5496, + "step": 10485 + }, + { + "epoch": 2.783751493428913, + "grad_norm": 0.44332475123629084, + "learning_rate": 2.800221942309503e-06, + "loss": 0.5478, + "step": 10486 + }, + { + "epoch": 2.7840169919022966, + "grad_norm": 0.4391659746281752, + "learning_rate": 2.799875338049482e-06, + "loss": 0.5726, + "step": 10487 + }, + { + "epoch": 2.78428249037568, + "grad_norm": 0.45699278228552365, + "learning_rate": 2.799528727941158e-06, + "loss": 0.5486, + "step": 10488 + }, + { + "epoch": 2.784547988849064, + "grad_norm": 0.4527070693125884, + "learning_rate": 2.799182111991291e-06, + "loss": 0.5839, + "step": 10489 + }, + { + "epoch": 2.784813487322448, + "grad_norm": 0.43521745818840013, + "learning_rate": 2.7988354902066396e-06, + "loss": 0.5312, + "step": 10490 + }, + { + "epoch": 2.7850789857958316, + "grad_norm": 0.4334742801710779, + "learning_rate": 2.798488862593965e-06, + "loss": 0.5466, + "step": 10491 + }, + { + "epoch": 2.785344484269215, + "grad_norm": 0.44982183326516045, + "learning_rate": 2.7981422291600267e-06, + "loss": 0.5531, + "step": 10492 + }, + { + "epoch": 2.7856099827425993, + "grad_norm": 0.43731541186878287, + "learning_rate": 2.7977955899115846e-06, + "loss": 0.5351, + "step": 10493 + }, + { + "epoch": 2.785875481215983, + "grad_norm": 0.44127802375670694, + "learning_rate": 2.7974489448553997e-06, + "loss": 0.5567, + "step": 10494 + }, + { + "epoch": 2.7861409796893666, + "grad_norm": 0.4308114385907659, + "learning_rate": 2.797102293998232e-06, + "loss": 0.5212, + "step": 10495 + }, + { + "epoch": 2.7864064781627507, + "grad_norm": 0.4351691881307375, + "learning_rate": 2.7967556373468417e-06, + "loss": 0.5493, + "step": 10496 + }, + { + "epoch": 2.7866719766361343, + "grad_norm": 0.43146999624051124, + "learning_rate": 2.7964089749079907e-06, + "loss": 0.5337, + "step": 10497 + }, + { + "epoch": 2.786937475109518, + "grad_norm": 0.4306086596391168, + "learning_rate": 2.7960623066884384e-06, + "loss": 0.556, + "step": 10498 + }, + { + "epoch": 2.787202973582902, + "grad_norm": 0.44025750445403783, + "learning_rate": 2.7957156326949458e-06, + "loss": 0.5554, + "step": 10499 + }, + { + "epoch": 2.7874684720562857, + "grad_norm": 0.44842871131581036, + "learning_rate": 2.795368952934275e-06, + "loss": 0.5596, + "step": 10500 + }, + { + "epoch": 2.7877339705296693, + "grad_norm": 0.4322385909166665, + "learning_rate": 2.795022267413187e-06, + "loss": 0.577, + "step": 10501 + }, + { + "epoch": 2.7879994690030534, + "grad_norm": 0.4431076615005532, + "learning_rate": 2.7946755761384416e-06, + "loss": 0.546, + "step": 10502 + }, + { + "epoch": 2.788264967476437, + "grad_norm": 0.42339241529810967, + "learning_rate": 2.7943288791168015e-06, + "loss": 0.5182, + "step": 10503 + }, + { + "epoch": 2.7885304659498207, + "grad_norm": 0.43891955385552456, + "learning_rate": 2.7939821763550275e-06, + "loss": 0.561, + "step": 10504 + }, + { + "epoch": 2.7887959644232048, + "grad_norm": 0.442670818292544, + "learning_rate": 2.7936354678598806e-06, + "loss": 0.5751, + "step": 10505 + }, + { + "epoch": 2.7890614628965884, + "grad_norm": 0.43367325477647933, + "learning_rate": 2.793288753638124e-06, + "loss": 0.5303, + "step": 10506 + }, + { + "epoch": 2.789326961369972, + "grad_norm": 0.433386786819893, + "learning_rate": 2.792942033696518e-06, + "loss": 0.5627, + "step": 10507 + }, + { + "epoch": 2.789592459843356, + "grad_norm": 0.4498891646756554, + "learning_rate": 2.792595308041825e-06, + "loss": 0.5483, + "step": 10508 + }, + { + "epoch": 2.7898579583167398, + "grad_norm": 0.44050096668010646, + "learning_rate": 2.792248576680807e-06, + "loss": 0.5427, + "step": 10509 + }, + { + "epoch": 2.7901234567901234, + "grad_norm": 0.42376822589900326, + "learning_rate": 2.791901839620227e-06, + "loss": 0.5499, + "step": 10510 + }, + { + "epoch": 2.7903889552635075, + "grad_norm": 0.41797640623111665, + "learning_rate": 2.7915550968668457e-06, + "loss": 0.4954, + "step": 10511 + }, + { + "epoch": 2.790654453736891, + "grad_norm": 0.4543151101844229, + "learning_rate": 2.7912083484274266e-06, + "loss": 0.5929, + "step": 10512 + }, + { + "epoch": 2.790919952210275, + "grad_norm": 0.4402275738029535, + "learning_rate": 2.790861594308732e-06, + "loss": 0.5427, + "step": 10513 + }, + { + "epoch": 2.7911854506836584, + "grad_norm": 0.4425170165553875, + "learning_rate": 2.7905148345175236e-06, + "loss": 0.6049, + "step": 10514 + }, + { + "epoch": 2.7914509491570425, + "grad_norm": 0.44340689957085677, + "learning_rate": 2.7901680690605646e-06, + "loss": 0.5821, + "step": 10515 + }, + { + "epoch": 2.791716447630426, + "grad_norm": 0.4392457825631366, + "learning_rate": 2.7898212979446184e-06, + "loss": 0.6112, + "step": 10516 + }, + { + "epoch": 2.79198194610381, + "grad_norm": 0.42883478684387166, + "learning_rate": 2.7894745211764462e-06, + "loss": 0.522, + "step": 10517 + }, + { + "epoch": 2.7922474445771934, + "grad_norm": 0.42951308276688926, + "learning_rate": 2.789127738762813e-06, + "loss": 0.5389, + "step": 10518 + }, + { + "epoch": 2.7925129430505775, + "grad_norm": 0.44832622581488474, + "learning_rate": 2.7887809507104807e-06, + "loss": 0.5555, + "step": 10519 + }, + { + "epoch": 2.792778441523961, + "grad_norm": 0.4412561276020851, + "learning_rate": 2.788434157026213e-06, + "loss": 0.52, + "step": 10520 + }, + { + "epoch": 2.793043939997345, + "grad_norm": 0.4478459143617638, + "learning_rate": 2.7880873577167726e-06, + "loss": 0.5607, + "step": 10521 + }, + { + "epoch": 2.793309438470729, + "grad_norm": 0.45758509786411566, + "learning_rate": 2.7877405527889232e-06, + "loss": 0.5664, + "step": 10522 + }, + { + "epoch": 2.7935749369441125, + "grad_norm": 0.4272726399720209, + "learning_rate": 2.7873937422494298e-06, + "loss": 0.5271, + "step": 10523 + }, + { + "epoch": 2.793840435417496, + "grad_norm": 0.4487350175073668, + "learning_rate": 2.7870469261050536e-06, + "loss": 0.5741, + "step": 10524 + }, + { + "epoch": 2.7941059338908802, + "grad_norm": 0.43904134072888507, + "learning_rate": 2.7867001043625596e-06, + "loss": 0.5658, + "step": 10525 + }, + { + "epoch": 2.794371432364264, + "grad_norm": 0.4419789448856741, + "learning_rate": 2.786353277028712e-06, + "loss": 0.5265, + "step": 10526 + }, + { + "epoch": 2.7946369308376475, + "grad_norm": 0.4437510334547723, + "learning_rate": 2.786006444110274e-06, + "loss": 0.5602, + "step": 10527 + }, + { + "epoch": 2.7949024293110316, + "grad_norm": 0.43769917218369814, + "learning_rate": 2.78565960561401e-06, + "loss": 0.5493, + "step": 10528 + }, + { + "epoch": 2.7951679277844153, + "grad_norm": 0.43720577913508585, + "learning_rate": 2.785312761546685e-06, + "loss": 0.551, + "step": 10529 + }, + { + "epoch": 2.795433426257799, + "grad_norm": 0.4252455232570073, + "learning_rate": 2.7849659119150617e-06, + "loss": 0.5013, + "step": 10530 + }, + { + "epoch": 2.795698924731183, + "grad_norm": 0.42993662535985805, + "learning_rate": 2.7846190567259065e-06, + "loss": 0.5325, + "step": 10531 + }, + { + "epoch": 2.7959644232045666, + "grad_norm": 0.4510636230448123, + "learning_rate": 2.784272195985982e-06, + "loss": 0.6062, + "step": 10532 + }, + { + "epoch": 2.7962299216779503, + "grad_norm": 0.4449948562515512, + "learning_rate": 2.783925329702054e-06, + "loss": 0.6123, + "step": 10533 + }, + { + "epoch": 2.7964954201513343, + "grad_norm": 0.43664890573548476, + "learning_rate": 2.7835784578808867e-06, + "loss": 0.5426, + "step": 10534 + }, + { + "epoch": 2.796760918624718, + "grad_norm": 0.4341847820773193, + "learning_rate": 2.783231580529245e-06, + "loss": 0.5775, + "step": 10535 + }, + { + "epoch": 2.7970264170981016, + "grad_norm": 0.4466786420060746, + "learning_rate": 2.7828846976538937e-06, + "loss": 0.562, + "step": 10536 + }, + { + "epoch": 2.7972919155714857, + "grad_norm": 0.44597150859224177, + "learning_rate": 2.7825378092615996e-06, + "loss": 0.5481, + "step": 10537 + }, + { + "epoch": 2.7975574140448694, + "grad_norm": 0.4445035943142575, + "learning_rate": 2.7821909153591254e-06, + "loss": 0.5456, + "step": 10538 + }, + { + "epoch": 2.797822912518253, + "grad_norm": 0.4568422746265537, + "learning_rate": 2.781844015953237e-06, + "loss": 0.5677, + "step": 10539 + }, + { + "epoch": 2.7980884109916366, + "grad_norm": 0.44309339754799343, + "learning_rate": 2.781497111050701e-06, + "loss": 0.5617, + "step": 10540 + }, + { + "epoch": 2.7983539094650207, + "grad_norm": 0.4456142447545801, + "learning_rate": 2.7811502006582817e-06, + "loss": 0.5814, + "step": 10541 + }, + { + "epoch": 2.7986194079384044, + "grad_norm": 0.42861490071073605, + "learning_rate": 2.7808032847827454e-06, + "loss": 0.5524, + "step": 10542 + }, + { + "epoch": 2.798884906411788, + "grad_norm": 0.43350977887963693, + "learning_rate": 2.780456363430858e-06, + "loss": 0.5702, + "step": 10543 + }, + { + "epoch": 2.7991504048851716, + "grad_norm": 0.4328383772956244, + "learning_rate": 2.7801094366093832e-06, + "loss": 0.5248, + "step": 10544 + }, + { + "epoch": 2.7994159033585557, + "grad_norm": 0.43908501157336555, + "learning_rate": 2.77976250432509e-06, + "loss": 0.5667, + "step": 10545 + }, + { + "epoch": 2.7996814018319394, + "grad_norm": 0.45323643999360086, + "learning_rate": 2.7794155665847428e-06, + "loss": 0.5803, + "step": 10546 + }, + { + "epoch": 2.799946900305323, + "grad_norm": 0.44208681324142574, + "learning_rate": 2.7790686233951074e-06, + "loss": 0.589, + "step": 10547 + }, + { + "epoch": 2.800212398778707, + "grad_norm": 0.4441191195150581, + "learning_rate": 2.7787216747629508e-06, + "loss": 0.5544, + "step": 10548 + }, + { + "epoch": 2.8004778972520907, + "grad_norm": 0.45056287323084654, + "learning_rate": 2.7783747206950394e-06, + "loss": 0.55, + "step": 10549 + }, + { + "epoch": 2.8007433957254744, + "grad_norm": 0.4422943593481195, + "learning_rate": 2.7780277611981394e-06, + "loss": 0.5398, + "step": 10550 + }, + { + "epoch": 2.8010088941988585, + "grad_norm": 0.4467493248070624, + "learning_rate": 2.7776807962790168e-06, + "loss": 0.564, + "step": 10551 + }, + { + "epoch": 2.801274392672242, + "grad_norm": 0.43920425013664993, + "learning_rate": 2.7773338259444395e-06, + "loss": 0.552, + "step": 10552 + }, + { + "epoch": 2.8015398911456257, + "grad_norm": 0.443143371632332, + "learning_rate": 2.776986850201173e-06, + "loss": 0.5491, + "step": 10553 + }, + { + "epoch": 2.80180538961901, + "grad_norm": 0.427892193826595, + "learning_rate": 2.776639869055984e-06, + "loss": 0.5907, + "step": 10554 + }, + { + "epoch": 2.8020708880923935, + "grad_norm": 0.43638936817574187, + "learning_rate": 2.7762928825156417e-06, + "loss": 0.5416, + "step": 10555 + }, + { + "epoch": 2.802336386565777, + "grad_norm": 0.4264751950088093, + "learning_rate": 2.7759458905869106e-06, + "loss": 0.6014, + "step": 10556 + }, + { + "epoch": 2.802601885039161, + "grad_norm": 0.4388001403143616, + "learning_rate": 2.77559889327656e-06, + "loss": 0.5757, + "step": 10557 + }, + { + "epoch": 2.802867383512545, + "grad_norm": 0.4366498684355708, + "learning_rate": 2.775251890591356e-06, + "loss": 0.5693, + "step": 10558 + }, + { + "epoch": 2.8031328819859285, + "grad_norm": 0.4442745644298153, + "learning_rate": 2.774904882538066e-06, + "loss": 0.5953, + "step": 10559 + }, + { + "epoch": 2.8033983804593126, + "grad_norm": 0.43767105485588054, + "learning_rate": 2.774557869123457e-06, + "loss": 0.5543, + "step": 10560 + }, + { + "epoch": 2.803663878932696, + "grad_norm": 0.4331951885932399, + "learning_rate": 2.7742108503542984e-06, + "loss": 0.5588, + "step": 10561 + }, + { + "epoch": 2.80392937740608, + "grad_norm": 0.4401328264003281, + "learning_rate": 2.773863826237356e-06, + "loss": 0.5742, + "step": 10562 + }, + { + "epoch": 2.804194875879464, + "grad_norm": 0.436347563718071, + "learning_rate": 2.7735167967793992e-06, + "loss": 0.5343, + "step": 10563 + }, + { + "epoch": 2.8044603743528476, + "grad_norm": 0.4426794260742564, + "learning_rate": 2.7731697619871943e-06, + "loss": 0.5871, + "step": 10564 + }, + { + "epoch": 2.804725872826231, + "grad_norm": 0.42455976872486323, + "learning_rate": 2.772822721867511e-06, + "loss": 0.5224, + "step": 10565 + }, + { + "epoch": 2.8049913712996153, + "grad_norm": 0.4463293295311799, + "learning_rate": 2.7724756764271168e-06, + "loss": 0.5476, + "step": 10566 + }, + { + "epoch": 2.805256869772999, + "grad_norm": 0.4452776662208447, + "learning_rate": 2.7721286256727793e-06, + "loss": 0.5724, + "step": 10567 + }, + { + "epoch": 2.8055223682463826, + "grad_norm": 0.43512064655992394, + "learning_rate": 2.771781569611267e-06, + "loss": 0.5475, + "step": 10568 + }, + { + "epoch": 2.805787866719766, + "grad_norm": 0.4726925165112414, + "learning_rate": 2.7714345082493492e-06, + "loss": 0.5692, + "step": 10569 + }, + { + "epoch": 2.8060533651931503, + "grad_norm": 0.4419990293823392, + "learning_rate": 2.771087441593794e-06, + "loss": 0.5674, + "step": 10570 + }, + { + "epoch": 2.806318863666534, + "grad_norm": 0.42811353149230036, + "learning_rate": 2.770740369651369e-06, + "loss": 0.5481, + "step": 10571 + }, + { + "epoch": 2.8065843621399176, + "grad_norm": 0.43469683578923585, + "learning_rate": 2.7703932924288447e-06, + "loss": 0.5561, + "step": 10572 + }, + { + "epoch": 2.806849860613301, + "grad_norm": 0.43842701465361344, + "learning_rate": 2.770046209932989e-06, + "loss": 0.5467, + "step": 10573 + }, + { + "epoch": 2.8071153590866853, + "grad_norm": 0.4386137823938788, + "learning_rate": 2.7696991221705703e-06, + "loss": 0.5492, + "step": 10574 + }, + { + "epoch": 2.807380857560069, + "grad_norm": 0.4381966253564306, + "learning_rate": 2.76935202914836e-06, + "loss": 0.5477, + "step": 10575 + }, + { + "epoch": 2.8076463560334526, + "grad_norm": 0.44927958665691115, + "learning_rate": 2.769004930873125e-06, + "loss": 0.5964, + "step": 10576 + }, + { + "epoch": 2.8079118545068367, + "grad_norm": 0.4515440615149994, + "learning_rate": 2.7686578273516346e-06, + "loss": 0.5535, + "step": 10577 + }, + { + "epoch": 2.8081773529802203, + "grad_norm": 0.44200279878803483, + "learning_rate": 2.7683107185906593e-06, + "loss": 0.5499, + "step": 10578 + }, + { + "epoch": 2.808442851453604, + "grad_norm": 0.43924566835624673, + "learning_rate": 2.7679636045969678e-06, + "loss": 0.5548, + "step": 10579 + }, + { + "epoch": 2.808708349926988, + "grad_norm": 0.4483943066425492, + "learning_rate": 2.7676164853773303e-06, + "loss": 0.5726, + "step": 10580 + }, + { + "epoch": 2.8089738484003717, + "grad_norm": 0.4610698812710244, + "learning_rate": 2.7672693609385155e-06, + "loss": 0.5407, + "step": 10581 + }, + { + "epoch": 2.8092393468737553, + "grad_norm": 0.44800539585814536, + "learning_rate": 2.766922231287294e-06, + "loss": 0.5512, + "step": 10582 + }, + { + "epoch": 2.8095048453471394, + "grad_norm": 0.43799287921852015, + "learning_rate": 2.766575096430435e-06, + "loss": 0.5443, + "step": 10583 + }, + { + "epoch": 2.809770343820523, + "grad_norm": 0.44111341464566506, + "learning_rate": 2.76622795637471e-06, + "loss": 0.5682, + "step": 10584 + }, + { + "epoch": 2.8100358422939067, + "grad_norm": 0.4338807378969649, + "learning_rate": 2.7658808111268877e-06, + "loss": 0.5802, + "step": 10585 + }, + { + "epoch": 2.8103013407672908, + "grad_norm": 0.44332913970194315, + "learning_rate": 2.7655336606937384e-06, + "loss": 0.5451, + "step": 10586 + }, + { + "epoch": 2.8105668392406744, + "grad_norm": 0.45022207017053134, + "learning_rate": 2.7651865050820325e-06, + "loss": 0.5856, + "step": 10587 + }, + { + "epoch": 2.810832337714058, + "grad_norm": 0.4496839119812933, + "learning_rate": 2.7648393442985403e-06, + "loss": 0.5551, + "step": 10588 + }, + { + "epoch": 2.811097836187442, + "grad_norm": 0.4451370812827847, + "learning_rate": 2.7644921783500336e-06, + "loss": 0.5671, + "step": 10589 + }, + { + "epoch": 2.8113633346608258, + "grad_norm": 0.45048299410558373, + "learning_rate": 2.7641450072432795e-06, + "loss": 0.5605, + "step": 10590 + }, + { + "epoch": 2.8116288331342094, + "grad_norm": 0.4367601271746524, + "learning_rate": 2.763797830985053e-06, + "loss": 0.5944, + "step": 10591 + }, + { + "epoch": 2.8118943316075935, + "grad_norm": 0.4378956555887698, + "learning_rate": 2.7634506495821225e-06, + "loss": 0.5504, + "step": 10592 + }, + { + "epoch": 2.812159830080977, + "grad_norm": 0.444979611795338, + "learning_rate": 2.763103463041259e-06, + "loss": 0.5364, + "step": 10593 + }, + { + "epoch": 2.812425328554361, + "grad_norm": 0.44129340232543635, + "learning_rate": 2.7627562713692334e-06, + "loss": 0.5409, + "step": 10594 + }, + { + "epoch": 2.812690827027745, + "grad_norm": 0.44016923989723683, + "learning_rate": 2.7624090745728184e-06, + "loss": 0.5691, + "step": 10595 + }, + { + "epoch": 2.8129563255011285, + "grad_norm": 0.43654130244920164, + "learning_rate": 2.7620618726587832e-06, + "loss": 0.5718, + "step": 10596 + }, + { + "epoch": 2.813221823974512, + "grad_norm": 0.4273729110499703, + "learning_rate": 2.7617146656338995e-06, + "loss": 0.5603, + "step": 10597 + }, + { + "epoch": 2.813487322447896, + "grad_norm": 0.4520871551321622, + "learning_rate": 2.76136745350494e-06, + "loss": 0.5923, + "step": 10598 + }, + { + "epoch": 2.8137528209212794, + "grad_norm": 0.4451472313663982, + "learning_rate": 2.7610202362786743e-06, + "loss": 0.5816, + "step": 10599 + }, + { + "epoch": 2.8140183193946635, + "grad_norm": 0.45006732120869, + "learning_rate": 2.760673013961875e-06, + "loss": 0.5359, + "step": 10600 + }, + { + "epoch": 2.814283817868047, + "grad_norm": 0.45882437981787233, + "learning_rate": 2.760325786561314e-06, + "loss": 0.5354, + "step": 10601 + }, + { + "epoch": 2.814549316341431, + "grad_norm": 0.4410479847607938, + "learning_rate": 2.7599785540837624e-06, + "loss": 0.5213, + "step": 10602 + }, + { + "epoch": 2.814814814814815, + "grad_norm": 0.4358462493118188, + "learning_rate": 2.7596313165359926e-06, + "loss": 0.5556, + "step": 10603 + }, + { + "epoch": 2.8150803132881985, + "grad_norm": 0.4628377559891519, + "learning_rate": 2.759284073924777e-06, + "loss": 0.5763, + "step": 10604 + }, + { + "epoch": 2.815345811761582, + "grad_norm": 0.44226971996897274, + "learning_rate": 2.758936826256887e-06, + "loss": 0.5389, + "step": 10605 + }, + { + "epoch": 2.8156113102349662, + "grad_norm": 0.4557947426164022, + "learning_rate": 2.7585895735390945e-06, + "loss": 0.5746, + "step": 10606 + }, + { + "epoch": 2.81587680870835, + "grad_norm": 0.4436909232556306, + "learning_rate": 2.7582423157781725e-06, + "loss": 0.5735, + "step": 10607 + }, + { + "epoch": 2.8161423071817335, + "grad_norm": 0.4513578264659114, + "learning_rate": 2.7578950529808927e-06, + "loss": 0.5526, + "step": 10608 + }, + { + "epoch": 2.8164078056551176, + "grad_norm": 0.43611830729237494, + "learning_rate": 2.757547785154028e-06, + "loss": 0.5813, + "step": 10609 + }, + { + "epoch": 2.8166733041285013, + "grad_norm": 0.4416523485895305, + "learning_rate": 2.757200512304351e-06, + "loss": 0.5671, + "step": 10610 + }, + { + "epoch": 2.816938802601885, + "grad_norm": 0.45280427706140286, + "learning_rate": 2.756853234438634e-06, + "loss": 0.5884, + "step": 10611 + }, + { + "epoch": 2.817204301075269, + "grad_norm": 0.45190705040783713, + "learning_rate": 2.7565059515636503e-06, + "loss": 0.5268, + "step": 10612 + }, + { + "epoch": 2.8174697995486526, + "grad_norm": 0.4452826536415397, + "learning_rate": 2.756158663686173e-06, + "loss": 0.5808, + "step": 10613 + }, + { + "epoch": 2.8177352980220363, + "grad_norm": 0.440392837647871, + "learning_rate": 2.755811370812974e-06, + "loss": 0.5502, + "step": 10614 + }, + { + "epoch": 2.8180007964954203, + "grad_norm": 0.436849901388548, + "learning_rate": 2.7554640729508273e-06, + "loss": 0.5335, + "step": 10615 + }, + { + "epoch": 2.818266294968804, + "grad_norm": 0.4418294719857132, + "learning_rate": 2.7551167701065052e-06, + "loss": 0.6036, + "step": 10616 + }, + { + "epoch": 2.8185317934421876, + "grad_norm": 0.4408439808679575, + "learning_rate": 2.7547694622867815e-06, + "loss": 0.588, + "step": 10617 + }, + { + "epoch": 2.8187972919155717, + "grad_norm": 0.4221400608339185, + "learning_rate": 2.75442214949843e-06, + "loss": 0.5193, + "step": 10618 + }, + { + "epoch": 2.8190627903889554, + "grad_norm": 0.42818284074166924, + "learning_rate": 2.754074831748223e-06, + "loss": 0.5349, + "step": 10619 + }, + { + "epoch": 2.819328288862339, + "grad_norm": 0.43532512602667456, + "learning_rate": 2.753727509042935e-06, + "loss": 0.5498, + "step": 10620 + }, + { + "epoch": 2.819593787335723, + "grad_norm": 0.42561874656225274, + "learning_rate": 2.7533801813893397e-06, + "loss": 0.5417, + "step": 10621 + }, + { + "epoch": 2.8198592858091067, + "grad_norm": 0.44509111314623134, + "learning_rate": 2.7530328487942097e-06, + "loss": 0.5784, + "step": 10622 + }, + { + "epoch": 2.8201247842824904, + "grad_norm": 0.42963079375850155, + "learning_rate": 2.7526855112643204e-06, + "loss": 0.5543, + "step": 10623 + }, + { + "epoch": 2.820390282755874, + "grad_norm": 0.43517236755634625, + "learning_rate": 2.7523381688064447e-06, + "loss": 0.5578, + "step": 10624 + }, + { + "epoch": 2.820655781229258, + "grad_norm": 0.4508145134821779, + "learning_rate": 2.7519908214273566e-06, + "loss": 0.5592, + "step": 10625 + }, + { + "epoch": 2.8209212797026417, + "grad_norm": 0.4348584154929015, + "learning_rate": 2.7516434691338304e-06, + "loss": 0.5543, + "step": 10626 + }, + { + "epoch": 2.8211867781760254, + "grad_norm": 0.4421029513540858, + "learning_rate": 2.751296111932641e-06, + "loss": 0.5499, + "step": 10627 + }, + { + "epoch": 2.821452276649409, + "grad_norm": 0.4484014381364127, + "learning_rate": 2.7509487498305615e-06, + "loss": 0.5611, + "step": 10628 + }, + { + "epoch": 2.821717775122793, + "grad_norm": 0.44609023722452157, + "learning_rate": 2.7506013828343674e-06, + "loss": 0.555, + "step": 10629 + }, + { + "epoch": 2.8219832735961767, + "grad_norm": 0.4359919685918703, + "learning_rate": 2.750254010950833e-06, + "loss": 0.5219, + "step": 10630 + }, + { + "epoch": 2.8222487720695604, + "grad_norm": 0.4467371403472016, + "learning_rate": 2.749906634186732e-06, + "loss": 0.5298, + "step": 10631 + }, + { + "epoch": 2.8225142705429445, + "grad_norm": 0.4426489507273086, + "learning_rate": 2.74955925254884e-06, + "loss": 0.5474, + "step": 10632 + }, + { + "epoch": 2.822779769016328, + "grad_norm": 0.4437186878869067, + "learning_rate": 2.749211866043932e-06, + "loss": 0.5577, + "step": 10633 + }, + { + "epoch": 2.8230452674897117, + "grad_norm": 0.4447663690369507, + "learning_rate": 2.748864474678782e-06, + "loss": 0.548, + "step": 10634 + }, + { + "epoch": 2.823310765963096, + "grad_norm": 0.44371540080373734, + "learning_rate": 2.7485170784601654e-06, + "loss": 0.5129, + "step": 10635 + }, + { + "epoch": 2.8235762644364795, + "grad_norm": 0.43376039298415986, + "learning_rate": 2.7481696773948573e-06, + "loss": 0.5481, + "step": 10636 + }, + { + "epoch": 2.823841762909863, + "grad_norm": 0.43534334528229096, + "learning_rate": 2.747822271489633e-06, + "loss": 0.5398, + "step": 10637 + }, + { + "epoch": 2.824107261383247, + "grad_norm": 0.44992331876505093, + "learning_rate": 2.747474860751268e-06, + "loss": 0.5866, + "step": 10638 + }, + { + "epoch": 2.824372759856631, + "grad_norm": 0.4254682402789493, + "learning_rate": 2.7471274451865364e-06, + "loss": 0.5721, + "step": 10639 + }, + { + "epoch": 2.8246382583300145, + "grad_norm": 0.44177179167206426, + "learning_rate": 2.746780024802215e-06, + "loss": 0.5404, + "step": 10640 + }, + { + "epoch": 2.8249037568033986, + "grad_norm": 0.4329871911643497, + "learning_rate": 2.7464325996050795e-06, + "loss": 0.5837, + "step": 10641 + }, + { + "epoch": 2.825169255276782, + "grad_norm": 0.4348738002248126, + "learning_rate": 2.7460851696019043e-06, + "loss": 0.5609, + "step": 10642 + }, + { + "epoch": 2.825434753750166, + "grad_norm": 0.4483380639457842, + "learning_rate": 2.745737734799465e-06, + "loss": 0.548, + "step": 10643 + }, + { + "epoch": 2.82570025222355, + "grad_norm": 0.42716177939380223, + "learning_rate": 2.74539029520454e-06, + "loss": 0.5117, + "step": 10644 + }, + { + "epoch": 2.8259657506969336, + "grad_norm": 0.45642103432381975, + "learning_rate": 2.7450428508239024e-06, + "loss": 0.5116, + "step": 10645 + }, + { + "epoch": 2.826231249170317, + "grad_norm": 0.4396893283828446, + "learning_rate": 2.7446954016643283e-06, + "loss": 0.5614, + "step": 10646 + }, + { + "epoch": 2.8264967476437013, + "grad_norm": 0.42226040100124007, + "learning_rate": 2.7443479477325964e-06, + "loss": 0.5723, + "step": 10647 + }, + { + "epoch": 2.826762246117085, + "grad_norm": 0.4346957939485996, + "learning_rate": 2.7440004890354804e-06, + "loss": 0.5772, + "step": 10648 + }, + { + "epoch": 2.8270277445904686, + "grad_norm": 0.4236382257836881, + "learning_rate": 2.743653025579757e-06, + "loss": 0.5057, + "step": 10649 + }, + { + "epoch": 2.8272932430638527, + "grad_norm": 0.4473666579567541, + "learning_rate": 2.7433055573722047e-06, + "loss": 0.5705, + "step": 10650 + }, + { + "epoch": 2.8275587415372363, + "grad_norm": 0.42682423423317256, + "learning_rate": 2.742958084419597e-06, + "loss": 0.5693, + "step": 10651 + }, + { + "epoch": 2.82782424001062, + "grad_norm": 0.4477324596604937, + "learning_rate": 2.7426106067287117e-06, + "loss": 0.5797, + "step": 10652 + }, + { + "epoch": 2.8280897384840036, + "grad_norm": 0.44378764045073016, + "learning_rate": 2.7422631243063265e-06, + "loss": 0.5419, + "step": 10653 + }, + { + "epoch": 2.828355236957387, + "grad_norm": 0.4464112571700783, + "learning_rate": 2.7419156371592166e-06, + "loss": 0.5737, + "step": 10654 + }, + { + "epoch": 2.8286207354307713, + "grad_norm": 0.4587008636993577, + "learning_rate": 2.7415681452941594e-06, + "loss": 0.5157, + "step": 10655 + }, + { + "epoch": 2.828886233904155, + "grad_norm": 0.44779470980037994, + "learning_rate": 2.7412206487179316e-06, + "loss": 0.5667, + "step": 10656 + }, + { + "epoch": 2.8291517323775386, + "grad_norm": 0.448422829382189, + "learning_rate": 2.7408731474373112e-06, + "loss": 0.5623, + "step": 10657 + }, + { + "epoch": 2.8294172308509227, + "grad_norm": 0.4437055962953726, + "learning_rate": 2.740525641459075e-06, + "loss": 0.5845, + "step": 10658 + }, + { + "epoch": 2.8296827293243063, + "grad_norm": 0.45619250344956913, + "learning_rate": 2.740178130789999e-06, + "loss": 0.5862, + "step": 10659 + }, + { + "epoch": 2.82994822779769, + "grad_norm": 0.4539452892278317, + "learning_rate": 2.7398306154368613e-06, + "loss": 0.5831, + "step": 10660 + }, + { + "epoch": 2.830213726271074, + "grad_norm": 0.4270186235202641, + "learning_rate": 2.7394830954064407e-06, + "loss": 0.5547, + "step": 10661 + }, + { + "epoch": 2.8304792247444577, + "grad_norm": 0.44383344157764537, + "learning_rate": 2.7391355707055118e-06, + "loss": 0.5884, + "step": 10662 + }, + { + "epoch": 2.8307447232178413, + "grad_norm": 0.4409802315806133, + "learning_rate": 2.7387880413408546e-06, + "loss": 0.5531, + "step": 10663 + }, + { + "epoch": 2.8310102216912254, + "grad_norm": 0.4421780167994675, + "learning_rate": 2.7384405073192454e-06, + "loss": 0.5678, + "step": 10664 + }, + { + "epoch": 2.831275720164609, + "grad_norm": 0.43736836288697917, + "learning_rate": 2.7380929686474627e-06, + "loss": 0.5518, + "step": 10665 + }, + { + "epoch": 2.8315412186379927, + "grad_norm": 0.44415480322706846, + "learning_rate": 2.7377454253322842e-06, + "loss": 0.5953, + "step": 10666 + }, + { + "epoch": 2.8318067171113768, + "grad_norm": 0.46039082399154857, + "learning_rate": 2.737397877380489e-06, + "loss": 0.5551, + "step": 10667 + }, + { + "epoch": 2.8320722155847604, + "grad_norm": 0.4320254677483836, + "learning_rate": 2.7370503247988523e-06, + "loss": 0.5235, + "step": 10668 + }, + { + "epoch": 2.832337714058144, + "grad_norm": 0.43583894508457366, + "learning_rate": 2.736702767594154e-06, + "loss": 0.5268, + "step": 10669 + }, + { + "epoch": 2.832603212531528, + "grad_norm": 0.43627043861537007, + "learning_rate": 2.7363552057731736e-06, + "loss": 0.5681, + "step": 10670 + }, + { + "epoch": 2.8328687110049118, + "grad_norm": 0.450135259881002, + "learning_rate": 2.7360076393426867e-06, + "loss": 0.5459, + "step": 10671 + }, + { + "epoch": 2.8331342094782954, + "grad_norm": 0.47295130059901647, + "learning_rate": 2.735660068309473e-06, + "loss": 0.5697, + "step": 10672 + }, + { + "epoch": 2.8333997079516795, + "grad_norm": 0.4513715877264598, + "learning_rate": 2.7353124926803113e-06, + "loss": 0.5937, + "step": 10673 + }, + { + "epoch": 2.833665206425063, + "grad_norm": 0.44250118610240136, + "learning_rate": 2.7349649124619797e-06, + "loss": 0.5773, + "step": 10674 + }, + { + "epoch": 2.833930704898447, + "grad_norm": 0.4393591288004187, + "learning_rate": 2.734617327661257e-06, + "loss": 0.5727, + "step": 10675 + }, + { + "epoch": 2.834196203371831, + "grad_norm": 0.4464833214164496, + "learning_rate": 2.7342697382849228e-06, + "loss": 0.5437, + "step": 10676 + }, + { + "epoch": 2.8344617018452145, + "grad_norm": 0.445081832161792, + "learning_rate": 2.733922144339754e-06, + "loss": 0.5344, + "step": 10677 + }, + { + "epoch": 2.834727200318598, + "grad_norm": 0.4438103958959221, + "learning_rate": 2.7335745458325307e-06, + "loss": 0.5535, + "step": 10678 + }, + { + "epoch": 2.834992698791982, + "grad_norm": 0.4459330265843424, + "learning_rate": 2.733226942770033e-06, + "loss": 0.576, + "step": 10679 + }, + { + "epoch": 2.835258197265366, + "grad_norm": 0.4410880053471279, + "learning_rate": 2.732879335159038e-06, + "loss": 0.5907, + "step": 10680 + }, + { + "epoch": 2.8355236957387495, + "grad_norm": 0.4394805707102283, + "learning_rate": 2.7325317230063257e-06, + "loss": 0.549, + "step": 10681 + }, + { + "epoch": 2.835789194212133, + "grad_norm": 0.46052245355471666, + "learning_rate": 2.7321841063186754e-06, + "loss": 0.5839, + "step": 10682 + }, + { + "epoch": 2.836054692685517, + "grad_norm": 0.45082112542932135, + "learning_rate": 2.7318364851028666e-06, + "loss": 0.5412, + "step": 10683 + }, + { + "epoch": 2.836320191158901, + "grad_norm": 0.4446068158667599, + "learning_rate": 2.7314888593656797e-06, + "loss": 0.5673, + "step": 10684 + }, + { + "epoch": 2.8365856896322845, + "grad_norm": 0.43810989544211665, + "learning_rate": 2.7311412291138924e-06, + "loss": 0.5668, + "step": 10685 + }, + { + "epoch": 2.836851188105668, + "grad_norm": 0.4506932297462287, + "learning_rate": 2.730793594354285e-06, + "loss": 0.56, + "step": 10686 + }, + { + "epoch": 2.8371166865790522, + "grad_norm": 0.45080786409307233, + "learning_rate": 2.730445955093638e-06, + "loss": 0.5432, + "step": 10687 + }, + { + "epoch": 2.837382185052436, + "grad_norm": 0.4430069968994358, + "learning_rate": 2.7300983113387304e-06, + "loss": 0.552, + "step": 10688 + }, + { + "epoch": 2.8376476835258195, + "grad_norm": 0.4351373689010535, + "learning_rate": 2.7297506630963423e-06, + "loss": 0.5369, + "step": 10689 + }, + { + "epoch": 2.8379131819992036, + "grad_norm": 0.4317588986923279, + "learning_rate": 2.7294030103732537e-06, + "loss": 0.5732, + "step": 10690 + }, + { + "epoch": 2.8381786804725873, + "grad_norm": 0.45169959954142747, + "learning_rate": 2.7290553531762442e-06, + "loss": 0.5962, + "step": 10691 + }, + { + "epoch": 2.838444178945971, + "grad_norm": 0.4649722622860929, + "learning_rate": 2.7287076915120952e-06, + "loss": 0.5397, + "step": 10692 + }, + { + "epoch": 2.838709677419355, + "grad_norm": 0.4432483580745817, + "learning_rate": 2.7283600253875863e-06, + "loss": 0.5816, + "step": 10693 + }, + { + "epoch": 2.8389751758927386, + "grad_norm": 0.4404069548426601, + "learning_rate": 2.728012354809497e-06, + "loss": 0.5864, + "step": 10694 + }, + { + "epoch": 2.8392406743661223, + "grad_norm": 0.43347988161478124, + "learning_rate": 2.7276646797846096e-06, + "loss": 0.5278, + "step": 10695 + }, + { + "epoch": 2.8395061728395063, + "grad_norm": 0.4334464892160729, + "learning_rate": 2.7273170003197037e-06, + "loss": 0.5717, + "step": 10696 + }, + { + "epoch": 2.83977167131289, + "grad_norm": 0.4361039324888806, + "learning_rate": 2.726969316421558e-06, + "loss": 0.5205, + "step": 10697 + }, + { + "epoch": 2.8400371697862736, + "grad_norm": 0.42857044148210877, + "learning_rate": 2.7266216280969564e-06, + "loss": 0.5231, + "step": 10698 + }, + { + "epoch": 2.8403026682596577, + "grad_norm": 0.425869053781619, + "learning_rate": 2.726273935352678e-06, + "loss": 0.4966, + "step": 10699 + }, + { + "epoch": 2.8405681667330414, + "grad_norm": 0.4414217548627363, + "learning_rate": 2.725926238195504e-06, + "loss": 0.5761, + "step": 10700 + }, + { + "epoch": 2.840833665206425, + "grad_norm": 0.4397669819776329, + "learning_rate": 2.7255785366322134e-06, + "loss": 0.5731, + "step": 10701 + }, + { + "epoch": 2.841099163679809, + "grad_norm": 0.44793687205104676, + "learning_rate": 2.7252308306695914e-06, + "loss": 0.6089, + "step": 10702 + }, + { + "epoch": 2.8413646621531927, + "grad_norm": 0.44313943155049007, + "learning_rate": 2.7248831203144154e-06, + "loss": 0.561, + "step": 10703 + }, + { + "epoch": 2.8416301606265764, + "grad_norm": 0.44781255407094406, + "learning_rate": 2.724535405573468e-06, + "loss": 0.5778, + "step": 10704 + }, + { + "epoch": 2.8418956590999604, + "grad_norm": 0.4458138437268211, + "learning_rate": 2.724187686453531e-06, + "loss": 0.6014, + "step": 10705 + }, + { + "epoch": 2.842161157573344, + "grad_norm": 0.4396287078028262, + "learning_rate": 2.723839962961385e-06, + "loss": 0.5377, + "step": 10706 + }, + { + "epoch": 2.8424266560467277, + "grad_norm": 0.43263935211202564, + "learning_rate": 2.7234922351038117e-06, + "loss": 0.5612, + "step": 10707 + }, + { + "epoch": 2.8426921545201114, + "grad_norm": 0.449640752268622, + "learning_rate": 2.7231445028875924e-06, + "loss": 0.5506, + "step": 10708 + }, + { + "epoch": 2.842957652993495, + "grad_norm": 0.4425907480121384, + "learning_rate": 2.7227967663195092e-06, + "loss": 0.5848, + "step": 10709 + }, + { + "epoch": 2.843223151466879, + "grad_norm": 0.44000747206345897, + "learning_rate": 2.7224490254063436e-06, + "loss": 0.5847, + "step": 10710 + }, + { + "epoch": 2.8434886499402627, + "grad_norm": 0.44073374703708895, + "learning_rate": 2.7221012801548767e-06, + "loss": 0.5241, + "step": 10711 + }, + { + "epoch": 2.8437541484136464, + "grad_norm": 0.43607825742065365, + "learning_rate": 2.7217535305718914e-06, + "loss": 0.5619, + "step": 10712 + }, + { + "epoch": 2.8440196468870305, + "grad_norm": 0.44613009648645197, + "learning_rate": 2.72140577666417e-06, + "loss": 0.523, + "step": 10713 + }, + { + "epoch": 2.844285145360414, + "grad_norm": 0.4429421186037967, + "learning_rate": 2.7210580184384933e-06, + "loss": 0.5506, + "step": 10714 + }, + { + "epoch": 2.8445506438337977, + "grad_norm": 0.435176668911199, + "learning_rate": 2.7207102559016434e-06, + "loss": 0.579, + "step": 10715 + }, + { + "epoch": 2.844816142307182, + "grad_norm": 0.4446755129678316, + "learning_rate": 2.720362489060404e-06, + "loss": 0.5561, + "step": 10716 + }, + { + "epoch": 2.8450816407805655, + "grad_norm": 0.4681986655281571, + "learning_rate": 2.7200147179215563e-06, + "loss": 0.5488, + "step": 10717 + }, + { + "epoch": 2.845347139253949, + "grad_norm": 0.44203361895308124, + "learning_rate": 2.7196669424918828e-06, + "loss": 0.5782, + "step": 10718 + }, + { + "epoch": 2.845612637727333, + "grad_norm": 0.4502913671114277, + "learning_rate": 2.7193191627781656e-06, + "loss": 0.5982, + "step": 10719 + }, + { + "epoch": 2.845878136200717, + "grad_norm": 0.4475324941183209, + "learning_rate": 2.7189713787871887e-06, + "loss": 0.5543, + "step": 10720 + }, + { + "epoch": 2.8461436346741005, + "grad_norm": 0.4502905979729806, + "learning_rate": 2.718623590525733e-06, + "loss": 0.5787, + "step": 10721 + }, + { + "epoch": 2.8464091331474846, + "grad_norm": 0.42784602819205586, + "learning_rate": 2.7182757980005823e-06, + "loss": 0.5562, + "step": 10722 + }, + { + "epoch": 2.846674631620868, + "grad_norm": 0.44338859926273616, + "learning_rate": 2.7179280012185193e-06, + "loss": 0.5358, + "step": 10723 + }, + { + "epoch": 2.846940130094252, + "grad_norm": 0.45209777829945, + "learning_rate": 2.7175802001863267e-06, + "loss": 0.5828, + "step": 10724 + }, + { + "epoch": 2.847205628567636, + "grad_norm": 0.4532088818726932, + "learning_rate": 2.7172323949107874e-06, + "loss": 0.5591, + "step": 10725 + }, + { + "epoch": 2.8474711270410196, + "grad_norm": 0.4552617692595368, + "learning_rate": 2.716884585398684e-06, + "loss": 0.5327, + "step": 10726 + }, + { + "epoch": 2.847736625514403, + "grad_norm": 0.42108293350760584, + "learning_rate": 2.716536771656801e-06, + "loss": 0.5601, + "step": 10727 + }, + { + "epoch": 2.8480021239877873, + "grad_norm": 0.4478831595439803, + "learning_rate": 2.71618895369192e-06, + "loss": 0.5083, + "step": 10728 + }, + { + "epoch": 2.848267622461171, + "grad_norm": 0.44615603413500154, + "learning_rate": 2.7158411315108254e-06, + "loss": 0.5794, + "step": 10729 + }, + { + "epoch": 2.8485331209345546, + "grad_norm": 0.445896856924272, + "learning_rate": 2.715493305120301e-06, + "loss": 0.5812, + "step": 10730 + }, + { + "epoch": 2.8487986194079387, + "grad_norm": 0.43854884183833776, + "learning_rate": 2.715145474527128e-06, + "loss": 0.547, + "step": 10731 + }, + { + "epoch": 2.8490641178813223, + "grad_norm": 0.4458427547571963, + "learning_rate": 2.7147976397380925e-06, + "loss": 0.5703, + "step": 10732 + }, + { + "epoch": 2.849329616354706, + "grad_norm": 0.4365011106527827, + "learning_rate": 2.7144498007599775e-06, + "loss": 0.5851, + "step": 10733 + }, + { + "epoch": 2.8495951148280896, + "grad_norm": 0.43704255022741134, + "learning_rate": 2.7141019575995657e-06, + "loss": 0.552, + "step": 10734 + }, + { + "epoch": 2.8498606133014737, + "grad_norm": 0.4548942591174282, + "learning_rate": 2.713754110263641e-06, + "loss": 0.6036, + "step": 10735 + }, + { + "epoch": 2.8501261117748573, + "grad_norm": 0.44620037362776677, + "learning_rate": 2.713406258758989e-06, + "loss": 0.5559, + "step": 10736 + }, + { + "epoch": 2.850391610248241, + "grad_norm": 0.4358700255720994, + "learning_rate": 2.7130584030923913e-06, + "loss": 0.5742, + "step": 10737 + }, + { + "epoch": 2.8506571087216246, + "grad_norm": 0.4375251376284897, + "learning_rate": 2.7127105432706335e-06, + "loss": 0.5173, + "step": 10738 + }, + { + "epoch": 2.8509226071950087, + "grad_norm": 0.4354691859850981, + "learning_rate": 2.7123626793004997e-06, + "loss": 0.5564, + "step": 10739 + }, + { + "epoch": 2.8511881056683923, + "grad_norm": 0.43722452970994985, + "learning_rate": 2.7120148111887732e-06, + "loss": 0.5931, + "step": 10740 + }, + { + "epoch": 2.851453604141776, + "grad_norm": 0.4454867063980965, + "learning_rate": 2.711666938942239e-06, + "loss": 0.5217, + "step": 10741 + }, + { + "epoch": 2.85171910261516, + "grad_norm": 0.44099269311670675, + "learning_rate": 2.7113190625676816e-06, + "loss": 0.5609, + "step": 10742 + }, + { + "epoch": 2.8519846010885437, + "grad_norm": 0.43623175418248467, + "learning_rate": 2.710971182071884e-06, + "loss": 0.5685, + "step": 10743 + }, + { + "epoch": 2.8522500995619273, + "grad_norm": 0.4384356765458919, + "learning_rate": 2.710623297461633e-06, + "loss": 0.5194, + "step": 10744 + }, + { + "epoch": 2.8525155980353114, + "grad_norm": 0.4442303572050867, + "learning_rate": 2.7102754087437117e-06, + "loss": 0.5547, + "step": 10745 + }, + { + "epoch": 2.852781096508695, + "grad_norm": 0.44960201823576745, + "learning_rate": 2.709927515924905e-06, + "loss": 0.5857, + "step": 10746 + }, + { + "epoch": 2.8530465949820787, + "grad_norm": 0.45029950101068483, + "learning_rate": 2.7095796190119967e-06, + "loss": 0.5279, + "step": 10747 + }, + { + "epoch": 2.8533120934554628, + "grad_norm": 0.436445103278809, + "learning_rate": 2.709231718011774e-06, + "loss": 0.5582, + "step": 10748 + }, + { + "epoch": 2.8535775919288464, + "grad_norm": 0.44626933424831894, + "learning_rate": 2.7088838129310202e-06, + "loss": 0.5666, + "step": 10749 + }, + { + "epoch": 2.85384309040223, + "grad_norm": 0.4332365834903132, + "learning_rate": 2.7085359037765203e-06, + "loss": 0.5804, + "step": 10750 + }, + { + "epoch": 2.854108588875614, + "grad_norm": 0.45827132023683453, + "learning_rate": 2.7081879905550603e-06, + "loss": 0.5585, + "step": 10751 + }, + { + "epoch": 2.854374087348998, + "grad_norm": 0.44749313232517135, + "learning_rate": 2.7078400732734245e-06, + "loss": 0.5778, + "step": 10752 + }, + { + "epoch": 2.8546395858223814, + "grad_norm": 0.4510749864353565, + "learning_rate": 2.707492151938398e-06, + "loss": 0.5787, + "step": 10753 + }, + { + "epoch": 2.8549050842957655, + "grad_norm": 0.42752932002261185, + "learning_rate": 2.707144226556767e-06, + "loss": 0.5545, + "step": 10754 + }, + { + "epoch": 2.855170582769149, + "grad_norm": 0.43791650215467726, + "learning_rate": 2.706796297135316e-06, + "loss": 0.5458, + "step": 10755 + }, + { + "epoch": 2.855436081242533, + "grad_norm": 0.44658696973588446, + "learning_rate": 2.7064483636808314e-06, + "loss": 0.5472, + "step": 10756 + }, + { + "epoch": 2.855701579715917, + "grad_norm": 0.4688153715222747, + "learning_rate": 2.706100426200098e-06, + "loss": 0.5409, + "step": 10757 + }, + { + "epoch": 2.8559670781893005, + "grad_norm": 0.4427903144420845, + "learning_rate": 2.705752484699901e-06, + "loss": 0.5828, + "step": 10758 + }, + { + "epoch": 2.856232576662684, + "grad_norm": 0.4316400515050232, + "learning_rate": 2.7054045391870274e-06, + "loss": 0.5671, + "step": 10759 + }, + { + "epoch": 2.8564980751360682, + "grad_norm": 0.44803230039598874, + "learning_rate": 2.7050565896682625e-06, + "loss": 0.535, + "step": 10760 + }, + { + "epoch": 2.856763573609452, + "grad_norm": 0.4481997479866655, + "learning_rate": 2.7047086361503917e-06, + "loss": 0.5433, + "step": 10761 + }, + { + "epoch": 2.8570290720828355, + "grad_norm": 0.4348015755909405, + "learning_rate": 2.704360678640202e-06, + "loss": 0.5572, + "step": 10762 + }, + { + "epoch": 2.857294570556219, + "grad_norm": 0.4306697852027576, + "learning_rate": 2.704012717144478e-06, + "loss": 0.5387, + "step": 10763 + }, + { + "epoch": 2.8575600690296032, + "grad_norm": 0.4435402618330487, + "learning_rate": 2.7036647516700066e-06, + "loss": 0.5427, + "step": 10764 + }, + { + "epoch": 2.857825567502987, + "grad_norm": 0.45371648551094274, + "learning_rate": 2.703316782223574e-06, + "loss": 0.5864, + "step": 10765 + }, + { + "epoch": 2.8580910659763705, + "grad_norm": 0.44391622962162264, + "learning_rate": 2.702968808811966e-06, + "loss": 0.5602, + "step": 10766 + }, + { + "epoch": 2.858356564449754, + "grad_norm": 0.4429972812866065, + "learning_rate": 2.7026208314419695e-06, + "loss": 0.5853, + "step": 10767 + }, + { + "epoch": 2.8586220629231383, + "grad_norm": 0.43715014222390575, + "learning_rate": 2.702272850120371e-06, + "loss": 0.5555, + "step": 10768 + }, + { + "epoch": 2.858887561396522, + "grad_norm": 0.43733868211036886, + "learning_rate": 2.7019248648539565e-06, + "loss": 0.5148, + "step": 10769 + }, + { + "epoch": 2.8591530598699055, + "grad_norm": 0.4352574360301688, + "learning_rate": 2.7015768756495127e-06, + "loss": 0.5537, + "step": 10770 + }, + { + "epoch": 2.8594185583432896, + "grad_norm": 0.441906268408107, + "learning_rate": 2.701228882513826e-06, + "loss": 0.5618, + "step": 10771 + }, + { + "epoch": 2.8596840568166733, + "grad_norm": 0.43448023366429356, + "learning_rate": 2.700880885453684e-06, + "loss": 0.5545, + "step": 10772 + }, + { + "epoch": 2.859949555290057, + "grad_norm": 0.4359060938661836, + "learning_rate": 2.700532884475872e-06, + "loss": 0.5483, + "step": 10773 + }, + { + "epoch": 2.860215053763441, + "grad_norm": 0.45498619981843763, + "learning_rate": 2.7001848795871784e-06, + "loss": 0.5516, + "step": 10774 + }, + { + "epoch": 2.8604805522368246, + "grad_norm": 0.45281036419587767, + "learning_rate": 2.699836870794389e-06, + "loss": 0.6146, + "step": 10775 + }, + { + "epoch": 2.8607460507102083, + "grad_norm": 0.4556607934096583, + "learning_rate": 2.6994888581042915e-06, + "loss": 0.5602, + "step": 10776 + }, + { + "epoch": 2.8610115491835924, + "grad_norm": 0.44492451468883587, + "learning_rate": 2.699140841523674e-06, + "loss": 0.5634, + "step": 10777 + }, + { + "epoch": 2.861277047656976, + "grad_norm": 0.4278712653180521, + "learning_rate": 2.698792821059321e-06, + "loss": 0.5117, + "step": 10778 + }, + { + "epoch": 2.8615425461303596, + "grad_norm": 0.4495792879260062, + "learning_rate": 2.6984447967180228e-06, + "loss": 0.5577, + "step": 10779 + }, + { + "epoch": 2.8618080446037437, + "grad_norm": 0.4258108166799316, + "learning_rate": 2.698096768506564e-06, + "loss": 0.5296, + "step": 10780 + }, + { + "epoch": 2.8620735430771274, + "grad_norm": 0.4504538249286639, + "learning_rate": 2.697748736431733e-06, + "loss": 0.5654, + "step": 10781 + }, + { + "epoch": 2.862339041550511, + "grad_norm": 0.43014281297690643, + "learning_rate": 2.6974007005003176e-06, + "loss": 0.5518, + "step": 10782 + }, + { + "epoch": 2.862604540023895, + "grad_norm": 0.4605838312758772, + "learning_rate": 2.6970526607191053e-06, + "loss": 0.5807, + "step": 10783 + }, + { + "epoch": 2.8628700384972787, + "grad_norm": 0.431864417707121, + "learning_rate": 2.6967046170948843e-06, + "loss": 0.5726, + "step": 10784 + }, + { + "epoch": 2.8631355369706624, + "grad_norm": 0.4445798856846652, + "learning_rate": 2.6963565696344413e-06, + "loss": 0.5521, + "step": 10785 + }, + { + "epoch": 2.8634010354440464, + "grad_norm": 0.4404793748357265, + "learning_rate": 2.6960085183445644e-06, + "loss": 0.5597, + "step": 10786 + }, + { + "epoch": 2.86366653391743, + "grad_norm": 0.4525667450486386, + "learning_rate": 2.695660463232041e-06, + "loss": 0.5516, + "step": 10787 + }, + { + "epoch": 2.8639320323908137, + "grad_norm": 0.44162985586253817, + "learning_rate": 2.6953124043036604e-06, + "loss": 0.5496, + "step": 10788 + }, + { + "epoch": 2.8641975308641974, + "grad_norm": 0.45065809105279425, + "learning_rate": 2.694964341566209e-06, + "loss": 0.5594, + "step": 10789 + }, + { + "epoch": 2.8644630293375815, + "grad_norm": 0.4331856923687826, + "learning_rate": 2.694616275026476e-06, + "loss": 0.572, + "step": 10790 + }, + { + "epoch": 2.864728527810965, + "grad_norm": 0.44905390249869104, + "learning_rate": 2.6942682046912492e-06, + "loss": 0.5677, + "step": 10791 + }, + { + "epoch": 2.8649940262843487, + "grad_norm": 0.4459068620119701, + "learning_rate": 2.693920130567316e-06, + "loss": 0.5467, + "step": 10792 + }, + { + "epoch": 2.8652595247577324, + "grad_norm": 0.4447957581873441, + "learning_rate": 2.6935720526614656e-06, + "loss": 0.5658, + "step": 10793 + }, + { + "epoch": 2.8655250232311165, + "grad_norm": 0.44914334064867867, + "learning_rate": 2.693223970980487e-06, + "loss": 0.5707, + "step": 10794 + }, + { + "epoch": 2.8657905217045, + "grad_norm": 0.45208235913744277, + "learning_rate": 2.6928758855311678e-06, + "loss": 0.5969, + "step": 10795 + }, + { + "epoch": 2.8660560201778837, + "grad_norm": 0.45156122580289887, + "learning_rate": 2.6925277963202954e-06, + "loss": 0.5546, + "step": 10796 + }, + { + "epoch": 2.866321518651268, + "grad_norm": 0.4365754983809178, + "learning_rate": 2.6921797033546607e-06, + "loss": 0.5523, + "step": 10797 + }, + { + "epoch": 2.8665870171246515, + "grad_norm": 0.44211215855259733, + "learning_rate": 2.691831606641051e-06, + "loss": 0.571, + "step": 10798 + }, + { + "epoch": 2.866852515598035, + "grad_norm": 0.4531205492943909, + "learning_rate": 2.6914835061862554e-06, + "loss": 0.5866, + "step": 10799 + }, + { + "epoch": 2.867118014071419, + "grad_norm": 0.42162915272733076, + "learning_rate": 2.691135401997063e-06, + "loss": 0.5326, + "step": 10800 + }, + { + "epoch": 2.867383512544803, + "grad_norm": 0.4619213541114661, + "learning_rate": 2.6907872940802615e-06, + "loss": 0.5468, + "step": 10801 + }, + { + "epoch": 2.8676490110181865, + "grad_norm": 0.482958323474219, + "learning_rate": 2.6904391824426405e-06, + "loss": 0.5444, + "step": 10802 + }, + { + "epoch": 2.8679145094915706, + "grad_norm": 0.4347171768582086, + "learning_rate": 2.6900910670909897e-06, + "loss": 0.5312, + "step": 10803 + }, + { + "epoch": 2.868180007964954, + "grad_norm": 0.44191406865669913, + "learning_rate": 2.6897429480320973e-06, + "loss": 0.5593, + "step": 10804 + }, + { + "epoch": 2.868445506438338, + "grad_norm": 0.45942441108805737, + "learning_rate": 2.6893948252727537e-06, + "loss": 0.548, + "step": 10805 + }, + { + "epoch": 2.868711004911722, + "grad_norm": 0.43909599630631496, + "learning_rate": 2.689046698819746e-06, + "loss": 0.5415, + "step": 10806 + }, + { + "epoch": 2.8689765033851056, + "grad_norm": 0.46213125874228994, + "learning_rate": 2.6886985686798656e-06, + "loss": 0.5364, + "step": 10807 + }, + { + "epoch": 2.869242001858489, + "grad_norm": 0.44400531622663764, + "learning_rate": 2.6883504348599014e-06, + "loss": 0.5713, + "step": 10808 + }, + { + "epoch": 2.8695075003318733, + "grad_norm": 0.4521447547927035, + "learning_rate": 2.688002297366642e-06, + "loss": 0.5372, + "step": 10809 + }, + { + "epoch": 2.869772998805257, + "grad_norm": 0.44114769614711064, + "learning_rate": 2.6876541562068775e-06, + "loss": 0.5266, + "step": 10810 + }, + { + "epoch": 2.8700384972786406, + "grad_norm": 0.44883805588543824, + "learning_rate": 2.6873060113873973e-06, + "loss": 0.5416, + "step": 10811 + }, + { + "epoch": 2.8703039957520247, + "grad_norm": 0.4486930777990287, + "learning_rate": 2.6869578629149918e-06, + "loss": 0.5965, + "step": 10812 + }, + { + "epoch": 2.8705694942254083, + "grad_norm": 0.4411784366304622, + "learning_rate": 2.6866097107964495e-06, + "loss": 0.5441, + "step": 10813 + }, + { + "epoch": 2.870834992698792, + "grad_norm": 0.4376672470673291, + "learning_rate": 2.6862615550385616e-06, + "loss": 0.5773, + "step": 10814 + }, + { + "epoch": 2.871100491172176, + "grad_norm": 0.43184379446850796, + "learning_rate": 2.6859133956481174e-06, + "loss": 0.5486, + "step": 10815 + }, + { + "epoch": 2.8713659896455597, + "grad_norm": 0.4479987787890383, + "learning_rate": 2.6855652326319063e-06, + "loss": 0.5914, + "step": 10816 + }, + { + "epoch": 2.8716314881189433, + "grad_norm": 0.43305473354461277, + "learning_rate": 2.68521706599672e-06, + "loss": 0.5753, + "step": 10817 + }, + { + "epoch": 2.871896986592327, + "grad_norm": 0.44439596170296236, + "learning_rate": 2.684868895749346e-06, + "loss": 0.5784, + "step": 10818 + }, + { + "epoch": 2.872162485065711, + "grad_norm": 0.46071555032966255, + "learning_rate": 2.6845207218965763e-06, + "loss": 0.5704, + "step": 10819 + }, + { + "epoch": 2.8724279835390947, + "grad_norm": 0.43250899328799286, + "learning_rate": 2.6841725444452005e-06, + "loss": 0.5533, + "step": 10820 + }, + { + "epoch": 2.8726934820124783, + "grad_norm": 0.44290476232115306, + "learning_rate": 2.6838243634020095e-06, + "loss": 0.5478, + "step": 10821 + }, + { + "epoch": 2.872958980485862, + "grad_norm": 0.4523526018772023, + "learning_rate": 2.6834761787737934e-06, + "loss": 0.5576, + "step": 10822 + }, + { + "epoch": 2.873224478959246, + "grad_norm": 0.440979661564685, + "learning_rate": 2.683127990567343e-06, + "loss": 0.5729, + "step": 10823 + }, + { + "epoch": 2.8734899774326297, + "grad_norm": 0.43162485098917514, + "learning_rate": 2.6827797987894476e-06, + "loss": 0.5326, + "step": 10824 + }, + { + "epoch": 2.8737554759060133, + "grad_norm": 0.43149189949537026, + "learning_rate": 2.6824316034468987e-06, + "loss": 0.5563, + "step": 10825 + }, + { + "epoch": 2.8740209743793974, + "grad_norm": 0.4419009976170839, + "learning_rate": 2.682083404546488e-06, + "loss": 0.5736, + "step": 10826 + }, + { + "epoch": 2.874286472852781, + "grad_norm": 0.43802541637998726, + "learning_rate": 2.6817352020950037e-06, + "loss": 0.5569, + "step": 10827 + }, + { + "epoch": 2.8745519713261647, + "grad_norm": 0.4386676180692943, + "learning_rate": 2.6813869960992385e-06, + "loss": 0.5763, + "step": 10828 + }, + { + "epoch": 2.8748174697995488, + "grad_norm": 0.4392041135494687, + "learning_rate": 2.6810387865659825e-06, + "loss": 0.5903, + "step": 10829 + }, + { + "epoch": 2.8750829682729324, + "grad_norm": 0.4482191398101328, + "learning_rate": 2.6806905735020267e-06, + "loss": 0.5436, + "step": 10830 + }, + { + "epoch": 2.875348466746316, + "grad_norm": 0.44218635440073767, + "learning_rate": 2.6803423569141636e-06, + "loss": 0.5863, + "step": 10831 + }, + { + "epoch": 2.8756139652197, + "grad_norm": 0.43066766131922357, + "learning_rate": 2.679994136809182e-06, + "loss": 0.5648, + "step": 10832 + }, + { + "epoch": 2.875879463693084, + "grad_norm": 0.4452498218090366, + "learning_rate": 2.6796459131938733e-06, + "loss": 0.5835, + "step": 10833 + }, + { + "epoch": 2.8761449621664674, + "grad_norm": 0.45456715861688873, + "learning_rate": 2.6792976860750312e-06, + "loss": 0.5527, + "step": 10834 + }, + { + "epoch": 2.8764104606398515, + "grad_norm": 0.4380656206875285, + "learning_rate": 2.678949455459444e-06, + "loss": 0.5228, + "step": 10835 + }, + { + "epoch": 2.876675959113235, + "grad_norm": 0.46010713423228644, + "learning_rate": 2.678601221353905e-06, + "loss": 0.5588, + "step": 10836 + }, + { + "epoch": 2.876941457586619, + "grad_norm": 0.45977160600196393, + "learning_rate": 2.6782529837652045e-06, + "loss": 0.547, + "step": 10837 + }, + { + "epoch": 2.877206956060003, + "grad_norm": 0.4485596768856643, + "learning_rate": 2.6779047427001336e-06, + "loss": 0.5465, + "step": 10838 + }, + { + "epoch": 2.8774724545333865, + "grad_norm": 0.4471934281327445, + "learning_rate": 2.6775564981654856e-06, + "loss": 0.5426, + "step": 10839 + }, + { + "epoch": 2.87773795300677, + "grad_norm": 0.43891709868681367, + "learning_rate": 2.6772082501680514e-06, + "loss": 0.5622, + "step": 10840 + }, + { + "epoch": 2.8780034514801542, + "grad_norm": 0.4320520571670163, + "learning_rate": 2.6768599987146215e-06, + "loss": 0.5085, + "step": 10841 + }, + { + "epoch": 2.878268949953538, + "grad_norm": 0.4411475385879734, + "learning_rate": 2.676511743811989e-06, + "loss": 0.5256, + "step": 10842 + }, + { + "epoch": 2.8785344484269215, + "grad_norm": 0.452468711228945, + "learning_rate": 2.676163485466946e-06, + "loss": 0.5742, + "step": 10843 + }, + { + "epoch": 2.878799946900305, + "grad_norm": 0.44392858774568406, + "learning_rate": 2.6758152236862827e-06, + "loss": 0.583, + "step": 10844 + }, + { + "epoch": 2.8790654453736892, + "grad_norm": 0.48262724715708205, + "learning_rate": 2.675466958476793e-06, + "loss": 0.5067, + "step": 10845 + }, + { + "epoch": 2.879330943847073, + "grad_norm": 0.45651380054768226, + "learning_rate": 2.6751186898452673e-06, + "loss": 0.5517, + "step": 10846 + }, + { + "epoch": 2.8795964423204565, + "grad_norm": 0.4463889492644063, + "learning_rate": 2.6747704177984983e-06, + "loss": 0.58, + "step": 10847 + }, + { + "epoch": 2.87986194079384, + "grad_norm": 0.4436312994448471, + "learning_rate": 2.6744221423432786e-06, + "loss": 0.5362, + "step": 10848 + }, + { + "epoch": 2.8801274392672243, + "grad_norm": 0.4448488830640065, + "learning_rate": 2.6740738634864e-06, + "loss": 0.5565, + "step": 10849 + }, + { + "epoch": 2.880392937740608, + "grad_norm": 0.4817548404271945, + "learning_rate": 2.6737255812346547e-06, + "loss": 0.5089, + "step": 10850 + }, + { + "epoch": 2.8806584362139915, + "grad_norm": 0.4368878684261856, + "learning_rate": 2.6733772955948352e-06, + "loss": 0.5154, + "step": 10851 + }, + { + "epoch": 2.8809239346873756, + "grad_norm": 0.44767934500164847, + "learning_rate": 2.673029006573735e-06, + "loss": 0.544, + "step": 10852 + }, + { + "epoch": 2.8811894331607593, + "grad_norm": 0.44541810496343487, + "learning_rate": 2.672680714178144e-06, + "loss": 0.5747, + "step": 10853 + }, + { + "epoch": 2.881454931634143, + "grad_norm": 0.43896537345326203, + "learning_rate": 2.6723324184148576e-06, + "loss": 0.5779, + "step": 10854 + }, + { + "epoch": 2.881720430107527, + "grad_norm": 0.4310170528667718, + "learning_rate": 2.671984119290666e-06, + "loss": 0.5598, + "step": 10855 + }, + { + "epoch": 2.8819859285809106, + "grad_norm": 0.44721733651614415, + "learning_rate": 2.6716358168123633e-06, + "loss": 0.5301, + "step": 10856 + }, + { + "epoch": 2.8822514270542943, + "grad_norm": 0.42864646289597147, + "learning_rate": 2.671287510986742e-06, + "loss": 0.542, + "step": 10857 + }, + { + "epoch": 2.8825169255276784, + "grad_norm": 0.4369746610196976, + "learning_rate": 2.6709392018205944e-06, + "loss": 0.5835, + "step": 10858 + }, + { + "epoch": 2.882782424001062, + "grad_norm": 0.44602426416435603, + "learning_rate": 2.6705908893207144e-06, + "loss": 0.5564, + "step": 10859 + }, + { + "epoch": 2.8830479224744456, + "grad_norm": 0.4464058686004329, + "learning_rate": 2.6702425734938948e-06, + "loss": 0.5523, + "step": 10860 + }, + { + "epoch": 2.8833134209478297, + "grad_norm": 0.4458263398218292, + "learning_rate": 2.6698942543469275e-06, + "loss": 0.5692, + "step": 10861 + }, + { + "epoch": 2.8835789194212134, + "grad_norm": 0.44316466620704503, + "learning_rate": 2.669545931886606e-06, + "loss": 0.5786, + "step": 10862 + }, + { + "epoch": 2.883844417894597, + "grad_norm": 0.4312861516714903, + "learning_rate": 2.6691976061197245e-06, + "loss": 0.5548, + "step": 10863 + }, + { + "epoch": 2.884109916367981, + "grad_norm": 0.44255286569736574, + "learning_rate": 2.6688492770530748e-06, + "loss": 0.5859, + "step": 10864 + }, + { + "epoch": 2.8843754148413647, + "grad_norm": 0.43321625083875426, + "learning_rate": 2.668500944693451e-06, + "loss": 0.555, + "step": 10865 + }, + { + "epoch": 2.8846409133147484, + "grad_norm": 0.43531149971538413, + "learning_rate": 2.668152609047646e-06, + "loss": 0.5527, + "step": 10866 + }, + { + "epoch": 2.8849064117881325, + "grad_norm": 0.4361321588632163, + "learning_rate": 2.667804270122454e-06, + "loss": 0.5386, + "step": 10867 + }, + { + "epoch": 2.885171910261516, + "grad_norm": 0.4437577053716983, + "learning_rate": 2.667455927924667e-06, + "loss": 0.5635, + "step": 10868 + }, + { + "epoch": 2.8854374087348997, + "grad_norm": 0.44278390800727474, + "learning_rate": 2.6671075824610803e-06, + "loss": 0.5681, + "step": 10869 + }, + { + "epoch": 2.885702907208284, + "grad_norm": 0.433106111626509, + "learning_rate": 2.6667592337384864e-06, + "loss": 0.5863, + "step": 10870 + }, + { + "epoch": 2.8859684056816675, + "grad_norm": 0.4270240982678621, + "learning_rate": 2.6664108817636785e-06, + "loss": 0.5329, + "step": 10871 + }, + { + "epoch": 2.886233904155051, + "grad_norm": 0.45712822613003545, + "learning_rate": 2.666062526543452e-06, + "loss": 0.5925, + "step": 10872 + }, + { + "epoch": 2.8864994026284347, + "grad_norm": 0.4601936028118898, + "learning_rate": 2.6657141680845994e-06, + "loss": 0.5338, + "step": 10873 + }, + { + "epoch": 2.886764901101819, + "grad_norm": 0.4472989096495747, + "learning_rate": 2.665365806393914e-06, + "loss": 0.567, + "step": 10874 + }, + { + "epoch": 2.8870303995752025, + "grad_norm": 0.44955009092777526, + "learning_rate": 2.665017441478191e-06, + "loss": 0.6097, + "step": 10875 + }, + { + "epoch": 2.887295898048586, + "grad_norm": 0.43231338361898963, + "learning_rate": 2.6646690733442236e-06, + "loss": 0.5594, + "step": 10876 + }, + { + "epoch": 2.8875613965219697, + "grad_norm": 0.45075223683003657, + "learning_rate": 2.6643207019988065e-06, + "loss": 0.5877, + "step": 10877 + }, + { + "epoch": 2.887826894995354, + "grad_norm": 0.44544448527423086, + "learning_rate": 2.6639723274487334e-06, + "loss": 0.525, + "step": 10878 + }, + { + "epoch": 2.8880923934687375, + "grad_norm": 0.434334548974687, + "learning_rate": 2.6636239497007984e-06, + "loss": 0.5315, + "step": 10879 + }, + { + "epoch": 2.888357891942121, + "grad_norm": 0.44885227514256315, + "learning_rate": 2.6632755687617955e-06, + "loss": 0.5432, + "step": 10880 + }, + { + "epoch": 2.888623390415505, + "grad_norm": 0.43988230272360723, + "learning_rate": 2.662927184638519e-06, + "loss": 0.5626, + "step": 10881 + }, + { + "epoch": 2.888888888888889, + "grad_norm": 0.45518679626984343, + "learning_rate": 2.662578797337764e-06, + "loss": 0.5822, + "step": 10882 + }, + { + "epoch": 2.8891543873622725, + "grad_norm": 0.4467505118143057, + "learning_rate": 2.6622304068663246e-06, + "loss": 0.5386, + "step": 10883 + }, + { + "epoch": 2.8894198858356566, + "grad_norm": 0.43306594349569794, + "learning_rate": 2.661882013230994e-06, + "loss": 0.5628, + "step": 10884 + }, + { + "epoch": 2.88968538430904, + "grad_norm": 0.41041798195591706, + "learning_rate": 2.6615336164385685e-06, + "loss": 0.5045, + "step": 10885 + }, + { + "epoch": 2.889950882782424, + "grad_norm": 0.45199972987522774, + "learning_rate": 2.6611852164958423e-06, + "loss": 0.5451, + "step": 10886 + }, + { + "epoch": 2.890216381255808, + "grad_norm": 0.4592370261856821, + "learning_rate": 2.660836813409609e-06, + "loss": 0.5668, + "step": 10887 + }, + { + "epoch": 2.8904818797291916, + "grad_norm": 0.4470436497488048, + "learning_rate": 2.660488407186664e-06, + "loss": 0.5981, + "step": 10888 + }, + { + "epoch": 2.890747378202575, + "grad_norm": 0.4361218504831239, + "learning_rate": 2.6601399978338034e-06, + "loss": 0.5514, + "step": 10889 + }, + { + "epoch": 2.8910128766759593, + "grad_norm": 0.44530977881106987, + "learning_rate": 2.6597915853578195e-06, + "loss": 0.5637, + "step": 10890 + }, + { + "epoch": 2.891278375149343, + "grad_norm": 0.43167158787203563, + "learning_rate": 2.6594431697655087e-06, + "loss": 0.5463, + "step": 10891 + }, + { + "epoch": 2.8915438736227266, + "grad_norm": 0.4340141491318965, + "learning_rate": 2.659094751063666e-06, + "loss": 0.5481, + "step": 10892 + }, + { + "epoch": 2.8918093720961107, + "grad_norm": 0.4442181747184613, + "learning_rate": 2.658746329259086e-06, + "loss": 0.54, + "step": 10893 + }, + { + "epoch": 2.8920748705694943, + "grad_norm": 0.44184859518917863, + "learning_rate": 2.6583979043585633e-06, + "loss": 0.5304, + "step": 10894 + }, + { + "epoch": 2.892340369042878, + "grad_norm": 0.450055242843919, + "learning_rate": 2.658049476368895e-06, + "loss": 0.5498, + "step": 10895 + }, + { + "epoch": 2.892605867516262, + "grad_norm": 0.45316290903820783, + "learning_rate": 2.657701045296874e-06, + "loss": 0.5602, + "step": 10896 + }, + { + "epoch": 2.8928713659896457, + "grad_norm": 0.4401253783443589, + "learning_rate": 2.657352611149296e-06, + "loss": 0.5591, + "step": 10897 + }, + { + "epoch": 2.8931368644630293, + "grad_norm": 0.43622151426619354, + "learning_rate": 2.6570041739329576e-06, + "loss": 0.5572, + "step": 10898 + }, + { + "epoch": 2.893402362936413, + "grad_norm": 0.42662031380810866, + "learning_rate": 2.6566557336546533e-06, + "loss": 0.5469, + "step": 10899 + }, + { + "epoch": 2.893667861409797, + "grad_norm": 0.43024760001087314, + "learning_rate": 2.656307290321179e-06, + "loss": 0.5457, + "step": 10900 + }, + { + "epoch": 2.8939333598831807, + "grad_norm": 0.4467549013135979, + "learning_rate": 2.65595884393933e-06, + "loss": 0.5519, + "step": 10901 + }, + { + "epoch": 2.8941988583565643, + "grad_norm": 0.44258297677957514, + "learning_rate": 2.6556103945159005e-06, + "loss": 0.5813, + "step": 10902 + }, + { + "epoch": 2.894464356829948, + "grad_norm": 0.45282890438368784, + "learning_rate": 2.6552619420576876e-06, + "loss": 0.5332, + "step": 10903 + }, + { + "epoch": 2.894729855303332, + "grad_norm": 0.4341136622860847, + "learning_rate": 2.654913486571487e-06, + "loss": 0.5847, + "step": 10904 + }, + { + "epoch": 2.8949953537767157, + "grad_norm": 0.44171869462126057, + "learning_rate": 2.654565028064094e-06, + "loss": 0.5655, + "step": 10905 + }, + { + "epoch": 2.8952608522500993, + "grad_norm": 0.43393078047895756, + "learning_rate": 2.654216566542305e-06, + "loss": 0.5659, + "step": 10906 + }, + { + "epoch": 2.8955263507234834, + "grad_norm": 0.43635181981188764, + "learning_rate": 2.653868102012915e-06, + "loss": 0.5548, + "step": 10907 + }, + { + "epoch": 2.895791849196867, + "grad_norm": 0.4261915598496779, + "learning_rate": 2.65351963448272e-06, + "loss": 0.5504, + "step": 10908 + }, + { + "epoch": 2.8960573476702507, + "grad_norm": 0.42926860439761766, + "learning_rate": 2.653171163958517e-06, + "loss": 0.5611, + "step": 10909 + }, + { + "epoch": 2.8963228461436348, + "grad_norm": 0.43997026515276516, + "learning_rate": 2.6528226904471006e-06, + "loss": 0.5587, + "step": 10910 + }, + { + "epoch": 2.8965883446170184, + "grad_norm": 0.42474534302224715, + "learning_rate": 2.652474213955267e-06, + "loss": 0.5479, + "step": 10911 + }, + { + "epoch": 2.896853843090402, + "grad_norm": 0.4348882855324774, + "learning_rate": 2.6521257344898137e-06, + "loss": 0.5421, + "step": 10912 + }, + { + "epoch": 2.897119341563786, + "grad_norm": 0.450593347622781, + "learning_rate": 2.6517772520575362e-06, + "loss": 0.5893, + "step": 10913 + }, + { + "epoch": 2.89738484003717, + "grad_norm": 0.44003034685439146, + "learning_rate": 2.6514287666652305e-06, + "loss": 0.5354, + "step": 10914 + }, + { + "epoch": 2.8976503385105534, + "grad_norm": 0.44192082772191166, + "learning_rate": 2.6510802783196933e-06, + "loss": 0.5334, + "step": 10915 + }, + { + "epoch": 2.8979158369839375, + "grad_norm": 0.4331852355918647, + "learning_rate": 2.6507317870277205e-06, + "loss": 0.5716, + "step": 10916 + }, + { + "epoch": 2.898181335457321, + "grad_norm": 0.4589238668487718, + "learning_rate": 2.650383292796109e-06, + "loss": 0.5968, + "step": 10917 + }, + { + "epoch": 2.898446833930705, + "grad_norm": 0.43782726818763074, + "learning_rate": 2.650034795631655e-06, + "loss": 0.5378, + "step": 10918 + }, + { + "epoch": 2.898712332404089, + "grad_norm": 0.4461233497566889, + "learning_rate": 2.649686295541155e-06, + "loss": 0.5292, + "step": 10919 + }, + { + "epoch": 2.8989778308774725, + "grad_norm": 0.45222055609815054, + "learning_rate": 2.6493377925314057e-06, + "loss": 0.5606, + "step": 10920 + }, + { + "epoch": 2.899243329350856, + "grad_norm": 0.45637331622513116, + "learning_rate": 2.6489892866092037e-06, + "loss": 0.5889, + "step": 10921 + }, + { + "epoch": 2.8995088278242402, + "grad_norm": 0.44055113014127406, + "learning_rate": 2.648640777781346e-06, + "loss": 0.5621, + "step": 10922 + }, + { + "epoch": 2.899774326297624, + "grad_norm": 0.43328442092076996, + "learning_rate": 2.6482922660546296e-06, + "loss": 0.5627, + "step": 10923 + }, + { + "epoch": 2.9000398247710075, + "grad_norm": 0.43746414192602867, + "learning_rate": 2.6479437514358506e-06, + "loss": 0.5809, + "step": 10924 + }, + { + "epoch": 2.9003053232443916, + "grad_norm": 0.44263605524024885, + "learning_rate": 2.6475952339318064e-06, + "loss": 0.5745, + "step": 10925 + }, + { + "epoch": 2.9005708217177752, + "grad_norm": 0.448103484572625, + "learning_rate": 2.6472467135492943e-06, + "loss": 0.5595, + "step": 10926 + }, + { + "epoch": 2.900836320191159, + "grad_norm": 0.4462869568646484, + "learning_rate": 2.6468981902951097e-06, + "loss": 0.5632, + "step": 10927 + }, + { + "epoch": 2.9011018186645425, + "grad_norm": 0.4384724991957383, + "learning_rate": 2.646549664176051e-06, + "loss": 0.5504, + "step": 10928 + }, + { + "epoch": 2.9013673171379266, + "grad_norm": 0.4298794056285689, + "learning_rate": 2.6462011351989153e-06, + "loss": 0.5615, + "step": 10929 + }, + { + "epoch": 2.9016328156113103, + "grad_norm": 0.44376785333814506, + "learning_rate": 2.6458526033704986e-06, + "loss": 0.5579, + "step": 10930 + }, + { + "epoch": 2.901898314084694, + "grad_norm": 0.4340368406954021, + "learning_rate": 2.6455040686975997e-06, + "loss": 0.5491, + "step": 10931 + }, + { + "epoch": 2.9021638125580775, + "grad_norm": 0.44708777893009755, + "learning_rate": 2.6451555311870154e-06, + "loss": 0.5868, + "step": 10932 + }, + { + "epoch": 2.9024293110314616, + "grad_norm": 0.44025855844739836, + "learning_rate": 2.6448069908455424e-06, + "loss": 0.5989, + "step": 10933 + }, + { + "epoch": 2.9026948095048453, + "grad_norm": 0.4462573107282268, + "learning_rate": 2.644458447679979e-06, + "loss": 0.5606, + "step": 10934 + }, + { + "epoch": 2.902960307978229, + "grad_norm": 0.4413153831115396, + "learning_rate": 2.644109901697122e-06, + "loss": 0.5816, + "step": 10935 + }, + { + "epoch": 2.903225806451613, + "grad_norm": 0.4363934289521345, + "learning_rate": 2.6437613529037693e-06, + "loss": 0.5433, + "step": 10936 + }, + { + "epoch": 2.9034913049249966, + "grad_norm": 0.4300405345290328, + "learning_rate": 2.643412801306718e-06, + "loss": 0.5549, + "step": 10937 + }, + { + "epoch": 2.9037568033983803, + "grad_norm": 0.4350433349067469, + "learning_rate": 2.643064246912766e-06, + "loss": 0.5593, + "step": 10938 + }, + { + "epoch": 2.9040223018717644, + "grad_norm": 0.4463047863693827, + "learning_rate": 2.642715689728711e-06, + "loss": 0.5603, + "step": 10939 + }, + { + "epoch": 2.904287800345148, + "grad_norm": 0.43459082214595607, + "learning_rate": 2.6423671297613496e-06, + "loss": 0.5424, + "step": 10940 + }, + { + "epoch": 2.9045532988185316, + "grad_norm": 0.4387189873759832, + "learning_rate": 2.642018567017482e-06, + "loss": 0.5567, + "step": 10941 + }, + { + "epoch": 2.9048187972919157, + "grad_norm": 0.44360597920127154, + "learning_rate": 2.6416700015039037e-06, + "loss": 0.5597, + "step": 10942 + }, + { + "epoch": 2.9050842957652994, + "grad_norm": 0.428596800162275, + "learning_rate": 2.641321433227414e-06, + "loss": 0.5441, + "step": 10943 + }, + { + "epoch": 2.905349794238683, + "grad_norm": 0.4316249702421703, + "learning_rate": 2.6409728621948106e-06, + "loss": 0.513, + "step": 10944 + }, + { + "epoch": 2.905615292712067, + "grad_norm": 0.4455273894116888, + "learning_rate": 2.6406242884128906e-06, + "loss": 0.5803, + "step": 10945 + }, + { + "epoch": 2.9058807911854507, + "grad_norm": 0.436338016312051, + "learning_rate": 2.6402757118884525e-06, + "loss": 0.5218, + "step": 10946 + }, + { + "epoch": 2.9061462896588344, + "grad_norm": 0.44455546765062054, + "learning_rate": 2.6399271326282957e-06, + "loss": 0.5517, + "step": 10947 + }, + { + "epoch": 2.9064117881322185, + "grad_norm": 0.439299393094528, + "learning_rate": 2.6395785506392164e-06, + "loss": 0.5553, + "step": 10948 + }, + { + "epoch": 2.906677286605602, + "grad_norm": 0.44216456744201266, + "learning_rate": 2.6392299659280134e-06, + "loss": 0.5533, + "step": 10949 + }, + { + "epoch": 2.9069427850789857, + "grad_norm": 0.444469068450367, + "learning_rate": 2.6388813785014857e-06, + "loss": 0.5368, + "step": 10950 + }, + { + "epoch": 2.90720828355237, + "grad_norm": 0.4359474600841476, + "learning_rate": 2.6385327883664304e-06, + "loss": 0.5514, + "step": 10951 + }, + { + "epoch": 2.9074737820257535, + "grad_norm": 0.46208991896781965, + "learning_rate": 2.638184195529647e-06, + "loss": 0.5673, + "step": 10952 + }, + { + "epoch": 2.907739280499137, + "grad_norm": 0.45757539982651235, + "learning_rate": 2.637835599997934e-06, + "loss": 0.5781, + "step": 10953 + }, + { + "epoch": 2.9080047789725207, + "grad_norm": 0.4560009721163749, + "learning_rate": 2.6374870017780878e-06, + "loss": 0.5324, + "step": 10954 + }, + { + "epoch": 2.908270277445905, + "grad_norm": 0.4498071639255698, + "learning_rate": 2.63713840087691e-06, + "loss": 0.5715, + "step": 10955 + }, + { + "epoch": 2.9085357759192885, + "grad_norm": 0.44222843965919817, + "learning_rate": 2.636789797301197e-06, + "loss": 0.5073, + "step": 10956 + }, + { + "epoch": 2.908801274392672, + "grad_norm": 0.4468151695723339, + "learning_rate": 2.636441191057748e-06, + "loss": 0.5754, + "step": 10957 + }, + { + "epoch": 2.9090667728660558, + "grad_norm": 0.424683660695068, + "learning_rate": 2.6360925821533617e-06, + "loss": 0.5249, + "step": 10958 + }, + { + "epoch": 2.90933227133944, + "grad_norm": 0.44089192749465134, + "learning_rate": 2.6357439705948366e-06, + "loss": 0.593, + "step": 10959 + }, + { + "epoch": 2.9095977698128235, + "grad_norm": 0.4552840074979933, + "learning_rate": 2.635395356388972e-06, + "loss": 0.5529, + "step": 10960 + }, + { + "epoch": 2.909863268286207, + "grad_norm": 0.45090216434136277, + "learning_rate": 2.6350467395425665e-06, + "loss": 0.5834, + "step": 10961 + }, + { + "epoch": 2.910128766759591, + "grad_norm": 0.44725579933570425, + "learning_rate": 2.634698120062419e-06, + "loss": 0.5887, + "step": 10962 + }, + { + "epoch": 2.910394265232975, + "grad_norm": 0.4397702648724235, + "learning_rate": 2.634349497955328e-06, + "loss": 0.594, + "step": 10963 + }, + { + "epoch": 2.9106597637063585, + "grad_norm": 0.4491331641583865, + "learning_rate": 2.6340008732280935e-06, + "loss": 0.593, + "step": 10964 + }, + { + "epoch": 2.9109252621797426, + "grad_norm": 0.43013541405874617, + "learning_rate": 2.6336522458875137e-06, + "loss": 0.5698, + "step": 10965 + }, + { + "epoch": 2.911190760653126, + "grad_norm": 0.4422740470419798, + "learning_rate": 2.6333036159403873e-06, + "loss": 0.5471, + "step": 10966 + }, + { + "epoch": 2.91145625912651, + "grad_norm": 0.43800276207178707, + "learning_rate": 2.632954983393514e-06, + "loss": 0.5245, + "step": 10967 + }, + { + "epoch": 2.911721757599894, + "grad_norm": 0.43542102705990954, + "learning_rate": 2.6326063482536933e-06, + "loss": 0.5134, + "step": 10968 + }, + { + "epoch": 2.9119872560732776, + "grad_norm": 0.43387884694884193, + "learning_rate": 2.6322577105277243e-06, + "loss": 0.5401, + "step": 10969 + }, + { + "epoch": 2.912252754546661, + "grad_norm": 0.4327696913375593, + "learning_rate": 2.631909070222406e-06, + "loss": 0.5175, + "step": 10970 + }, + { + "epoch": 2.9125182530200453, + "grad_norm": 0.4510040472030239, + "learning_rate": 2.631560427344538e-06, + "loss": 0.5731, + "step": 10971 + }, + { + "epoch": 2.912783751493429, + "grad_norm": 0.43554688223001164, + "learning_rate": 2.6312117819009192e-06, + "loss": 0.5489, + "step": 10972 + }, + { + "epoch": 2.9130492499668126, + "grad_norm": 0.42539966900676873, + "learning_rate": 2.6308631338983497e-06, + "loss": 0.5194, + "step": 10973 + }, + { + "epoch": 2.9133147484401967, + "grad_norm": 0.4425387934346445, + "learning_rate": 2.6305144833436287e-06, + "loss": 0.5295, + "step": 10974 + }, + { + "epoch": 2.9135802469135803, + "grad_norm": 0.44746049042270636, + "learning_rate": 2.630165830243555e-06, + "loss": 0.5549, + "step": 10975 + }, + { + "epoch": 2.913845745386964, + "grad_norm": 0.44276129815038756, + "learning_rate": 2.6298171746049294e-06, + "loss": 0.5696, + "step": 10976 + }, + { + "epoch": 2.914111243860348, + "grad_norm": 0.45662944551679474, + "learning_rate": 2.6294685164345513e-06, + "loss": 0.5262, + "step": 10977 + }, + { + "epoch": 2.9143767423337317, + "grad_norm": 0.44721510646927704, + "learning_rate": 2.6291198557392206e-06, + "loss": 0.5303, + "step": 10978 + }, + { + "epoch": 2.9146422408071153, + "grad_norm": 0.4518031286990219, + "learning_rate": 2.6287711925257354e-06, + "loss": 0.5688, + "step": 10979 + }, + { + "epoch": 2.9149077392804994, + "grad_norm": 0.483696395781235, + "learning_rate": 2.6284225268008975e-06, + "loss": 0.5444, + "step": 10980 + }, + { + "epoch": 2.915173237753883, + "grad_norm": 0.4390419868148622, + "learning_rate": 2.628073858571506e-06, + "loss": 0.5591, + "step": 10981 + }, + { + "epoch": 2.9154387362272667, + "grad_norm": 0.45552141284448083, + "learning_rate": 2.62772518784436e-06, + "loss": 0.5933, + "step": 10982 + }, + { + "epoch": 2.9157042347006503, + "grad_norm": 0.4381512500155813, + "learning_rate": 2.6273765146262603e-06, + "loss": 0.5687, + "step": 10983 + }, + { + "epoch": 2.9159697331740344, + "grad_norm": 0.4285549965447163, + "learning_rate": 2.627027838924007e-06, + "loss": 0.5388, + "step": 10984 + }, + { + "epoch": 2.916235231647418, + "grad_norm": 0.4410277820701708, + "learning_rate": 2.6266791607443996e-06, + "loss": 0.5747, + "step": 10985 + }, + { + "epoch": 2.9165007301208017, + "grad_norm": 0.42977891314856626, + "learning_rate": 2.626330480094238e-06, + "loss": 0.5452, + "step": 10986 + }, + { + "epoch": 2.9167662285941853, + "grad_norm": 0.4533809508689432, + "learning_rate": 2.625981796980323e-06, + "loss": 0.5645, + "step": 10987 + }, + { + "epoch": 2.9170317270675694, + "grad_norm": 0.44291686828721816, + "learning_rate": 2.6256331114094547e-06, + "loss": 0.5745, + "step": 10988 + }, + { + "epoch": 2.917297225540953, + "grad_norm": 0.4306466116476081, + "learning_rate": 2.625284423388433e-06, + "loss": 0.5227, + "step": 10989 + }, + { + "epoch": 2.9175627240143367, + "grad_norm": 0.45560227157112076, + "learning_rate": 2.624935732924059e-06, + "loss": 0.5711, + "step": 10990 + }, + { + "epoch": 2.917828222487721, + "grad_norm": 0.4507941930167085, + "learning_rate": 2.6245870400231314e-06, + "loss": 0.5478, + "step": 10991 + }, + { + "epoch": 2.9180937209611044, + "grad_norm": 0.44799917897270103, + "learning_rate": 2.624238344692451e-06, + "loss": 0.5883, + "step": 10992 + }, + { + "epoch": 2.918359219434488, + "grad_norm": 0.4608444081651207, + "learning_rate": 2.6238896469388203e-06, + "loss": 0.5838, + "step": 10993 + }, + { + "epoch": 2.918624717907872, + "grad_norm": 0.4445833888366233, + "learning_rate": 2.623540946769037e-06, + "loss": 0.5631, + "step": 10994 + }, + { + "epoch": 2.918890216381256, + "grad_norm": 0.4441179093401636, + "learning_rate": 2.623192244189903e-06, + "loss": 0.5746, + "step": 10995 + }, + { + "epoch": 2.9191557148546394, + "grad_norm": 0.4425067355772803, + "learning_rate": 2.6228435392082187e-06, + "loss": 0.565, + "step": 10996 + }, + { + "epoch": 2.9194212133280235, + "grad_norm": 0.45033100772278595, + "learning_rate": 2.6224948318307846e-06, + "loss": 0.5712, + "step": 10997 + }, + { + "epoch": 2.919686711801407, + "grad_norm": 0.4433845948136801, + "learning_rate": 2.6221461220644017e-06, + "loss": 0.5134, + "step": 10998 + }, + { + "epoch": 2.919952210274791, + "grad_norm": 0.4403039576170179, + "learning_rate": 2.62179740991587e-06, + "loss": 0.5766, + "step": 10999 + }, + { + "epoch": 2.920217708748175, + "grad_norm": 0.4433079939625189, + "learning_rate": 2.6214486953919905e-06, + "loss": 0.5615, + "step": 11000 + }, + { + "epoch": 2.9204832072215585, + "grad_norm": 0.42658088355665535, + "learning_rate": 2.6210999784995643e-06, + "loss": 0.5544, + "step": 11001 + }, + { + "epoch": 2.920748705694942, + "grad_norm": 0.44425514010645295, + "learning_rate": 2.6207512592453917e-06, + "loss": 0.5684, + "step": 11002 + }, + { + "epoch": 2.9210142041683262, + "grad_norm": 0.48177515772204954, + "learning_rate": 2.620402537636274e-06, + "loss": 0.5384, + "step": 11003 + }, + { + "epoch": 2.92127970264171, + "grad_norm": 0.4495142565900942, + "learning_rate": 2.6200538136790123e-06, + "loss": 0.5277, + "step": 11004 + }, + { + "epoch": 2.9215452011150935, + "grad_norm": 0.4631604728166726, + "learning_rate": 2.6197050873804063e-06, + "loss": 0.5524, + "step": 11005 + }, + { + "epoch": 2.9218106995884776, + "grad_norm": 0.4413804730273908, + "learning_rate": 2.6193563587472594e-06, + "loss": 0.5735, + "step": 11006 + }, + { + "epoch": 2.9220761980618613, + "grad_norm": 0.4542835779621849, + "learning_rate": 2.6190076277863706e-06, + "loss": 0.5928, + "step": 11007 + }, + { + "epoch": 2.922341696535245, + "grad_norm": 0.4478147334315177, + "learning_rate": 2.6186588945045417e-06, + "loss": 0.5593, + "step": 11008 + }, + { + "epoch": 2.922607195008629, + "grad_norm": 0.45343390685899143, + "learning_rate": 2.6183101589085735e-06, + "loss": 0.5522, + "step": 11009 + }, + { + "epoch": 2.9228726934820126, + "grad_norm": 0.44374661218782385, + "learning_rate": 2.6179614210052684e-06, + "loss": 0.5702, + "step": 11010 + }, + { + "epoch": 2.9231381919553963, + "grad_norm": 0.4438398912198788, + "learning_rate": 2.617612680801426e-06, + "loss": 0.5263, + "step": 11011 + }, + { + "epoch": 2.92340369042878, + "grad_norm": 0.43049070232928893, + "learning_rate": 2.617263938303849e-06, + "loss": 0.5437, + "step": 11012 + }, + { + "epoch": 2.9236691889021635, + "grad_norm": 0.4294351270321998, + "learning_rate": 2.616915193519337e-06, + "loss": 0.5328, + "step": 11013 + }, + { + "epoch": 2.9239346873755476, + "grad_norm": 0.45029589715941537, + "learning_rate": 2.6165664464546938e-06, + "loss": 0.5778, + "step": 11014 + }, + { + "epoch": 2.9242001858489313, + "grad_norm": 0.43297638513041714, + "learning_rate": 2.6162176971167188e-06, + "loss": 0.5347, + "step": 11015 + }, + { + "epoch": 2.924465684322315, + "grad_norm": 0.44810356443483884, + "learning_rate": 2.6158689455122145e-06, + "loss": 0.5747, + "step": 11016 + }, + { + "epoch": 2.924731182795699, + "grad_norm": 0.4488597834661214, + "learning_rate": 2.615520191647981e-06, + "loss": 0.5714, + "step": 11017 + }, + { + "epoch": 2.9249966812690826, + "grad_norm": 0.4461198690935552, + "learning_rate": 2.615171435530822e-06, + "loss": 0.5371, + "step": 11018 + }, + { + "epoch": 2.9252621797424663, + "grad_norm": 0.4281483751753625, + "learning_rate": 2.6148226771675384e-06, + "loss": 0.5613, + "step": 11019 + }, + { + "epoch": 2.9255276782158504, + "grad_norm": 0.44711760039536225, + "learning_rate": 2.6144739165649303e-06, + "loss": 0.5599, + "step": 11020 + }, + { + "epoch": 2.925793176689234, + "grad_norm": 0.45194636137474825, + "learning_rate": 2.6141251537298012e-06, + "loss": 0.5705, + "step": 11021 + }, + { + "epoch": 2.9260586751626176, + "grad_norm": 0.4549169775333854, + "learning_rate": 2.6137763886689517e-06, + "loss": 0.5363, + "step": 11022 + }, + { + "epoch": 2.9263241736360017, + "grad_norm": 0.4452998104532959, + "learning_rate": 2.613427621389184e-06, + "loss": 0.5263, + "step": 11023 + }, + { + "epoch": 2.9265896721093854, + "grad_norm": 0.43003348577132655, + "learning_rate": 2.613078851897301e-06, + "loss": 0.5648, + "step": 11024 + }, + { + "epoch": 2.926855170582769, + "grad_norm": 0.44389173885840344, + "learning_rate": 2.6127300802001025e-06, + "loss": 0.5466, + "step": 11025 + }, + { + "epoch": 2.927120669056153, + "grad_norm": 0.4287882558063798, + "learning_rate": 2.612381306304392e-06, + "loss": 0.528, + "step": 11026 + }, + { + "epoch": 2.9273861675295367, + "grad_norm": 0.4537860839216422, + "learning_rate": 2.6120325302169707e-06, + "loss": 0.5805, + "step": 11027 + }, + { + "epoch": 2.9276516660029204, + "grad_norm": 0.43466951533592624, + "learning_rate": 2.6116837519446407e-06, + "loss": 0.5524, + "step": 11028 + }, + { + "epoch": 2.9279171644763045, + "grad_norm": 0.43625863951260246, + "learning_rate": 2.611334971494204e-06, + "loss": 0.5735, + "step": 11029 + }, + { + "epoch": 2.928182662949688, + "grad_norm": 0.4389701497554812, + "learning_rate": 2.6109861888724638e-06, + "loss": 0.5218, + "step": 11030 + }, + { + "epoch": 2.9284481614230717, + "grad_norm": 0.4451170867562754, + "learning_rate": 2.6106374040862193e-06, + "loss": 0.5651, + "step": 11031 + }, + { + "epoch": 2.928713659896456, + "grad_norm": 0.444052952661565, + "learning_rate": 2.610288617142275e-06, + "loss": 0.5136, + "step": 11032 + }, + { + "epoch": 2.9289791583698395, + "grad_norm": 0.43545532118872726, + "learning_rate": 2.6099398280474337e-06, + "loss": 0.5682, + "step": 11033 + }, + { + "epoch": 2.929244656843223, + "grad_norm": 0.43339552867254505, + "learning_rate": 2.609591036808496e-06, + "loss": 0.5567, + "step": 11034 + }, + { + "epoch": 2.929510155316607, + "grad_norm": 0.4277448711729227, + "learning_rate": 2.6092422434322646e-06, + "loss": 0.5422, + "step": 11035 + }, + { + "epoch": 2.929775653789991, + "grad_norm": 0.43189615094205963, + "learning_rate": 2.6088934479255425e-06, + "loss": 0.5608, + "step": 11036 + }, + { + "epoch": 2.9300411522633745, + "grad_norm": 0.4349166586562321, + "learning_rate": 2.6085446502951313e-06, + "loss": 0.5619, + "step": 11037 + }, + { + "epoch": 2.930306650736758, + "grad_norm": 0.4355390012269658, + "learning_rate": 2.6081958505478332e-06, + "loss": 0.5342, + "step": 11038 + }, + { + "epoch": 2.930572149210142, + "grad_norm": 0.4441870301727794, + "learning_rate": 2.6078470486904515e-06, + "loss": 0.5557, + "step": 11039 + }, + { + "epoch": 2.930837647683526, + "grad_norm": 0.4525027666561795, + "learning_rate": 2.607498244729788e-06, + "loss": 0.5649, + "step": 11040 + }, + { + "epoch": 2.9311031461569095, + "grad_norm": 0.45828534235312707, + "learning_rate": 2.6071494386726458e-06, + "loss": 0.521, + "step": 11041 + }, + { + "epoch": 2.931368644630293, + "grad_norm": 0.4494191794458083, + "learning_rate": 2.606800630525827e-06, + "loss": 0.5422, + "step": 11042 + }, + { + "epoch": 2.931634143103677, + "grad_norm": 0.4392500616799736, + "learning_rate": 2.6064518202961347e-06, + "loss": 0.5353, + "step": 11043 + }, + { + "epoch": 2.931899641577061, + "grad_norm": 0.4341415948515654, + "learning_rate": 2.606103007990371e-06, + "loss": 0.5886, + "step": 11044 + }, + { + "epoch": 2.9321651400504445, + "grad_norm": 0.45245781210341557, + "learning_rate": 2.6057541936153392e-06, + "loss": 0.577, + "step": 11045 + }, + { + "epoch": 2.9324306385238286, + "grad_norm": 0.4784986902228421, + "learning_rate": 2.6054053771778413e-06, + "loss": 0.5643, + "step": 11046 + }, + { + "epoch": 2.932696136997212, + "grad_norm": 0.44408022124947666, + "learning_rate": 2.605056558684681e-06, + "loss": 0.5473, + "step": 11047 + }, + { + "epoch": 2.932961635470596, + "grad_norm": 0.43647939474380354, + "learning_rate": 2.60470773814266e-06, + "loss": 0.5407, + "step": 11048 + }, + { + "epoch": 2.93322713394398, + "grad_norm": 0.4519577409803698, + "learning_rate": 2.604358915558582e-06, + "loss": 0.5622, + "step": 11049 + }, + { + "epoch": 2.9334926324173636, + "grad_norm": 0.45201614717443855, + "learning_rate": 2.6040100909392498e-06, + "loss": 0.5651, + "step": 11050 + }, + { + "epoch": 2.933758130890747, + "grad_norm": 0.44193766833575415, + "learning_rate": 2.6036612642914656e-06, + "loss": 0.5925, + "step": 11051 + }, + { + "epoch": 2.9340236293641313, + "grad_norm": 0.4574321497212871, + "learning_rate": 2.603312435622033e-06, + "loss": 0.5609, + "step": 11052 + }, + { + "epoch": 2.934289127837515, + "grad_norm": 0.450262875763668, + "learning_rate": 2.602963604937756e-06, + "loss": 0.615, + "step": 11053 + }, + { + "epoch": 2.9345546263108986, + "grad_norm": 0.4518255935760746, + "learning_rate": 2.6026147722454355e-06, + "loss": 0.5779, + "step": 11054 + }, + { + "epoch": 2.9348201247842827, + "grad_norm": 0.4480873814632304, + "learning_rate": 2.602265937551876e-06, + "loss": 0.5265, + "step": 11055 + }, + { + "epoch": 2.9350856232576663, + "grad_norm": 0.4428034290581282, + "learning_rate": 2.601917100863881e-06, + "loss": 0.5916, + "step": 11056 + }, + { + "epoch": 2.93535112173105, + "grad_norm": 0.44940435041218696, + "learning_rate": 2.6015682621882523e-06, + "loss": 0.5668, + "step": 11057 + }, + { + "epoch": 2.935616620204434, + "grad_norm": 0.4399758352164834, + "learning_rate": 2.6012194215317938e-06, + "loss": 0.5799, + "step": 11058 + }, + { + "epoch": 2.9358821186778177, + "grad_norm": 0.43656769776092696, + "learning_rate": 2.6008705789013084e-06, + "loss": 0.5601, + "step": 11059 + }, + { + "epoch": 2.9361476171512013, + "grad_norm": 0.43168107847423265, + "learning_rate": 2.6005217343035997e-06, + "loss": 0.587, + "step": 11060 + }, + { + "epoch": 2.9364131156245854, + "grad_norm": 0.4325850623991144, + "learning_rate": 2.6001728877454717e-06, + "loss": 0.5576, + "step": 11061 + }, + { + "epoch": 2.936678614097969, + "grad_norm": 0.4351396388721992, + "learning_rate": 2.5998240392337274e-06, + "loss": 0.5458, + "step": 11062 + }, + { + "epoch": 2.9369441125713527, + "grad_norm": 0.4328401611566337, + "learning_rate": 2.5994751887751694e-06, + "loss": 0.5291, + "step": 11063 + }, + { + "epoch": 2.9372096110447368, + "grad_norm": 0.44967691269533355, + "learning_rate": 2.599126336376602e-06, + "loss": 0.5631, + "step": 11064 + }, + { + "epoch": 2.9374751095181204, + "grad_norm": 0.44010537129460875, + "learning_rate": 2.5987774820448282e-06, + "loss": 0.5743, + "step": 11065 + }, + { + "epoch": 2.937740607991504, + "grad_norm": 0.4550942766624417, + "learning_rate": 2.5984286257866513e-06, + "loss": 0.5237, + "step": 11066 + }, + { + "epoch": 2.9380061064648877, + "grad_norm": 0.45081598069681766, + "learning_rate": 2.5980797676088754e-06, + "loss": 0.5456, + "step": 11067 + }, + { + "epoch": 2.9382716049382713, + "grad_norm": 0.4480864310945247, + "learning_rate": 2.597730907518304e-06, + "loss": 0.5917, + "step": 11068 + }, + { + "epoch": 2.9385371034116554, + "grad_norm": 0.4550612706211758, + "learning_rate": 2.5973820455217404e-06, + "loss": 0.537, + "step": 11069 + }, + { + "epoch": 2.938802601885039, + "grad_norm": 0.44689868514503867, + "learning_rate": 2.5970331816259885e-06, + "loss": 0.562, + "step": 11070 + }, + { + "epoch": 2.9390681003584227, + "grad_norm": 0.4562716141300421, + "learning_rate": 2.5966843158378524e-06, + "loss": 0.5601, + "step": 11071 + }, + { + "epoch": 2.939333598831807, + "grad_norm": 0.4554198177235054, + "learning_rate": 2.596335448164135e-06, + "loss": 0.5914, + "step": 11072 + }, + { + "epoch": 2.9395990973051904, + "grad_norm": 0.42981824147669506, + "learning_rate": 2.5959865786116412e-06, + "loss": 0.5554, + "step": 11073 + }, + { + "epoch": 2.939864595778574, + "grad_norm": 0.44808024991165263, + "learning_rate": 2.595637707187173e-06, + "loss": 0.5871, + "step": 11074 + }, + { + "epoch": 2.940130094251958, + "grad_norm": 0.4543846402797517, + "learning_rate": 2.5952888338975356e-06, + "loss": 0.5888, + "step": 11075 + }, + { + "epoch": 2.940395592725342, + "grad_norm": 0.43603148151115634, + "learning_rate": 2.5949399587495333e-06, + "loss": 0.5759, + "step": 11076 + }, + { + "epoch": 2.9406610911987254, + "grad_norm": 0.4365660463555581, + "learning_rate": 2.5945910817499687e-06, + "loss": 0.5265, + "step": 11077 + }, + { + "epoch": 2.9409265896721095, + "grad_norm": 0.44096177315977575, + "learning_rate": 2.5942422029056464e-06, + "loss": 0.5612, + "step": 11078 + }, + { + "epoch": 2.941192088145493, + "grad_norm": 0.4436147080214331, + "learning_rate": 2.593893322223371e-06, + "loss": 0.5613, + "step": 11079 + }, + { + "epoch": 2.941457586618877, + "grad_norm": 0.4280195742595993, + "learning_rate": 2.593544439709945e-06, + "loss": 0.5553, + "step": 11080 + }, + { + "epoch": 2.941723085092261, + "grad_norm": 0.44350844930564665, + "learning_rate": 2.5931955553721745e-06, + "loss": 0.5442, + "step": 11081 + }, + { + "epoch": 2.9419885835656445, + "grad_norm": 0.43582046238552663, + "learning_rate": 2.592846669216862e-06, + "loss": 0.5755, + "step": 11082 + }, + { + "epoch": 2.942254082039028, + "grad_norm": 0.44460755027030535, + "learning_rate": 2.5924977812508124e-06, + "loss": 0.5743, + "step": 11083 + }, + { + "epoch": 2.9425195805124122, + "grad_norm": 0.4451981153873848, + "learning_rate": 2.592148891480829e-06, + "loss": 0.565, + "step": 11084 + }, + { + "epoch": 2.942785078985796, + "grad_norm": 0.43662527942919516, + "learning_rate": 2.5917999999137173e-06, + "loss": 0.5599, + "step": 11085 + }, + { + "epoch": 2.9430505774591795, + "grad_norm": 0.44888326471622875, + "learning_rate": 2.5914511065562804e-06, + "loss": 0.5437, + "step": 11086 + }, + { + "epoch": 2.9433160759325636, + "grad_norm": 0.43990252224141785, + "learning_rate": 2.5911022114153224e-06, + "loss": 0.5601, + "step": 11087 + }, + { + "epoch": 2.9435815744059473, + "grad_norm": 0.4423886506568044, + "learning_rate": 2.590753314497649e-06, + "loss": 0.5671, + "step": 11088 + }, + { + "epoch": 2.943847072879331, + "grad_norm": 0.43678266037343344, + "learning_rate": 2.590404415810063e-06, + "loss": 0.5929, + "step": 11089 + }, + { + "epoch": 2.944112571352715, + "grad_norm": 0.4418658032055756, + "learning_rate": 2.5900555153593703e-06, + "loss": 0.5356, + "step": 11090 + }, + { + "epoch": 2.9443780698260986, + "grad_norm": 0.4483786560026015, + "learning_rate": 2.589706613152375e-06, + "loss": 0.5808, + "step": 11091 + }, + { + "epoch": 2.9446435682994823, + "grad_norm": 0.4491662270321082, + "learning_rate": 2.5893577091958805e-06, + "loss": 0.5908, + "step": 11092 + }, + { + "epoch": 2.944909066772866, + "grad_norm": 0.44214251950510036, + "learning_rate": 2.5890088034966916e-06, + "loss": 0.5513, + "step": 11093 + }, + { + "epoch": 2.94517456524625, + "grad_norm": 0.44011214924044634, + "learning_rate": 2.5886598960616137e-06, + "loss": 0.5426, + "step": 11094 + }, + { + "epoch": 2.9454400637196336, + "grad_norm": 0.4407352441614994, + "learning_rate": 2.5883109868974506e-06, + "loss": 0.541, + "step": 11095 + }, + { + "epoch": 2.9457055621930173, + "grad_norm": 0.4504348396439044, + "learning_rate": 2.5879620760110065e-06, + "loss": 0.5432, + "step": 11096 + }, + { + "epoch": 2.945971060666401, + "grad_norm": 0.45832096799768707, + "learning_rate": 2.587613163409087e-06, + "loss": 0.5929, + "step": 11097 + }, + { + "epoch": 2.946236559139785, + "grad_norm": 0.4436309234137818, + "learning_rate": 2.5872642490984955e-06, + "loss": 0.5539, + "step": 11098 + }, + { + "epoch": 2.9465020576131686, + "grad_norm": 0.4512333676411674, + "learning_rate": 2.5869153330860393e-06, + "loss": 0.6047, + "step": 11099 + }, + { + "epoch": 2.9467675560865523, + "grad_norm": 0.45423819386854863, + "learning_rate": 2.5865664153785197e-06, + "loss": 0.5884, + "step": 11100 + }, + { + "epoch": 2.9470330545599364, + "grad_norm": 0.4558499481753614, + "learning_rate": 2.5862174959827437e-06, + "loss": 0.5641, + "step": 11101 + }, + { + "epoch": 2.94729855303332, + "grad_norm": 0.449877721033026, + "learning_rate": 2.5858685749055157e-06, + "loss": 0.5557, + "step": 11102 + }, + { + "epoch": 2.9475640515067036, + "grad_norm": 0.4346694747806558, + "learning_rate": 2.5855196521536397e-06, + "loss": 0.5674, + "step": 11103 + }, + { + "epoch": 2.9478295499800877, + "grad_norm": 0.4391238912896865, + "learning_rate": 2.585170727733921e-06, + "loss": 0.5483, + "step": 11104 + }, + { + "epoch": 2.9480950484534714, + "grad_norm": 0.4323549237160803, + "learning_rate": 2.5848218016531645e-06, + "loss": 0.5355, + "step": 11105 + }, + { + "epoch": 2.948360546926855, + "grad_norm": 0.43725840323667203, + "learning_rate": 2.584472873918176e-06, + "loss": 0.5308, + "step": 11106 + }, + { + "epoch": 2.948626045400239, + "grad_norm": 0.4456587492658111, + "learning_rate": 2.5841239445357585e-06, + "loss": 0.5741, + "step": 11107 + }, + { + "epoch": 2.9488915438736227, + "grad_norm": 0.4450286369348951, + "learning_rate": 2.5837750135127192e-06, + "loss": 0.5413, + "step": 11108 + }, + { + "epoch": 2.9491570423470064, + "grad_norm": 0.43196300493471296, + "learning_rate": 2.5834260808558614e-06, + "loss": 0.5783, + "step": 11109 + }, + { + "epoch": 2.9494225408203905, + "grad_norm": 0.46152166268122263, + "learning_rate": 2.5830771465719905e-06, + "loss": 0.6018, + "step": 11110 + }, + { + "epoch": 2.949688039293774, + "grad_norm": 0.44343546298115655, + "learning_rate": 2.582728210667913e-06, + "loss": 0.5831, + "step": 11111 + }, + { + "epoch": 2.9499535377671577, + "grad_norm": 0.4274778167214706, + "learning_rate": 2.5823792731504315e-06, + "loss": 0.5372, + "step": 11112 + }, + { + "epoch": 2.950219036240542, + "grad_norm": 0.4261531732814335, + "learning_rate": 2.582030334026353e-06, + "loss": 0.5071, + "step": 11113 + }, + { + "epoch": 2.9504845347139255, + "grad_norm": 0.4542437566840947, + "learning_rate": 2.5816813933024826e-06, + "loss": 0.5482, + "step": 11114 + }, + { + "epoch": 2.950750033187309, + "grad_norm": 0.45177727370652365, + "learning_rate": 2.581332450985624e-06, + "loss": 0.5661, + "step": 11115 + }, + { + "epoch": 2.951015531660693, + "grad_norm": 0.4460958136793645, + "learning_rate": 2.5809835070825844e-06, + "loss": 0.5523, + "step": 11116 + }, + { + "epoch": 2.951281030134077, + "grad_norm": 0.4475475966169349, + "learning_rate": 2.5806345616001684e-06, + "loss": 0.5379, + "step": 11117 + }, + { + "epoch": 2.9515465286074605, + "grad_norm": 0.4545354377571334, + "learning_rate": 2.58028561454518e-06, + "loss": 0.512, + "step": 11118 + }, + { + "epoch": 2.9518120270808446, + "grad_norm": 0.4411803945683953, + "learning_rate": 2.5799366659244263e-06, + "loss": 0.5623, + "step": 11119 + }, + { + "epoch": 2.952077525554228, + "grad_norm": 0.45280608820145873, + "learning_rate": 2.579587715744712e-06, + "loss": 0.5693, + "step": 11120 + }, + { + "epoch": 2.952343024027612, + "grad_norm": 0.4474194631108395, + "learning_rate": 2.5792387640128413e-06, + "loss": 0.5775, + "step": 11121 + }, + { + "epoch": 2.9526085225009955, + "grad_norm": 0.43979584780312064, + "learning_rate": 2.5788898107356215e-06, + "loss": 0.5617, + "step": 11122 + }, + { + "epoch": 2.952874020974379, + "grad_norm": 0.4509841326931812, + "learning_rate": 2.578540855919857e-06, + "loss": 0.5663, + "step": 11123 + }, + { + "epoch": 2.953139519447763, + "grad_norm": 0.4650130743622842, + "learning_rate": 2.578191899572353e-06, + "loss": 0.5489, + "step": 11124 + }, + { + "epoch": 2.953405017921147, + "grad_norm": 0.43224846161431096, + "learning_rate": 2.577842941699917e-06, + "loss": 0.5311, + "step": 11125 + }, + { + "epoch": 2.9536705163945305, + "grad_norm": 0.42911234841879764, + "learning_rate": 2.577493982309352e-06, + "loss": 0.5511, + "step": 11126 + }, + { + "epoch": 2.9539360148679146, + "grad_norm": 0.4297791775910588, + "learning_rate": 2.5771450214074644e-06, + "loss": 0.527, + "step": 11127 + }, + { + "epoch": 2.954201513341298, + "grad_norm": 0.4443164092023868, + "learning_rate": 2.576796059001061e-06, + "loss": 0.5709, + "step": 11128 + }, + { + "epoch": 2.954467011814682, + "grad_norm": 0.45503776831619025, + "learning_rate": 2.5764470950969457e-06, + "loss": 0.5533, + "step": 11129 + }, + { + "epoch": 2.954732510288066, + "grad_norm": 0.4585870546941405, + "learning_rate": 2.576098129701925e-06, + "loss": 0.5338, + "step": 11130 + }, + { + "epoch": 2.9549980087614496, + "grad_norm": 0.41715428023443785, + "learning_rate": 2.575749162822805e-06, + "loss": 0.5237, + "step": 11131 + }, + { + "epoch": 2.955263507234833, + "grad_norm": 0.4537409310070673, + "learning_rate": 2.57540019446639e-06, + "loss": 0.5458, + "step": 11132 + }, + { + "epoch": 2.9555290057082173, + "grad_norm": 0.4460184122879938, + "learning_rate": 2.575051224639486e-06, + "loss": 0.584, + "step": 11133 + }, + { + "epoch": 2.955794504181601, + "grad_norm": 0.45158707179764196, + "learning_rate": 2.5747022533489006e-06, + "loss": 0.5677, + "step": 11134 + }, + { + "epoch": 2.9560600026549846, + "grad_norm": 0.4564504884895011, + "learning_rate": 2.574353280601438e-06, + "loss": 0.5886, + "step": 11135 + }, + { + "epoch": 2.9563255011283687, + "grad_norm": 0.4499236932639771, + "learning_rate": 2.5740043064039043e-06, + "loss": 0.5735, + "step": 11136 + }, + { + "epoch": 2.9565909996017523, + "grad_norm": 0.4448097964990401, + "learning_rate": 2.5736553307631055e-06, + "loss": 0.5739, + "step": 11137 + }, + { + "epoch": 2.956856498075136, + "grad_norm": 0.44452156301113244, + "learning_rate": 2.5733063536858477e-06, + "loss": 0.5546, + "step": 11138 + }, + { + "epoch": 2.95712199654852, + "grad_norm": 0.4651078589480799, + "learning_rate": 2.572957375178936e-06, + "loss": 0.5833, + "step": 11139 + }, + { + "epoch": 2.9573874950219037, + "grad_norm": 0.44711143349524607, + "learning_rate": 2.5726083952491774e-06, + "loss": 0.5643, + "step": 11140 + }, + { + "epoch": 2.9576529934952873, + "grad_norm": 0.4360262731811912, + "learning_rate": 2.5722594139033767e-06, + "loss": 0.5563, + "step": 11141 + }, + { + "epoch": 2.9579184919686714, + "grad_norm": 0.44083417525704455, + "learning_rate": 2.571910431148341e-06, + "loss": 0.5837, + "step": 11142 + }, + { + "epoch": 2.958183990442055, + "grad_norm": 0.45805104980099365, + "learning_rate": 2.5715614469908754e-06, + "loss": 0.5638, + "step": 11143 + }, + { + "epoch": 2.9584494889154387, + "grad_norm": 0.4824903499669946, + "learning_rate": 2.5712124614377863e-06, + "loss": 0.5778, + "step": 11144 + }, + { + "epoch": 2.9587149873888228, + "grad_norm": 0.44034568417154685, + "learning_rate": 2.570863474495881e-06, + "loss": 0.61, + "step": 11145 + }, + { + "epoch": 2.9589804858622064, + "grad_norm": 0.4442593600810979, + "learning_rate": 2.5705144861719634e-06, + "loss": 0.5399, + "step": 11146 + }, + { + "epoch": 2.95924598433559, + "grad_norm": 0.44356610250779144, + "learning_rate": 2.5701654964728406e-06, + "loss": 0.5627, + "step": 11147 + }, + { + "epoch": 2.9595114828089737, + "grad_norm": 0.4370759971601249, + "learning_rate": 2.5698165054053197e-06, + "loss": 0.557, + "step": 11148 + }, + { + "epoch": 2.9597769812823578, + "grad_norm": 0.44161034389561826, + "learning_rate": 2.5694675129762055e-06, + "loss": 0.5548, + "step": 11149 + }, + { + "epoch": 2.9600424797557414, + "grad_norm": 0.4473385732591696, + "learning_rate": 2.569118519192304e-06, + "loss": 0.5403, + "step": 11150 + }, + { + "epoch": 2.960307978229125, + "grad_norm": 0.44767028642156964, + "learning_rate": 2.5687695240604225e-06, + "loss": 0.5516, + "step": 11151 + }, + { + "epoch": 2.9605734767025087, + "grad_norm": 0.4578836447155603, + "learning_rate": 2.5684205275873677e-06, + "loss": 0.5414, + "step": 11152 + }, + { + "epoch": 2.960838975175893, + "grad_norm": 0.45567472316892366, + "learning_rate": 2.568071529779944e-06, + "loss": 0.5698, + "step": 11153 + }, + { + "epoch": 2.9611044736492764, + "grad_norm": 0.44852035417122443, + "learning_rate": 2.56772253064496e-06, + "loss": 0.5387, + "step": 11154 + }, + { + "epoch": 2.96136997212266, + "grad_norm": 0.45738140434987584, + "learning_rate": 2.56737353018922e-06, + "loss": 0.5341, + "step": 11155 + }, + { + "epoch": 2.961635470596044, + "grad_norm": 0.4444677584711385, + "learning_rate": 2.567024528419531e-06, + "loss": 0.5276, + "step": 11156 + }, + { + "epoch": 2.961900969069428, + "grad_norm": 0.4585276386673583, + "learning_rate": 2.5666755253427006e-06, + "loss": 0.5658, + "step": 11157 + }, + { + "epoch": 2.9621664675428114, + "grad_norm": 0.4457923312854235, + "learning_rate": 2.5663265209655337e-06, + "loss": 0.5474, + "step": 11158 + }, + { + "epoch": 2.9624319660161955, + "grad_norm": 0.44070254998793534, + "learning_rate": 2.565977515294837e-06, + "loss": 0.5643, + "step": 11159 + }, + { + "epoch": 2.962697464489579, + "grad_norm": 0.4482683100572228, + "learning_rate": 2.5656285083374165e-06, + "loss": 0.5086, + "step": 11160 + }, + { + "epoch": 2.962962962962963, + "grad_norm": 0.42554826850508143, + "learning_rate": 2.5652795001000807e-06, + "loss": 0.5073, + "step": 11161 + }, + { + "epoch": 2.963228461436347, + "grad_norm": 0.44158487755664483, + "learning_rate": 2.5649304905896337e-06, + "loss": 0.5523, + "step": 11162 + }, + { + "epoch": 2.9634939599097305, + "grad_norm": 0.4308073564463114, + "learning_rate": 2.564581479812884e-06, + "loss": 0.5206, + "step": 11163 + }, + { + "epoch": 2.963759458383114, + "grad_norm": 0.44033185425820426, + "learning_rate": 2.564232467776638e-06, + "loss": 0.5759, + "step": 11164 + }, + { + "epoch": 2.9640249568564982, + "grad_norm": 0.4433405727176709, + "learning_rate": 2.5638834544877e-06, + "loss": 0.5296, + "step": 11165 + }, + { + "epoch": 2.964290455329882, + "grad_norm": 0.43797905620138805, + "learning_rate": 2.5635344399528792e-06, + "loss": 0.5272, + "step": 11166 + }, + { + "epoch": 2.9645559538032655, + "grad_norm": 0.44396817666226107, + "learning_rate": 2.563185424178981e-06, + "loss": 0.5521, + "step": 11167 + }, + { + "epoch": 2.9648214522766496, + "grad_norm": 0.4362669506909384, + "learning_rate": 2.5628364071728124e-06, + "loss": 0.5759, + "step": 11168 + }, + { + "epoch": 2.9650869507500333, + "grad_norm": 0.4427213236912875, + "learning_rate": 2.56248738894118e-06, + "loss": 0.5669, + "step": 11169 + }, + { + "epoch": 2.965352449223417, + "grad_norm": 0.44995568915927986, + "learning_rate": 2.56213836949089e-06, + "loss": 0.5442, + "step": 11170 + }, + { + "epoch": 2.965617947696801, + "grad_norm": 0.444973114730161, + "learning_rate": 2.561789348828751e-06, + "loss": 0.5112, + "step": 11171 + }, + { + "epoch": 2.9658834461701846, + "grad_norm": 0.4452493064924916, + "learning_rate": 2.561440326961567e-06, + "loss": 0.54, + "step": 11172 + }, + { + "epoch": 2.9661489446435683, + "grad_norm": 0.4434570878155933, + "learning_rate": 2.561091303896147e-06, + "loss": 0.5669, + "step": 11173 + }, + { + "epoch": 2.9664144431169523, + "grad_norm": 0.4491126738307586, + "learning_rate": 2.5607422796392973e-06, + "loss": 0.532, + "step": 11174 + }, + { + "epoch": 2.966679941590336, + "grad_norm": 0.44919851989330994, + "learning_rate": 2.5603932541978232e-06, + "loss": 0.5703, + "step": 11175 + }, + { + "epoch": 2.9669454400637196, + "grad_norm": 0.45269339791523555, + "learning_rate": 2.5600442275785337e-06, + "loss": 0.565, + "step": 11176 + }, + { + "epoch": 2.9672109385371033, + "grad_norm": 0.4475549090862826, + "learning_rate": 2.5596951997882348e-06, + "loss": 0.5552, + "step": 11177 + }, + { + "epoch": 2.9674764370104874, + "grad_norm": 0.43623797402419323, + "learning_rate": 2.5593461708337326e-06, + "loss": 0.532, + "step": 11178 + }, + { + "epoch": 2.967741935483871, + "grad_norm": 0.4439721671894238, + "learning_rate": 2.558997140721835e-06, + "loss": 0.5377, + "step": 11179 + }, + { + "epoch": 2.9680074339572546, + "grad_norm": 0.4574268675798015, + "learning_rate": 2.5586481094593492e-06, + "loss": 0.5737, + "step": 11180 + }, + { + "epoch": 2.9682729324306383, + "grad_norm": 0.4368260741932774, + "learning_rate": 2.5582990770530815e-06, + "loss": 0.5712, + "step": 11181 + }, + { + "epoch": 2.9685384309040224, + "grad_norm": 0.44386028892392754, + "learning_rate": 2.5579500435098385e-06, + "loss": 0.5633, + "step": 11182 + }, + { + "epoch": 2.968803929377406, + "grad_norm": 0.4404921604657042, + "learning_rate": 2.557601008836429e-06, + "loss": 0.5375, + "step": 11183 + }, + { + "epoch": 2.9690694278507896, + "grad_norm": 0.4403208807612028, + "learning_rate": 2.557251973039658e-06, + "loss": 0.513, + "step": 11184 + }, + { + "epoch": 2.9693349263241737, + "grad_norm": 0.4474727524533836, + "learning_rate": 2.5569029361263325e-06, + "loss": 0.5774, + "step": 11185 + }, + { + "epoch": 2.9696004247975574, + "grad_norm": 0.4384155398363317, + "learning_rate": 2.556553898103262e-06, + "loss": 0.566, + "step": 11186 + }, + { + "epoch": 2.969865923270941, + "grad_norm": 0.4425439239664622, + "learning_rate": 2.556204858977251e-06, + "loss": 0.5788, + "step": 11187 + }, + { + "epoch": 2.970131421744325, + "grad_norm": 0.4364060757202946, + "learning_rate": 2.555855818755108e-06, + "loss": 0.5988, + "step": 11188 + }, + { + "epoch": 2.9703969202177087, + "grad_norm": 0.4404714440366446, + "learning_rate": 2.5555067774436393e-06, + "loss": 0.5453, + "step": 11189 + }, + { + "epoch": 2.9706624186910924, + "grad_norm": 0.43708389691028626, + "learning_rate": 2.5551577350496533e-06, + "loss": 0.551, + "step": 11190 + }, + { + "epoch": 2.9709279171644765, + "grad_norm": 0.452898497623725, + "learning_rate": 2.5548086915799562e-06, + "loss": 0.5864, + "step": 11191 + }, + { + "epoch": 2.97119341563786, + "grad_norm": 0.44190403868135003, + "learning_rate": 2.5544596470413552e-06, + "loss": 0.5398, + "step": 11192 + }, + { + "epoch": 2.9714589141112437, + "grad_norm": 0.4457609789472404, + "learning_rate": 2.5541106014406575e-06, + "loss": 0.5442, + "step": 11193 + }, + { + "epoch": 2.971724412584628, + "grad_norm": 0.4327364815287397, + "learning_rate": 2.553761554784671e-06, + "loss": 0.567, + "step": 11194 + }, + { + "epoch": 2.9719899110580115, + "grad_norm": 0.42877446518098794, + "learning_rate": 2.553412507080203e-06, + "loss": 0.5107, + "step": 11195 + }, + { + "epoch": 2.972255409531395, + "grad_norm": 0.4419232045802643, + "learning_rate": 2.553063458334059e-06, + "loss": 0.5847, + "step": 11196 + }, + { + "epoch": 2.972520908004779, + "grad_norm": 0.44555691797066427, + "learning_rate": 2.5527144085530485e-06, + "loss": 0.5535, + "step": 11197 + }, + { + "epoch": 2.972786406478163, + "grad_norm": 0.44789682559753, + "learning_rate": 2.5523653577439778e-06, + "loss": 0.5201, + "step": 11198 + }, + { + "epoch": 2.9730519049515465, + "grad_norm": 0.4371590915579939, + "learning_rate": 2.5520163059136543e-06, + "loss": 0.5367, + "step": 11199 + }, + { + "epoch": 2.9733174034249306, + "grad_norm": 0.4566851955363735, + "learning_rate": 2.5516672530688864e-06, + "loss": 0.5719, + "step": 11200 + }, + { + "epoch": 2.973582901898314, + "grad_norm": 0.4546836277741624, + "learning_rate": 2.5513181992164797e-06, + "loss": 0.5584, + "step": 11201 + }, + { + "epoch": 2.973848400371698, + "grad_norm": 0.4672614081173667, + "learning_rate": 2.550969144363242e-06, + "loss": 0.5675, + "step": 11202 + }, + { + "epoch": 2.9741138988450815, + "grad_norm": 0.45308862699834485, + "learning_rate": 2.5506200885159826e-06, + "loss": 0.6106, + "step": 11203 + }, + { + "epoch": 2.9743793973184656, + "grad_norm": 0.42813710358242996, + "learning_rate": 2.5502710316815066e-06, + "loss": 0.5551, + "step": 11204 + }, + { + "epoch": 2.974644895791849, + "grad_norm": 0.4573943558674008, + "learning_rate": 2.5499219738666226e-06, + "loss": 0.6021, + "step": 11205 + }, + { + "epoch": 2.974910394265233, + "grad_norm": 0.441446808633667, + "learning_rate": 2.5495729150781373e-06, + "loss": 0.5412, + "step": 11206 + }, + { + "epoch": 2.9751758927386165, + "grad_norm": 0.44655667804289145, + "learning_rate": 2.5492238553228592e-06, + "loss": 0.597, + "step": 11207 + }, + { + "epoch": 2.9754413912120006, + "grad_norm": 0.4427038334510096, + "learning_rate": 2.5488747946075955e-06, + "loss": 0.5816, + "step": 11208 + }, + { + "epoch": 2.975706889685384, + "grad_norm": 0.4373391851956795, + "learning_rate": 2.5485257329391537e-06, + "loss": 0.5514, + "step": 11209 + }, + { + "epoch": 2.975972388158768, + "grad_norm": 0.45132898161562895, + "learning_rate": 2.5481766703243415e-06, + "loss": 0.5598, + "step": 11210 + }, + { + "epoch": 2.976237886632152, + "grad_norm": 0.446054565972664, + "learning_rate": 2.547827606769966e-06, + "loss": 0.5303, + "step": 11211 + }, + { + "epoch": 2.9765033851055356, + "grad_norm": 0.44279398414427396, + "learning_rate": 2.5474785422828354e-06, + "loss": 0.5809, + "step": 11212 + }, + { + "epoch": 2.9767688835789192, + "grad_norm": 0.44780435073734165, + "learning_rate": 2.5471294768697562e-06, + "loss": 0.5702, + "step": 11213 + }, + { + "epoch": 2.9770343820523033, + "grad_norm": 0.4491173114669352, + "learning_rate": 2.546780410537537e-06, + "loss": 0.5887, + "step": 11214 + }, + { + "epoch": 2.977299880525687, + "grad_norm": 0.4363332678582455, + "learning_rate": 2.5464313432929855e-06, + "loss": 0.5657, + "step": 11215 + }, + { + "epoch": 2.9775653789990706, + "grad_norm": 0.4379776537135228, + "learning_rate": 2.546082275142909e-06, + "loss": 0.5572, + "step": 11216 + }, + { + "epoch": 2.9778308774724547, + "grad_norm": 0.44327443102852543, + "learning_rate": 2.545733206094116e-06, + "loss": 0.5362, + "step": 11217 + }, + { + "epoch": 2.9780963759458383, + "grad_norm": 0.4451125827389643, + "learning_rate": 2.5453841361534128e-06, + "loss": 0.5724, + "step": 11218 + }, + { + "epoch": 2.978361874419222, + "grad_norm": 0.4455697676567192, + "learning_rate": 2.545035065327608e-06, + "loss": 0.5762, + "step": 11219 + }, + { + "epoch": 2.978627372892606, + "grad_norm": 0.447484012520238, + "learning_rate": 2.5446859936235097e-06, + "loss": 0.5547, + "step": 11220 + }, + { + "epoch": 2.9788928713659897, + "grad_norm": 0.4446284701393264, + "learning_rate": 2.544336921047924e-06, + "loss": 0.5489, + "step": 11221 + }, + { + "epoch": 2.9791583698393733, + "grad_norm": 0.4401726567402075, + "learning_rate": 2.5439878476076604e-06, + "loss": 0.5132, + "step": 11222 + }, + { + "epoch": 2.9794238683127574, + "grad_norm": 0.4322749602899099, + "learning_rate": 2.543638773309526e-06, + "loss": 0.5397, + "step": 11223 + }, + { + "epoch": 2.979689366786141, + "grad_norm": 0.4384372150760701, + "learning_rate": 2.5432896981603285e-06, + "loss": 0.5623, + "step": 11224 + }, + { + "epoch": 2.9799548652595247, + "grad_norm": 0.4431273328655571, + "learning_rate": 2.5429406221668757e-06, + "loss": 0.5351, + "step": 11225 + }, + { + "epoch": 2.9802203637329088, + "grad_norm": 0.4422551689912942, + "learning_rate": 2.5425915453359766e-06, + "loss": 0.5305, + "step": 11226 + }, + { + "epoch": 2.9804858622062924, + "grad_norm": 0.46812144802553396, + "learning_rate": 2.5422424676744373e-06, + "loss": 0.5909, + "step": 11227 + }, + { + "epoch": 2.980751360679676, + "grad_norm": 0.45310916069138585, + "learning_rate": 2.5418933891890665e-06, + "loss": 0.5493, + "step": 11228 + }, + { + "epoch": 2.98101685915306, + "grad_norm": 0.4482978301963584, + "learning_rate": 2.5415443098866727e-06, + "loss": 0.5534, + "step": 11229 + }, + { + "epoch": 2.981282357626444, + "grad_norm": 0.44932228990485235, + "learning_rate": 2.541195229774062e-06, + "loss": 0.5749, + "step": 11230 + }, + { + "epoch": 2.9815478560998274, + "grad_norm": 0.4423315263556984, + "learning_rate": 2.5408461488580444e-06, + "loss": 0.5697, + "step": 11231 + }, + { + "epoch": 2.981813354573211, + "grad_norm": 0.44244361038119795, + "learning_rate": 2.5404970671454272e-06, + "loss": 0.5822, + "step": 11232 + }, + { + "epoch": 2.982078853046595, + "grad_norm": 0.4357755471056965, + "learning_rate": 2.540147984643017e-06, + "loss": 0.5639, + "step": 11233 + }, + { + "epoch": 2.982344351519979, + "grad_norm": 0.43880735247830027, + "learning_rate": 2.5397989013576223e-06, + "loss": 0.5741, + "step": 11234 + }, + { + "epoch": 2.9826098499933624, + "grad_norm": 0.4415388720919388, + "learning_rate": 2.539449817296053e-06, + "loss": 0.5469, + "step": 11235 + }, + { + "epoch": 2.982875348466746, + "grad_norm": 0.44034337085961955, + "learning_rate": 2.539100732465115e-06, + "loss": 0.5689, + "step": 11236 + }, + { + "epoch": 2.98314084694013, + "grad_norm": 0.44986388773439695, + "learning_rate": 2.538751646871617e-06, + "loss": 0.5684, + "step": 11237 + }, + { + "epoch": 2.983406345413514, + "grad_norm": 0.44472683084986, + "learning_rate": 2.538402560522368e-06, + "loss": 0.5702, + "step": 11238 + }, + { + "epoch": 2.9836718438868974, + "grad_norm": 0.45826683300142207, + "learning_rate": 2.5380534734241733e-06, + "loss": 0.6002, + "step": 11239 + }, + { + "epoch": 2.9839373423602815, + "grad_norm": 0.4432175779110682, + "learning_rate": 2.5377043855838446e-06, + "loss": 0.5741, + "step": 11240 + }, + { + "epoch": 2.984202840833665, + "grad_norm": 0.4399840176483594, + "learning_rate": 2.5373552970081866e-06, + "loss": 0.5601, + "step": 11241 + }, + { + "epoch": 2.984468339307049, + "grad_norm": 0.44741860754699964, + "learning_rate": 2.537006207704009e-06, + "loss": 0.5517, + "step": 11242 + }, + { + "epoch": 2.984733837780433, + "grad_norm": 0.44998287673306, + "learning_rate": 2.53665711767812e-06, + "loss": 0.5762, + "step": 11243 + }, + { + "epoch": 2.9849993362538165, + "grad_norm": 0.43809905060054016, + "learning_rate": 2.5363080269373274e-06, + "loss": 0.5629, + "step": 11244 + }, + { + "epoch": 2.9852648347272, + "grad_norm": 0.4448452922040201, + "learning_rate": 2.5359589354884396e-06, + "loss": 0.5761, + "step": 11245 + }, + { + "epoch": 2.9855303332005843, + "grad_norm": 0.449341935571204, + "learning_rate": 2.535609843338265e-06, + "loss": 0.5639, + "step": 11246 + }, + { + "epoch": 2.985795831673968, + "grad_norm": 0.4289894038335368, + "learning_rate": 2.53526075049361e-06, + "loss": 0.5384, + "step": 11247 + }, + { + "epoch": 2.9860613301473515, + "grad_norm": 0.44458250738500216, + "learning_rate": 2.5349116569612846e-06, + "loss": 0.5544, + "step": 11248 + }, + { + "epoch": 2.9863268286207356, + "grad_norm": 0.4401993868245061, + "learning_rate": 2.5345625627480967e-06, + "loss": 0.5586, + "step": 11249 + }, + { + "epoch": 2.9865923270941193, + "grad_norm": 0.4458296969510183, + "learning_rate": 2.5342134678608545e-06, + "loss": 0.5639, + "step": 11250 + }, + { + "epoch": 2.986857825567503, + "grad_norm": 0.45195500940060224, + "learning_rate": 2.533864372306365e-06, + "loss": 0.5373, + "step": 11251 + }, + { + "epoch": 2.987123324040887, + "grad_norm": 0.4365351646667504, + "learning_rate": 2.533515276091437e-06, + "loss": 0.5724, + "step": 11252 + }, + { + "epoch": 2.9873888225142706, + "grad_norm": 0.43290288455455056, + "learning_rate": 2.53316617922288e-06, + "loss": 0.5565, + "step": 11253 + }, + { + "epoch": 2.9876543209876543, + "grad_norm": 0.42887338332466673, + "learning_rate": 2.532817081707501e-06, + "loss": 0.5239, + "step": 11254 + }, + { + "epoch": 2.9879198194610384, + "grad_norm": 0.4449098984978563, + "learning_rate": 2.5324679835521095e-06, + "loss": 0.547, + "step": 11255 + }, + { + "epoch": 2.988185317934422, + "grad_norm": 0.4634421634240405, + "learning_rate": 2.5321188847635114e-06, + "loss": 0.5539, + "step": 11256 + }, + { + "epoch": 2.9884508164078056, + "grad_norm": 0.4373462119829847, + "learning_rate": 2.531769785348517e-06, + "loss": 0.5811, + "step": 11257 + }, + { + "epoch": 2.9887163148811893, + "grad_norm": 0.44354990633169195, + "learning_rate": 2.5314206853139345e-06, + "loss": 0.5576, + "step": 11258 + }, + { + "epoch": 2.9889818133545734, + "grad_norm": 0.44503574667033263, + "learning_rate": 2.5310715846665705e-06, + "loss": 0.5663, + "step": 11259 + }, + { + "epoch": 2.989247311827957, + "grad_norm": 0.4356749565603212, + "learning_rate": 2.5307224834132353e-06, + "loss": 0.5679, + "step": 11260 + }, + { + "epoch": 2.9895128103013406, + "grad_norm": 0.4430748615799877, + "learning_rate": 2.530373381560736e-06, + "loss": 0.5387, + "step": 11261 + }, + { + "epoch": 2.9897783087747243, + "grad_norm": 0.4496859478461454, + "learning_rate": 2.5300242791158817e-06, + "loss": 0.5696, + "step": 11262 + }, + { + "epoch": 2.9900438072481084, + "grad_norm": 0.4479595208313046, + "learning_rate": 2.5296751760854803e-06, + "loss": 0.58, + "step": 11263 + }, + { + "epoch": 2.990309305721492, + "grad_norm": 0.4347060236430883, + "learning_rate": 2.5293260724763415e-06, + "loss": 0.5432, + "step": 11264 + }, + { + "epoch": 2.9905748041948756, + "grad_norm": 0.4587011621086973, + "learning_rate": 2.5289769682952708e-06, + "loss": 0.5768, + "step": 11265 + }, + { + "epoch": 2.9908403026682597, + "grad_norm": 0.45993271716877443, + "learning_rate": 2.5286278635490796e-06, + "loss": 0.5617, + "step": 11266 + }, + { + "epoch": 2.9911058011416434, + "grad_norm": 0.4593435656454506, + "learning_rate": 2.528278758244574e-06, + "loss": 0.6153, + "step": 11267 + }, + { + "epoch": 2.991371299615027, + "grad_norm": 0.45793928278450274, + "learning_rate": 2.5279296523885636e-06, + "loss": 0.5955, + "step": 11268 + }, + { + "epoch": 2.991636798088411, + "grad_norm": 0.4403072305829855, + "learning_rate": 2.527580545987857e-06, + "loss": 0.5283, + "step": 11269 + }, + { + "epoch": 2.9919022965617947, + "grad_norm": 0.4429184407114556, + "learning_rate": 2.5272314390492613e-06, + "loss": 0.5672, + "step": 11270 + }, + { + "epoch": 2.9921677950351784, + "grad_norm": 0.454936784487101, + "learning_rate": 2.5268823315795864e-06, + "loss": 0.5599, + "step": 11271 + }, + { + "epoch": 2.9924332935085625, + "grad_norm": 0.463014701366042, + "learning_rate": 2.526533223585641e-06, + "loss": 0.5506, + "step": 11272 + }, + { + "epoch": 2.992698791981946, + "grad_norm": 0.4334662432865807, + "learning_rate": 2.5261841150742316e-06, + "loss": 0.5611, + "step": 11273 + }, + { + "epoch": 2.9929642904553297, + "grad_norm": 0.4365591613659824, + "learning_rate": 2.5258350060521685e-06, + "loss": 0.5455, + "step": 11274 + }, + { + "epoch": 2.993229788928714, + "grad_norm": 0.4549911110574605, + "learning_rate": 2.5254858965262598e-06, + "loss": 0.5421, + "step": 11275 + }, + { + "epoch": 2.9934952874020975, + "grad_norm": 0.45847394629016647, + "learning_rate": 2.525136786503313e-06, + "loss": 0.5363, + "step": 11276 + }, + { + "epoch": 2.993760785875481, + "grad_norm": 0.49382089052403316, + "learning_rate": 2.5247876759901373e-06, + "loss": 0.5778, + "step": 11277 + }, + { + "epoch": 2.994026284348865, + "grad_norm": 0.4338624895993122, + "learning_rate": 2.524438564993542e-06, + "loss": 0.5517, + "step": 11278 + }, + { + "epoch": 2.994291782822249, + "grad_norm": 0.4533933217734302, + "learning_rate": 2.5240894535203343e-06, + "loss": 0.5032, + "step": 11279 + }, + { + "epoch": 2.9945572812956325, + "grad_norm": 0.4562039665686122, + "learning_rate": 2.5237403415773226e-06, + "loss": 0.544, + "step": 11280 + }, + { + "epoch": 2.9948227797690166, + "grad_norm": 0.45701338523967777, + "learning_rate": 2.5233912291713168e-06, + "loss": 0.5622, + "step": 11281 + }, + { + "epoch": 2.9950882782424, + "grad_norm": 0.4599025888326689, + "learning_rate": 2.5230421163091252e-06, + "loss": 0.5566, + "step": 11282 + }, + { + "epoch": 2.995353776715784, + "grad_norm": 0.4535592936562848, + "learning_rate": 2.5226930029975556e-06, + "loss": 0.573, + "step": 11283 + }, + { + "epoch": 2.995619275189168, + "grad_norm": 0.4381908355222801, + "learning_rate": 2.522343889243417e-06, + "loss": 0.5759, + "step": 11284 + }, + { + "epoch": 2.9958847736625516, + "grad_norm": 0.42897915627450794, + "learning_rate": 2.521994775053517e-06, + "loss": 0.5615, + "step": 11285 + }, + { + "epoch": 2.996150272135935, + "grad_norm": 0.4520380469089561, + "learning_rate": 2.521645660434666e-06, + "loss": 0.5787, + "step": 11286 + }, + { + "epoch": 2.996415770609319, + "grad_norm": 0.4717392481113668, + "learning_rate": 2.5212965453936717e-06, + "loss": 0.5812, + "step": 11287 + }, + { + "epoch": 2.996681269082703, + "grad_norm": 0.4511349127266952, + "learning_rate": 2.520947429937342e-06, + "loss": 0.5641, + "step": 11288 + }, + { + "epoch": 2.9969467675560866, + "grad_norm": 0.45068160769747145, + "learning_rate": 2.520598314072486e-06, + "loss": 0.5666, + "step": 11289 + }, + { + "epoch": 2.99721226602947, + "grad_norm": 0.45313071251959997, + "learning_rate": 2.520249197805913e-06, + "loss": 0.5601, + "step": 11290 + }, + { + "epoch": 2.997477764502854, + "grad_norm": 0.44555595334324477, + "learning_rate": 2.5199000811444305e-06, + "loss": 0.5332, + "step": 11291 + }, + { + "epoch": 2.997743262976238, + "grad_norm": 0.45432961178884324, + "learning_rate": 2.5195509640948486e-06, + "loss": 0.5612, + "step": 11292 + }, + { + "epoch": 2.9980087614496216, + "grad_norm": 0.4428788659288057, + "learning_rate": 2.5192018466639744e-06, + "loss": 0.5613, + "step": 11293 + }, + { + "epoch": 2.9982742599230052, + "grad_norm": 0.45634298802191126, + "learning_rate": 2.518852728858617e-06, + "loss": 0.5894, + "step": 11294 + }, + { + "epoch": 2.9985397583963893, + "grad_norm": 0.4505354320959237, + "learning_rate": 2.518503610685586e-06, + "loss": 0.5787, + "step": 11295 + }, + { + "epoch": 2.998805256869773, + "grad_norm": 0.4419037294560243, + "learning_rate": 2.5181544921516887e-06, + "loss": 0.5518, + "step": 11296 + }, + { + "epoch": 2.9990707553431566, + "grad_norm": 0.4535601858542848, + "learning_rate": 2.517805373263734e-06, + "loss": 0.5382, + "step": 11297 + }, + { + "epoch": 2.9993362538165407, + "grad_norm": 0.43313963798149757, + "learning_rate": 2.5174562540285314e-06, + "loss": 0.5277, + "step": 11298 + }, + { + "epoch": 2.9996017522899243, + "grad_norm": 0.4428862138715205, + "learning_rate": 2.5171071344528893e-06, + "loss": 0.5325, + "step": 11299 + }, + { + "epoch": 2.999867250763308, + "grad_norm": 0.4361884270649867, + "learning_rate": 2.5167580145436153e-06, + "loss": 0.5691, + "step": 11300 + }, + { + "epoch": 3.0, + "grad_norm": 0.4361884270649867, + "learning_rate": 2.5164088943075204e-06, + "loss": 0.5764, + "step": 11301 + }, + { + "epoch": 3.0002654984733836, + "grad_norm": 0.7088511998776494, + "learning_rate": 2.516059773751411e-06, + "loss": 0.5466, + "step": 11302 + }, + { + "epoch": 3.0005309969467677, + "grad_norm": 0.42910669148857644, + "learning_rate": 2.5157106528820967e-06, + "loss": 0.5125, + "step": 11303 + }, + { + "epoch": 3.0007964954201514, + "grad_norm": 0.4374062010008459, + "learning_rate": 2.515361531706387e-06, + "loss": 0.53, + "step": 11304 + }, + { + "epoch": 3.001061993893535, + "grad_norm": 0.4310181577069742, + "learning_rate": 2.5150124102310883e-06, + "loss": 0.5118, + "step": 11305 + }, + { + "epoch": 3.001327492366919, + "grad_norm": 0.44924336997676784, + "learning_rate": 2.5146632884630116e-06, + "loss": 0.5503, + "step": 11306 + }, + { + "epoch": 3.0015929908403027, + "grad_norm": 0.44907038022732887, + "learning_rate": 2.514314166408965e-06, + "loss": 0.5176, + "step": 11307 + }, + { + "epoch": 3.0018584893136864, + "grad_norm": 0.43672237946731995, + "learning_rate": 2.5139650440757567e-06, + "loss": 0.5216, + "step": 11308 + }, + { + "epoch": 3.00212398778707, + "grad_norm": 0.4415531017760964, + "learning_rate": 2.5136159214701965e-06, + "loss": 0.5302, + "step": 11309 + }, + { + "epoch": 3.002389486260454, + "grad_norm": 0.4447553178754593, + "learning_rate": 2.5132667985990926e-06, + "loss": 0.5441, + "step": 11310 + }, + { + "epoch": 3.0026549847338377, + "grad_norm": 0.45334867250251065, + "learning_rate": 2.512917675469253e-06, + "loss": 0.5392, + "step": 11311 + }, + { + "epoch": 3.0029204832072214, + "grad_norm": 0.45426628460225166, + "learning_rate": 2.5125685520874876e-06, + "loss": 0.5706, + "step": 11312 + }, + { + "epoch": 3.0031859816806055, + "grad_norm": 0.4397767439356563, + "learning_rate": 2.5122194284606043e-06, + "loss": 0.5522, + "step": 11313 + }, + { + "epoch": 3.003451480153989, + "grad_norm": 0.44772936916434014, + "learning_rate": 2.5118703045954124e-06, + "loss": 0.5387, + "step": 11314 + }, + { + "epoch": 3.0037169786273727, + "grad_norm": 0.441492028197051, + "learning_rate": 2.5115211804987205e-06, + "loss": 0.5922, + "step": 11315 + }, + { + "epoch": 3.003982477100757, + "grad_norm": 0.4706905287672035, + "learning_rate": 2.5111720561773366e-06, + "loss": 0.537, + "step": 11316 + }, + { + "epoch": 3.0042479755741405, + "grad_norm": 0.45222299309792324, + "learning_rate": 2.5108229316380712e-06, + "loss": 0.5596, + "step": 11317 + }, + { + "epoch": 3.004513474047524, + "grad_norm": 0.45510779905336923, + "learning_rate": 2.510473806887732e-06, + "loss": 0.5387, + "step": 11318 + }, + { + "epoch": 3.004778972520908, + "grad_norm": 0.44521453971224245, + "learning_rate": 2.5101246819331283e-06, + "loss": 0.522, + "step": 11319 + }, + { + "epoch": 3.005044470994292, + "grad_norm": 0.4497136237839376, + "learning_rate": 2.5097755567810677e-06, + "loss": 0.563, + "step": 11320 + }, + { + "epoch": 3.0053099694676755, + "grad_norm": 0.4350823791017069, + "learning_rate": 2.5094264314383602e-06, + "loss": 0.5461, + "step": 11321 + }, + { + "epoch": 3.005575467941059, + "grad_norm": 0.45808616727861845, + "learning_rate": 2.509077305911814e-06, + "loss": 0.5405, + "step": 11322 + }, + { + "epoch": 3.005840966414443, + "grad_norm": 0.4610149900937319, + "learning_rate": 2.508728180208238e-06, + "loss": 0.5188, + "step": 11323 + }, + { + "epoch": 3.006106464887827, + "grad_norm": 0.45516075150157503, + "learning_rate": 2.508379054334442e-06, + "loss": 0.5825, + "step": 11324 + }, + { + "epoch": 3.0063719633612105, + "grad_norm": 0.4459249469923601, + "learning_rate": 2.5080299282972333e-06, + "loss": 0.499, + "step": 11325 + }, + { + "epoch": 3.0066374618345946, + "grad_norm": 0.43985195152481044, + "learning_rate": 2.5076808021034204e-06, + "loss": 0.5645, + "step": 11326 + }, + { + "epoch": 3.006902960307978, + "grad_norm": 0.4498148165442387, + "learning_rate": 2.5073316757598147e-06, + "loss": 0.5106, + "step": 11327 + }, + { + "epoch": 3.007168458781362, + "grad_norm": 0.45112105530185515, + "learning_rate": 2.5069825492732223e-06, + "loss": 0.5428, + "step": 11328 + }, + { + "epoch": 3.007433957254746, + "grad_norm": 0.46333586177429326, + "learning_rate": 2.5066334226504537e-06, + "loss": 0.5325, + "step": 11329 + }, + { + "epoch": 3.0076994557281296, + "grad_norm": 0.43746005869075444, + "learning_rate": 2.506284295898317e-06, + "loss": 0.5593, + "step": 11330 + }, + { + "epoch": 3.007964954201513, + "grad_norm": 0.45077058278880017, + "learning_rate": 2.505935169023621e-06, + "loss": 0.5502, + "step": 11331 + }, + { + "epoch": 3.0082304526748973, + "grad_norm": 0.42898475421673216, + "learning_rate": 2.5055860420331746e-06, + "loss": 0.5427, + "step": 11332 + }, + { + "epoch": 3.008495951148281, + "grad_norm": 0.4527510304832329, + "learning_rate": 2.505236914933787e-06, + "loss": 0.5502, + "step": 11333 + }, + { + "epoch": 3.0087614496216646, + "grad_norm": 0.448571279366419, + "learning_rate": 2.5048877877322665e-06, + "loss": 0.5629, + "step": 11334 + }, + { + "epoch": 3.0090269480950487, + "grad_norm": 0.452246714074237, + "learning_rate": 2.5045386604354223e-06, + "loss": 0.5528, + "step": 11335 + }, + { + "epoch": 3.0092924465684323, + "grad_norm": 0.4530729964734495, + "learning_rate": 2.5041895330500627e-06, + "loss": 0.559, + "step": 11336 + }, + { + "epoch": 3.009557945041816, + "grad_norm": 0.44230561413267777, + "learning_rate": 2.5038404055829967e-06, + "loss": 0.5599, + "step": 11337 + }, + { + "epoch": 3.0098234435151996, + "grad_norm": 0.4420693238205891, + "learning_rate": 2.5034912780410347e-06, + "loss": 0.5585, + "step": 11338 + }, + { + "epoch": 3.0100889419885837, + "grad_norm": 0.45392691094090687, + "learning_rate": 2.503142150430983e-06, + "loss": 0.5341, + "step": 11339 + }, + { + "epoch": 3.0103544404619673, + "grad_norm": 0.4499986656580863, + "learning_rate": 2.5027930227596525e-06, + "loss": 0.5437, + "step": 11340 + }, + { + "epoch": 3.010619938935351, + "grad_norm": 0.4567706752591637, + "learning_rate": 2.502443895033851e-06, + "loss": 0.5411, + "step": 11341 + }, + { + "epoch": 3.010885437408735, + "grad_norm": 0.4302456838102514, + "learning_rate": 2.502094767260387e-06, + "loss": 0.5395, + "step": 11342 + }, + { + "epoch": 3.0111509358821187, + "grad_norm": 0.4436728909899451, + "learning_rate": 2.5017456394460706e-06, + "loss": 0.5393, + "step": 11343 + }, + { + "epoch": 3.0114164343555023, + "grad_norm": 0.44738266338846805, + "learning_rate": 2.5013965115977097e-06, + "loss": 0.5487, + "step": 11344 + }, + { + "epoch": 3.0116819328288864, + "grad_norm": 0.45087354958090115, + "learning_rate": 2.501047383722113e-06, + "loss": 0.5362, + "step": 11345 + }, + { + "epoch": 3.01194743130227, + "grad_norm": 0.46587253926241795, + "learning_rate": 2.5006982558260902e-06, + "loss": 0.5685, + "step": 11346 + }, + { + "epoch": 3.0122129297756537, + "grad_norm": 0.444328133728684, + "learning_rate": 2.50034912791645e-06, + "loss": 0.5585, + "step": 11347 + }, + { + "epoch": 3.0124784282490378, + "grad_norm": 0.44497849621861746, + "learning_rate": 2.5e-06, + "loss": 0.5421, + "step": 11348 + }, + { + "epoch": 3.0127439267224214, + "grad_norm": 0.4577891992918872, + "learning_rate": 2.499650872083551e-06, + "loss": 0.5855, + "step": 11349 + }, + { + "epoch": 3.013009425195805, + "grad_norm": 0.4588169041142913, + "learning_rate": 2.49930174417391e-06, + "loss": 0.5869, + "step": 11350 + }, + { + "epoch": 3.0132749236691887, + "grad_norm": 0.44168509455356286, + "learning_rate": 2.4989526162778876e-06, + "loss": 0.5496, + "step": 11351 + }, + { + "epoch": 3.013540422142573, + "grad_norm": 0.4596326319097752, + "learning_rate": 2.4986034884022916e-06, + "loss": 0.5276, + "step": 11352 + }, + { + "epoch": 3.0138059206159564, + "grad_norm": 0.42844809859535943, + "learning_rate": 2.4982543605539302e-06, + "loss": 0.5446, + "step": 11353 + }, + { + "epoch": 3.01407141908934, + "grad_norm": 0.4422038485215486, + "learning_rate": 2.4979052327396133e-06, + "loss": 0.5294, + "step": 11354 + }, + { + "epoch": 3.014336917562724, + "grad_norm": 0.4564835458031461, + "learning_rate": 2.49755610496615e-06, + "loss": 0.5505, + "step": 11355 + }, + { + "epoch": 3.014602416036108, + "grad_norm": 0.4275397382890219, + "learning_rate": 2.497206977240348e-06, + "loss": 0.533, + "step": 11356 + }, + { + "epoch": 3.0148679145094914, + "grad_norm": 0.462186128208261, + "learning_rate": 2.4968578495690177e-06, + "loss": 0.5554, + "step": 11357 + }, + { + "epoch": 3.0151334129828755, + "grad_norm": 0.4393311992796125, + "learning_rate": 2.4965087219589666e-06, + "loss": 0.5196, + "step": 11358 + }, + { + "epoch": 3.015398911456259, + "grad_norm": 0.44654358100077113, + "learning_rate": 2.4961595944170037e-06, + "loss": 0.5613, + "step": 11359 + }, + { + "epoch": 3.015664409929643, + "grad_norm": 0.46649968442610984, + "learning_rate": 2.495810466949938e-06, + "loss": 0.554, + "step": 11360 + }, + { + "epoch": 3.015929908403027, + "grad_norm": 0.44304371775862483, + "learning_rate": 2.495461339564579e-06, + "loss": 0.552, + "step": 11361 + }, + { + "epoch": 3.0161954068764105, + "grad_norm": 0.4473936442225, + "learning_rate": 2.4951122122677343e-06, + "loss": 0.5569, + "step": 11362 + }, + { + "epoch": 3.016460905349794, + "grad_norm": 0.44715890733901936, + "learning_rate": 2.4947630850662143e-06, + "loss": 0.5516, + "step": 11363 + }, + { + "epoch": 3.016726403823178, + "grad_norm": 0.4618170394987225, + "learning_rate": 2.494413957966826e-06, + "loss": 0.5698, + "step": 11364 + }, + { + "epoch": 3.016991902296562, + "grad_norm": 0.4388459869937891, + "learning_rate": 2.49406483097638e-06, + "loss": 0.5454, + "step": 11365 + }, + { + "epoch": 3.0172574007699455, + "grad_norm": 0.4475060055813505, + "learning_rate": 2.493715704101684e-06, + "loss": 0.5447, + "step": 11366 + }, + { + "epoch": 3.017522899243329, + "grad_norm": 0.43930536026052397, + "learning_rate": 2.4933665773495467e-06, + "loss": 0.5131, + "step": 11367 + }, + { + "epoch": 3.0177883977167133, + "grad_norm": 0.449186539159897, + "learning_rate": 2.4930174507267776e-06, + "loss": 0.5791, + "step": 11368 + }, + { + "epoch": 3.018053896190097, + "grad_norm": 0.4438609294339896, + "learning_rate": 2.4926683242401865e-06, + "loss": 0.5448, + "step": 11369 + }, + { + "epoch": 3.0183193946634805, + "grad_norm": 0.46441727170252234, + "learning_rate": 2.49231919789658e-06, + "loss": 0.5662, + "step": 11370 + }, + { + "epoch": 3.0185848931368646, + "grad_norm": 0.4287865256742984, + "learning_rate": 2.4919700717027676e-06, + "loss": 0.5133, + "step": 11371 + }, + { + "epoch": 3.0188503916102483, + "grad_norm": 0.4392193479756321, + "learning_rate": 2.4916209456655583e-06, + "loss": 0.547, + "step": 11372 + }, + { + "epoch": 3.019115890083632, + "grad_norm": 0.4531874375286059, + "learning_rate": 2.4912718197917616e-06, + "loss": 0.5501, + "step": 11373 + }, + { + "epoch": 3.019381388557016, + "grad_norm": 0.44694930485001355, + "learning_rate": 2.490922694088187e-06, + "loss": 0.5645, + "step": 11374 + }, + { + "epoch": 3.0196468870303996, + "grad_norm": 0.4488028913189199, + "learning_rate": 2.490573568561641e-06, + "loss": 0.5551, + "step": 11375 + }, + { + "epoch": 3.0199123855037833, + "grad_norm": 0.458656658866499, + "learning_rate": 2.490224443218933e-06, + "loss": 0.5419, + "step": 11376 + }, + { + "epoch": 3.0201778839771674, + "grad_norm": 0.4518699617197473, + "learning_rate": 2.489875318066873e-06, + "loss": 0.5599, + "step": 11377 + }, + { + "epoch": 3.020443382450551, + "grad_norm": 0.4424438856330224, + "learning_rate": 2.4895261931122684e-06, + "loss": 0.5507, + "step": 11378 + }, + { + "epoch": 3.0207088809239346, + "grad_norm": 0.4578868787621122, + "learning_rate": 2.489177068361929e-06, + "loss": 0.5747, + "step": 11379 + }, + { + "epoch": 3.0209743793973183, + "grad_norm": 0.4556148976423923, + "learning_rate": 2.488827943822664e-06, + "loss": 0.5694, + "step": 11380 + }, + { + "epoch": 3.0212398778707024, + "grad_norm": 0.4359781928000787, + "learning_rate": 2.4884788195012804e-06, + "loss": 0.5442, + "step": 11381 + }, + { + "epoch": 3.021505376344086, + "grad_norm": 0.48363372886598793, + "learning_rate": 2.4881296954045884e-06, + "loss": 0.5346, + "step": 11382 + }, + { + "epoch": 3.0217708748174696, + "grad_norm": 0.4604823458742792, + "learning_rate": 2.487780571539396e-06, + "loss": 0.5618, + "step": 11383 + }, + { + "epoch": 3.0220363732908537, + "grad_norm": 0.44826304092119007, + "learning_rate": 2.4874314479125128e-06, + "loss": 0.5477, + "step": 11384 + }, + { + "epoch": 3.0223018717642374, + "grad_norm": 0.46236290842185895, + "learning_rate": 2.487082324530747e-06, + "loss": 0.5435, + "step": 11385 + }, + { + "epoch": 3.022567370237621, + "grad_norm": 0.45563110594913037, + "learning_rate": 2.4867332014009086e-06, + "loss": 0.5423, + "step": 11386 + }, + { + "epoch": 3.022832868711005, + "grad_norm": 0.46984263522281516, + "learning_rate": 2.4863840785298044e-06, + "loss": 0.5657, + "step": 11387 + }, + { + "epoch": 3.0230983671843887, + "grad_norm": 0.4522311850864079, + "learning_rate": 2.4860349559242437e-06, + "loss": 0.531, + "step": 11388 + }, + { + "epoch": 3.0233638656577724, + "grad_norm": 0.4538910231581137, + "learning_rate": 2.4856858335910354e-06, + "loss": 0.5383, + "step": 11389 + }, + { + "epoch": 3.0236293641311565, + "grad_norm": 0.4577613548755966, + "learning_rate": 2.485336711536989e-06, + "loss": 0.5289, + "step": 11390 + }, + { + "epoch": 3.02389486260454, + "grad_norm": 0.45878909876504076, + "learning_rate": 2.484987589768912e-06, + "loss": 0.5734, + "step": 11391 + }, + { + "epoch": 3.0241603610779237, + "grad_norm": 0.45236812046271174, + "learning_rate": 2.4846384682936148e-06, + "loss": 0.5695, + "step": 11392 + }, + { + "epoch": 3.0244258595513074, + "grad_norm": 0.4629286805437994, + "learning_rate": 2.484289347117904e-06, + "loss": 0.544, + "step": 11393 + }, + { + "epoch": 3.0246913580246915, + "grad_norm": 0.44551195360080476, + "learning_rate": 2.48394022624859e-06, + "loss": 0.5525, + "step": 11394 + }, + { + "epoch": 3.024956856498075, + "grad_norm": 0.44159570643075696, + "learning_rate": 2.4835911056924804e-06, + "loss": 0.5629, + "step": 11395 + }, + { + "epoch": 3.0252223549714587, + "grad_norm": 0.46099124349039006, + "learning_rate": 2.4832419854563847e-06, + "loss": 0.5338, + "step": 11396 + }, + { + "epoch": 3.025487853444843, + "grad_norm": 0.4608636196539273, + "learning_rate": 2.482892865547112e-06, + "loss": 0.5572, + "step": 11397 + }, + { + "epoch": 3.0257533519182265, + "grad_norm": 0.4293688698602899, + "learning_rate": 2.4825437459714694e-06, + "loss": 0.5089, + "step": 11398 + }, + { + "epoch": 3.02601885039161, + "grad_norm": 0.4740745299547139, + "learning_rate": 2.4821946267362664e-06, + "loss": 0.562, + "step": 11399 + }, + { + "epoch": 3.026284348864994, + "grad_norm": 0.4548374340272445, + "learning_rate": 2.481845507848312e-06, + "loss": 0.576, + "step": 11400 + }, + { + "epoch": 3.026549847338378, + "grad_norm": 0.460284036389579, + "learning_rate": 2.4814963893144146e-06, + "loss": 0.5576, + "step": 11401 + }, + { + "epoch": 3.0268153458117615, + "grad_norm": 0.46264279862012797, + "learning_rate": 2.4811472711413833e-06, + "loss": 0.5388, + "step": 11402 + }, + { + "epoch": 3.0270808442851456, + "grad_norm": 0.4721359342583942, + "learning_rate": 2.480798153336027e-06, + "loss": 0.5791, + "step": 11403 + }, + { + "epoch": 3.027346342758529, + "grad_norm": 0.4601367627973896, + "learning_rate": 2.4804490359051526e-06, + "loss": 0.5458, + "step": 11404 + }, + { + "epoch": 3.027611841231913, + "grad_norm": 0.44459101852355476, + "learning_rate": 2.48009991885557e-06, + "loss": 0.5747, + "step": 11405 + }, + { + "epoch": 3.0278773397052965, + "grad_norm": 0.45568392275733954, + "learning_rate": 2.479750802194088e-06, + "loss": 0.5931, + "step": 11406 + }, + { + "epoch": 3.0281428381786806, + "grad_norm": 0.4615132918076059, + "learning_rate": 2.479401685927514e-06, + "loss": 0.5743, + "step": 11407 + }, + { + "epoch": 3.028408336652064, + "grad_norm": 0.45084536070410025, + "learning_rate": 2.4790525700626586e-06, + "loss": 0.558, + "step": 11408 + }, + { + "epoch": 3.028673835125448, + "grad_norm": 0.45907923066713635, + "learning_rate": 2.4787034546063296e-06, + "loss": 0.5495, + "step": 11409 + }, + { + "epoch": 3.028939333598832, + "grad_norm": 0.4520536155943597, + "learning_rate": 2.4783543395653347e-06, + "loss": 0.5714, + "step": 11410 + }, + { + "epoch": 3.0292048320722156, + "grad_norm": 0.4461748518055214, + "learning_rate": 2.4780052249464832e-06, + "loss": 0.5776, + "step": 11411 + }, + { + "epoch": 3.029470330545599, + "grad_norm": 0.45685862231758895, + "learning_rate": 2.477656110756584e-06, + "loss": 0.5339, + "step": 11412 + }, + { + "epoch": 3.0297358290189833, + "grad_norm": 0.45441640011848605, + "learning_rate": 2.477306997002445e-06, + "loss": 0.5632, + "step": 11413 + }, + { + "epoch": 3.030001327492367, + "grad_norm": 0.4658014663903666, + "learning_rate": 2.476957883690875e-06, + "loss": 0.4945, + "step": 11414 + }, + { + "epoch": 3.0302668259657506, + "grad_norm": 0.4654265790050967, + "learning_rate": 2.476608770828684e-06, + "loss": 0.5104, + "step": 11415 + }, + { + "epoch": 3.0305323244391347, + "grad_norm": 0.4422412980407415, + "learning_rate": 2.4762596584226782e-06, + "loss": 0.4911, + "step": 11416 + }, + { + "epoch": 3.0307978229125183, + "grad_norm": 0.45044438081551114, + "learning_rate": 2.4759105464796665e-06, + "loss": 0.5421, + "step": 11417 + }, + { + "epoch": 3.031063321385902, + "grad_norm": 0.4524247600756253, + "learning_rate": 2.4755614350064587e-06, + "loss": 0.5797, + "step": 11418 + }, + { + "epoch": 3.0313288198592856, + "grad_norm": 0.46186139446113184, + "learning_rate": 2.4752123240098627e-06, + "loss": 0.5822, + "step": 11419 + }, + { + "epoch": 3.0315943183326697, + "grad_norm": 0.4824886988658856, + "learning_rate": 2.474863213496687e-06, + "loss": 0.5581, + "step": 11420 + }, + { + "epoch": 3.0318598168060533, + "grad_norm": 0.4709144087088842, + "learning_rate": 2.4745141034737415e-06, + "loss": 0.558, + "step": 11421 + }, + { + "epoch": 3.032125315279437, + "grad_norm": 0.46370638437820283, + "learning_rate": 2.4741649939478323e-06, + "loss": 0.5339, + "step": 11422 + }, + { + "epoch": 3.032390813752821, + "grad_norm": 0.4420209722156511, + "learning_rate": 2.473815884925769e-06, + "loss": 0.5648, + "step": 11423 + }, + { + "epoch": 3.0326563122262047, + "grad_norm": 0.453175139656296, + "learning_rate": 2.47346677641436e-06, + "loss": 0.5447, + "step": 11424 + }, + { + "epoch": 3.0329218106995883, + "grad_norm": 0.4503144579551726, + "learning_rate": 2.4731176684204136e-06, + "loss": 0.5451, + "step": 11425 + }, + { + "epoch": 3.0331873091729724, + "grad_norm": 0.4816394255639505, + "learning_rate": 2.472768560950739e-06, + "loss": 0.5749, + "step": 11426 + }, + { + "epoch": 3.033452807646356, + "grad_norm": 0.46793345867577285, + "learning_rate": 2.4724194540121442e-06, + "loss": 0.556, + "step": 11427 + }, + { + "epoch": 3.0337183061197397, + "grad_norm": 0.45962176416738665, + "learning_rate": 2.472070347611437e-06, + "loss": 0.5622, + "step": 11428 + }, + { + "epoch": 3.033983804593124, + "grad_norm": 0.4421272493962858, + "learning_rate": 2.4717212417554266e-06, + "loss": 0.5373, + "step": 11429 + }, + { + "epoch": 3.0342493030665074, + "grad_norm": 0.440114721170632, + "learning_rate": 2.471372136450921e-06, + "loss": 0.5419, + "step": 11430 + }, + { + "epoch": 3.034514801539891, + "grad_norm": 0.4658157923920979, + "learning_rate": 2.471023031704729e-06, + "loss": 0.5345, + "step": 11431 + }, + { + "epoch": 3.034780300013275, + "grad_norm": 0.4673664289382252, + "learning_rate": 2.47067392752366e-06, + "loss": 0.5575, + "step": 11432 + }, + { + "epoch": 3.035045798486659, + "grad_norm": 0.44119260005810235, + "learning_rate": 2.47032482391452e-06, + "loss": 0.567, + "step": 11433 + }, + { + "epoch": 3.0353112969600424, + "grad_norm": 0.4598118725679513, + "learning_rate": 2.4699757208841187e-06, + "loss": 0.5212, + "step": 11434 + }, + { + "epoch": 3.035576795433426, + "grad_norm": 0.4402465974638158, + "learning_rate": 2.4696266184392643e-06, + "loss": 0.5263, + "step": 11435 + }, + { + "epoch": 3.03584229390681, + "grad_norm": 0.46282977718900964, + "learning_rate": 2.469277516586765e-06, + "loss": 0.5728, + "step": 11436 + }, + { + "epoch": 3.036107792380194, + "grad_norm": 0.4268958013636579, + "learning_rate": 2.46892841533343e-06, + "loss": 0.5296, + "step": 11437 + }, + { + "epoch": 3.0363732908535774, + "grad_norm": 0.4512747007998144, + "learning_rate": 2.468579314686067e-06, + "loss": 0.5727, + "step": 11438 + }, + { + "epoch": 3.0366387893269615, + "grad_norm": 0.4526868229880013, + "learning_rate": 2.468230214651484e-06, + "loss": 0.5363, + "step": 11439 + }, + { + "epoch": 3.036904287800345, + "grad_norm": 0.4411217271053345, + "learning_rate": 2.4678811152364894e-06, + "loss": 0.5874, + "step": 11440 + }, + { + "epoch": 3.037169786273729, + "grad_norm": 0.4661813424573556, + "learning_rate": 2.4675320164478917e-06, + "loss": 0.5507, + "step": 11441 + }, + { + "epoch": 3.037435284747113, + "grad_norm": 0.4660355791348338, + "learning_rate": 2.467182918292499e-06, + "loss": 0.5592, + "step": 11442 + }, + { + "epoch": 3.0377007832204965, + "grad_norm": 0.452467469998572, + "learning_rate": 2.4668338207771202e-06, + "loss": 0.5491, + "step": 11443 + }, + { + "epoch": 3.03796628169388, + "grad_norm": 0.4456360523507704, + "learning_rate": 2.4664847239085633e-06, + "loss": 0.5728, + "step": 11444 + }, + { + "epoch": 3.0382317801672643, + "grad_norm": 0.4750361505643859, + "learning_rate": 2.466135627693636e-06, + "loss": 0.5541, + "step": 11445 + }, + { + "epoch": 3.038497278640648, + "grad_norm": 0.44628077167141544, + "learning_rate": 2.4657865321391468e-06, + "loss": 0.5455, + "step": 11446 + }, + { + "epoch": 3.0387627771140315, + "grad_norm": 0.4479812647464731, + "learning_rate": 2.4654374372519037e-06, + "loss": 0.5181, + "step": 11447 + }, + { + "epoch": 3.039028275587415, + "grad_norm": 0.44891300276050117, + "learning_rate": 2.4650883430387154e-06, + "loss": 0.5574, + "step": 11448 + }, + { + "epoch": 3.0392937740607993, + "grad_norm": 0.4682428612653018, + "learning_rate": 2.4647392495063913e-06, + "loss": 0.5194, + "step": 11449 + }, + { + "epoch": 3.039559272534183, + "grad_norm": 0.4589105144797848, + "learning_rate": 2.464390156661737e-06, + "loss": 0.4998, + "step": 11450 + }, + { + "epoch": 3.0398247710075665, + "grad_norm": 0.431689246109316, + "learning_rate": 2.4640410645115612e-06, + "loss": 0.5203, + "step": 11451 + }, + { + "epoch": 3.0400902694809506, + "grad_norm": 0.42166902092057346, + "learning_rate": 2.463691973062673e-06, + "loss": 0.4825, + "step": 11452 + }, + { + "epoch": 3.0403557679543343, + "grad_norm": 0.44382833823487516, + "learning_rate": 2.4633428823218807e-06, + "loss": 0.5229, + "step": 11453 + }, + { + "epoch": 3.040621266427718, + "grad_norm": 0.44025221838001893, + "learning_rate": 2.4629937922959913e-06, + "loss": 0.5533, + "step": 11454 + }, + { + "epoch": 3.040886764901102, + "grad_norm": 0.45506126405588343, + "learning_rate": 2.4626447029918147e-06, + "loss": 0.5439, + "step": 11455 + }, + { + "epoch": 3.0411522633744856, + "grad_norm": 0.46269440553358676, + "learning_rate": 2.462295614416157e-06, + "loss": 0.5532, + "step": 11456 + }, + { + "epoch": 3.0414177618478693, + "grad_norm": 0.4488400571491357, + "learning_rate": 2.461946526575827e-06, + "loss": 0.5529, + "step": 11457 + }, + { + "epoch": 3.0416832603212534, + "grad_norm": 0.45583398523540014, + "learning_rate": 2.461597439477633e-06, + "loss": 0.5316, + "step": 11458 + }, + { + "epoch": 3.041948758794637, + "grad_norm": 0.4692884638864521, + "learning_rate": 2.4612483531283833e-06, + "loss": 0.5671, + "step": 11459 + }, + { + "epoch": 3.0422142572680206, + "grad_norm": 0.4598455806394097, + "learning_rate": 2.460899267534885e-06, + "loss": 0.5229, + "step": 11460 + }, + { + "epoch": 3.0424797557414043, + "grad_norm": 0.4595506967018965, + "learning_rate": 2.460550182703948e-06, + "loss": 0.5593, + "step": 11461 + }, + { + "epoch": 3.0427452542147884, + "grad_norm": 0.45046398542995214, + "learning_rate": 2.460201098642378e-06, + "loss": 0.5641, + "step": 11462 + }, + { + "epoch": 3.043010752688172, + "grad_norm": 0.44732556270990476, + "learning_rate": 2.4598520153569838e-06, + "loss": 0.5609, + "step": 11463 + }, + { + "epoch": 3.0432762511615556, + "grad_norm": 0.4545783203477136, + "learning_rate": 2.459502932854574e-06, + "loss": 0.564, + "step": 11464 + }, + { + "epoch": 3.0435417496349397, + "grad_norm": 0.43955721065108855, + "learning_rate": 2.4591538511419556e-06, + "loss": 0.526, + "step": 11465 + }, + { + "epoch": 3.0438072481083234, + "grad_norm": 0.44704662769064885, + "learning_rate": 2.458804770225938e-06, + "loss": 0.5746, + "step": 11466 + }, + { + "epoch": 3.044072746581707, + "grad_norm": 0.4574944374101059, + "learning_rate": 2.4584556901133286e-06, + "loss": 0.5761, + "step": 11467 + }, + { + "epoch": 3.044338245055091, + "grad_norm": 0.4552512514485266, + "learning_rate": 2.458106610810934e-06, + "loss": 0.5315, + "step": 11468 + }, + { + "epoch": 3.0446037435284747, + "grad_norm": 0.45002115574984586, + "learning_rate": 2.4577575323255636e-06, + "loss": 0.5346, + "step": 11469 + }, + { + "epoch": 3.0448692420018584, + "grad_norm": 0.4635156150751205, + "learning_rate": 2.457408454664024e-06, + "loss": 0.5742, + "step": 11470 + }, + { + "epoch": 3.0451347404752425, + "grad_norm": 0.44604689242087014, + "learning_rate": 2.4570593778331243e-06, + "loss": 0.5518, + "step": 11471 + }, + { + "epoch": 3.045400238948626, + "grad_norm": 0.465332592678385, + "learning_rate": 2.4567103018396724e-06, + "loss": 0.5575, + "step": 11472 + }, + { + "epoch": 3.0456657374220097, + "grad_norm": 0.4409123989373969, + "learning_rate": 2.456361226690475e-06, + "loss": 0.5702, + "step": 11473 + }, + { + "epoch": 3.0459312358953934, + "grad_norm": 0.4528984396863483, + "learning_rate": 2.45601215239234e-06, + "loss": 0.5345, + "step": 11474 + }, + { + "epoch": 3.0461967343687775, + "grad_norm": 0.45021953590273034, + "learning_rate": 2.4556630789520766e-06, + "loss": 0.5802, + "step": 11475 + }, + { + "epoch": 3.046462232842161, + "grad_norm": 0.4456075104486512, + "learning_rate": 2.455314006376491e-06, + "loss": 0.5115, + "step": 11476 + }, + { + "epoch": 3.0467277313155448, + "grad_norm": 0.4557474169109319, + "learning_rate": 2.4549649346723924e-06, + "loss": 0.5363, + "step": 11477 + }, + { + "epoch": 3.046993229788929, + "grad_norm": 0.4598259352250728, + "learning_rate": 2.4546158638465885e-06, + "loss": 0.5433, + "step": 11478 + }, + { + "epoch": 3.0472587282623125, + "grad_norm": 0.4538640448948252, + "learning_rate": 2.4542667939058853e-06, + "loss": 0.5873, + "step": 11479 + }, + { + "epoch": 3.047524226735696, + "grad_norm": 0.4478177261734345, + "learning_rate": 2.4539177248570913e-06, + "loss": 0.5454, + "step": 11480 + }, + { + "epoch": 3.04778972520908, + "grad_norm": 0.44676755689722647, + "learning_rate": 2.453568656707015e-06, + "loss": 0.5212, + "step": 11481 + }, + { + "epoch": 3.048055223682464, + "grad_norm": 0.4482395685456643, + "learning_rate": 2.4532195894624634e-06, + "loss": 0.5576, + "step": 11482 + }, + { + "epoch": 3.0483207221558475, + "grad_norm": 0.4466585304793646, + "learning_rate": 2.4528705231302438e-06, + "loss": 0.5433, + "step": 11483 + }, + { + "epoch": 3.0485862206292316, + "grad_norm": 0.43974039635877255, + "learning_rate": 2.452521457717166e-06, + "loss": 0.5085, + "step": 11484 + }, + { + "epoch": 3.048851719102615, + "grad_norm": 0.4368953045821346, + "learning_rate": 2.452172393230035e-06, + "loss": 0.5348, + "step": 11485 + }, + { + "epoch": 3.049117217575999, + "grad_norm": 0.4488894099768197, + "learning_rate": 2.4518233296756593e-06, + "loss": 0.5718, + "step": 11486 + }, + { + "epoch": 3.049382716049383, + "grad_norm": 0.4466445759825953, + "learning_rate": 2.4514742670608467e-06, + "loss": 0.5563, + "step": 11487 + }, + { + "epoch": 3.0496482145227666, + "grad_norm": 0.4453188661397635, + "learning_rate": 2.451125205392405e-06, + "loss": 0.5522, + "step": 11488 + }, + { + "epoch": 3.04991371299615, + "grad_norm": 0.45543942893167844, + "learning_rate": 2.4507761446771408e-06, + "loss": 0.5552, + "step": 11489 + }, + { + "epoch": 3.050179211469534, + "grad_norm": 0.45348011277465744, + "learning_rate": 2.4504270849218635e-06, + "loss": 0.5706, + "step": 11490 + }, + { + "epoch": 3.050444709942918, + "grad_norm": 0.4515964587208077, + "learning_rate": 2.4500780261333782e-06, + "loss": 0.5252, + "step": 11491 + }, + { + "epoch": 3.0507102084163016, + "grad_norm": 0.4469369034678833, + "learning_rate": 2.449728968318494e-06, + "loss": 0.575, + "step": 11492 + }, + { + "epoch": 3.0509757068896852, + "grad_norm": 0.45225586994229167, + "learning_rate": 2.4493799114840182e-06, + "loss": 0.5579, + "step": 11493 + }, + { + "epoch": 3.0512412053630693, + "grad_norm": 0.446453571558936, + "learning_rate": 2.449030855636758e-06, + "loss": 0.5491, + "step": 11494 + }, + { + "epoch": 3.051506703836453, + "grad_norm": 0.4444792187149838, + "learning_rate": 2.4486818007835216e-06, + "loss": 0.5004, + "step": 11495 + }, + { + "epoch": 3.0517722023098366, + "grad_norm": 0.45411176555128274, + "learning_rate": 2.448332746931115e-06, + "loss": 0.5378, + "step": 11496 + }, + { + "epoch": 3.0520377007832207, + "grad_norm": 0.44553150626962895, + "learning_rate": 2.447983694086346e-06, + "loss": 0.5182, + "step": 11497 + }, + { + "epoch": 3.0523031992566043, + "grad_norm": 0.46040204880343016, + "learning_rate": 2.447634642256023e-06, + "loss": 0.5525, + "step": 11498 + }, + { + "epoch": 3.052568697729988, + "grad_norm": 0.4352916696340324, + "learning_rate": 2.447285591446952e-06, + "loss": 0.5566, + "step": 11499 + }, + { + "epoch": 3.052834196203372, + "grad_norm": 0.44962685288740484, + "learning_rate": 2.4469365416659412e-06, + "loss": 0.5099, + "step": 11500 + }, + { + "epoch": 3.0530996946767557, + "grad_norm": 0.4467704702648933, + "learning_rate": 2.4465874929197985e-06, + "loss": 0.5629, + "step": 11501 + }, + { + "epoch": 3.0533651931501393, + "grad_norm": 0.4671102020198274, + "learning_rate": 2.4462384452153297e-06, + "loss": 0.5634, + "step": 11502 + }, + { + "epoch": 3.053630691623523, + "grad_norm": 0.4478606016998195, + "learning_rate": 2.4458893985593433e-06, + "loss": 0.5387, + "step": 11503 + }, + { + "epoch": 3.053896190096907, + "grad_norm": 0.44633057224235834, + "learning_rate": 2.4455403529586456e-06, + "loss": 0.5572, + "step": 11504 + }, + { + "epoch": 3.0541616885702907, + "grad_norm": 0.46042069445424316, + "learning_rate": 2.4451913084200446e-06, + "loss": 0.5697, + "step": 11505 + }, + { + "epoch": 3.0544271870436743, + "grad_norm": 0.4620157840088359, + "learning_rate": 2.444842264950347e-06, + "loss": 0.5137, + "step": 11506 + }, + { + "epoch": 3.0546926855170584, + "grad_norm": 0.4537792633119077, + "learning_rate": 2.444493222556362e-06, + "loss": 0.5719, + "step": 11507 + }, + { + "epoch": 3.054958183990442, + "grad_norm": 0.45516943355134776, + "learning_rate": 2.444144181244893e-06, + "loss": 0.4969, + "step": 11508 + }, + { + "epoch": 3.0552236824638257, + "grad_norm": 0.4408842774101969, + "learning_rate": 2.4437951410227495e-06, + "loss": 0.5579, + "step": 11509 + }, + { + "epoch": 3.05548918093721, + "grad_norm": 0.4363983321233346, + "learning_rate": 2.4434461018967386e-06, + "loss": 0.491, + "step": 11510 + }, + { + "epoch": 3.0557546794105934, + "grad_norm": 0.43490518177426785, + "learning_rate": 2.4430970638736675e-06, + "loss": 0.5576, + "step": 11511 + }, + { + "epoch": 3.056020177883977, + "grad_norm": 0.4473062649657584, + "learning_rate": 2.4427480269603425e-06, + "loss": 0.5505, + "step": 11512 + }, + { + "epoch": 3.056285676357361, + "grad_norm": 0.4390747360393103, + "learning_rate": 2.4423989911635726e-06, + "loss": 0.5124, + "step": 11513 + }, + { + "epoch": 3.056551174830745, + "grad_norm": 0.44516121163425104, + "learning_rate": 2.4420499564901623e-06, + "loss": 0.5289, + "step": 11514 + }, + { + "epoch": 3.0568166733041284, + "grad_norm": 0.4552494495428004, + "learning_rate": 2.4417009229469193e-06, + "loss": 0.5275, + "step": 11515 + }, + { + "epoch": 3.057082171777512, + "grad_norm": 0.4406367144434546, + "learning_rate": 2.4413518905406516e-06, + "loss": 0.5493, + "step": 11516 + }, + { + "epoch": 3.057347670250896, + "grad_norm": 0.4515884422774925, + "learning_rate": 2.4410028592781655e-06, + "loss": 0.5156, + "step": 11517 + }, + { + "epoch": 3.05761316872428, + "grad_norm": 0.4518130833544343, + "learning_rate": 2.4406538291662682e-06, + "loss": 0.542, + "step": 11518 + }, + { + "epoch": 3.0578786671976634, + "grad_norm": 0.4440896476881966, + "learning_rate": 2.4403048002117665e-06, + "loss": 0.5021, + "step": 11519 + }, + { + "epoch": 3.0581441656710475, + "grad_norm": 0.4464073705498716, + "learning_rate": 2.439955772421467e-06, + "loss": 0.5796, + "step": 11520 + }, + { + "epoch": 3.058409664144431, + "grad_norm": 0.4624404475540543, + "learning_rate": 2.439606745802177e-06, + "loss": 0.5248, + "step": 11521 + }, + { + "epoch": 3.058675162617815, + "grad_norm": 0.4380193380351636, + "learning_rate": 2.439257720360704e-06, + "loss": 0.553, + "step": 11522 + }, + { + "epoch": 3.058940661091199, + "grad_norm": 0.4601663339204156, + "learning_rate": 2.4389086961038533e-06, + "loss": 0.5369, + "step": 11523 + }, + { + "epoch": 3.0592061595645825, + "grad_norm": 0.44994543785756863, + "learning_rate": 2.438559673038434e-06, + "loss": 0.5589, + "step": 11524 + }, + { + "epoch": 3.059471658037966, + "grad_norm": 0.46132521521794523, + "learning_rate": 2.43821065117125e-06, + "loss": 0.5522, + "step": 11525 + }, + { + "epoch": 3.0597371565113503, + "grad_norm": 0.49299844006221116, + "learning_rate": 2.4378616305091103e-06, + "loss": 0.5427, + "step": 11526 + }, + { + "epoch": 3.060002654984734, + "grad_norm": 0.45914739592182297, + "learning_rate": 2.4375126110588206e-06, + "loss": 0.5724, + "step": 11527 + }, + { + "epoch": 3.0602681534581175, + "grad_norm": 0.43819681858109005, + "learning_rate": 2.437163592827188e-06, + "loss": 0.5789, + "step": 11528 + }, + { + "epoch": 3.060533651931501, + "grad_norm": 0.4503792571486426, + "learning_rate": 2.4368145758210186e-06, + "loss": 0.5396, + "step": 11529 + }, + { + "epoch": 3.0607991504048853, + "grad_norm": 0.4594128537492662, + "learning_rate": 2.436465560047122e-06, + "loss": 0.544, + "step": 11530 + }, + { + "epoch": 3.061064648878269, + "grad_norm": 0.43268266209467476, + "learning_rate": 2.436116545512301e-06, + "loss": 0.5376, + "step": 11531 + }, + { + "epoch": 3.0613301473516525, + "grad_norm": 0.46559244937416505, + "learning_rate": 2.435767532223363e-06, + "loss": 0.5708, + "step": 11532 + }, + { + "epoch": 3.0615956458250366, + "grad_norm": 0.45138391853619286, + "learning_rate": 2.435418520187116e-06, + "loss": 0.5375, + "step": 11533 + }, + { + "epoch": 3.0618611442984203, + "grad_norm": 0.44885863464374204, + "learning_rate": 2.435069509410366e-06, + "loss": 0.5698, + "step": 11534 + }, + { + "epoch": 3.062126642771804, + "grad_norm": 0.44409449564925874, + "learning_rate": 2.4347204998999197e-06, + "loss": 0.5367, + "step": 11535 + }, + { + "epoch": 3.062392141245188, + "grad_norm": 0.45114788788934496, + "learning_rate": 2.434371491662584e-06, + "loss": 0.5324, + "step": 11536 + }, + { + "epoch": 3.0626576397185716, + "grad_norm": 0.45492169583577613, + "learning_rate": 2.434022484705164e-06, + "loss": 0.5946, + "step": 11537 + }, + { + "epoch": 3.0629231381919553, + "grad_norm": 0.44846522809331124, + "learning_rate": 2.433673479034467e-06, + "loss": 0.5552, + "step": 11538 + }, + { + "epoch": 3.0631886366653394, + "grad_norm": 0.46785575414042085, + "learning_rate": 2.4333244746573002e-06, + "loss": 0.5712, + "step": 11539 + }, + { + "epoch": 3.063454135138723, + "grad_norm": 0.46627529599984335, + "learning_rate": 2.432975471580469e-06, + "loss": 0.5668, + "step": 11540 + }, + { + "epoch": 3.0637196336121066, + "grad_norm": 0.4629146126354187, + "learning_rate": 2.4326264698107804e-06, + "loss": 0.5241, + "step": 11541 + }, + { + "epoch": 3.0639851320854907, + "grad_norm": 0.446532924828594, + "learning_rate": 2.432277469355042e-06, + "loss": 0.5326, + "step": 11542 + }, + { + "epoch": 3.0642506305588744, + "grad_norm": 0.44857166204253596, + "learning_rate": 2.4319284702200567e-06, + "loss": 0.545, + "step": 11543 + }, + { + "epoch": 3.064516129032258, + "grad_norm": 0.4684434820632688, + "learning_rate": 2.4315794724126336e-06, + "loss": 0.5306, + "step": 11544 + }, + { + "epoch": 3.0647816275056416, + "grad_norm": 0.4328742834057493, + "learning_rate": 2.431230475939578e-06, + "loss": 0.5201, + "step": 11545 + }, + { + "epoch": 3.0650471259790257, + "grad_norm": 0.4400257722068868, + "learning_rate": 2.4308814808076968e-06, + "loss": 0.5481, + "step": 11546 + }, + { + "epoch": 3.0653126244524094, + "grad_norm": 0.4578908908132894, + "learning_rate": 2.430532487023796e-06, + "loss": 0.5853, + "step": 11547 + }, + { + "epoch": 3.065578122925793, + "grad_norm": 0.45167266866409145, + "learning_rate": 2.4301834945946816e-06, + "loss": 0.5263, + "step": 11548 + }, + { + "epoch": 3.065843621399177, + "grad_norm": 0.45622879951136924, + "learning_rate": 2.42983450352716e-06, + "loss": 0.5612, + "step": 11549 + }, + { + "epoch": 3.0661091198725607, + "grad_norm": 0.4394482852577356, + "learning_rate": 2.4294855138280374e-06, + "loss": 0.5554, + "step": 11550 + }, + { + "epoch": 3.0663746183459444, + "grad_norm": 0.4488521127691896, + "learning_rate": 2.42913652550412e-06, + "loss": 0.5526, + "step": 11551 + }, + { + "epoch": 3.0666401168193285, + "grad_norm": 0.45438099409443067, + "learning_rate": 2.4287875385622137e-06, + "loss": 0.522, + "step": 11552 + }, + { + "epoch": 3.066905615292712, + "grad_norm": 0.44363928815443066, + "learning_rate": 2.428438553009126e-06, + "loss": 0.5281, + "step": 11553 + }, + { + "epoch": 3.0671711137660957, + "grad_norm": 0.4728075625948259, + "learning_rate": 2.42808956885166e-06, + "loss": 0.5897, + "step": 11554 + }, + { + "epoch": 3.06743661223948, + "grad_norm": 0.46362124702313573, + "learning_rate": 2.4277405860966237e-06, + "loss": 0.5903, + "step": 11555 + }, + { + "epoch": 3.0677021107128635, + "grad_norm": 0.45956941663315426, + "learning_rate": 2.427391604750823e-06, + "loss": 0.5339, + "step": 11556 + }, + { + "epoch": 3.067967609186247, + "grad_norm": 0.46096392490348004, + "learning_rate": 2.427042624821064e-06, + "loss": 0.5242, + "step": 11557 + }, + { + "epoch": 3.0682331076596308, + "grad_norm": 0.4506014384219963, + "learning_rate": 2.4266936463141523e-06, + "loss": 0.5642, + "step": 11558 + }, + { + "epoch": 3.068498606133015, + "grad_norm": 0.4445336789356763, + "learning_rate": 2.4263446692368957e-06, + "loss": 0.5364, + "step": 11559 + }, + { + "epoch": 3.0687641046063985, + "grad_norm": 0.4597987065181883, + "learning_rate": 2.4259956935960966e-06, + "loss": 0.5486, + "step": 11560 + }, + { + "epoch": 3.069029603079782, + "grad_norm": 0.4767547964968403, + "learning_rate": 2.4256467193985628e-06, + "loss": 0.5474, + "step": 11561 + }, + { + "epoch": 3.069295101553166, + "grad_norm": 0.4518195364294145, + "learning_rate": 2.4252977466511e-06, + "loss": 0.5378, + "step": 11562 + }, + { + "epoch": 3.06956060002655, + "grad_norm": 0.4335076815977469, + "learning_rate": 2.424948775360514e-06, + "loss": 0.5293, + "step": 11563 + }, + { + "epoch": 3.0698260984999335, + "grad_norm": 0.45401332283368523, + "learning_rate": 2.424599805533611e-06, + "loss": 0.5532, + "step": 11564 + }, + { + "epoch": 3.0700915969733176, + "grad_norm": 0.4579603467196406, + "learning_rate": 2.424250837177197e-06, + "loss": 0.5403, + "step": 11565 + }, + { + "epoch": 3.070357095446701, + "grad_norm": 0.47083952294624326, + "learning_rate": 2.4239018702980756e-06, + "loss": 0.5638, + "step": 11566 + }, + { + "epoch": 3.070622593920085, + "grad_norm": 0.44780449709131237, + "learning_rate": 2.423552904903055e-06, + "loss": 0.4998, + "step": 11567 + }, + { + "epoch": 3.070888092393469, + "grad_norm": 0.44363813725461565, + "learning_rate": 2.4232039409989397e-06, + "loss": 0.538, + "step": 11568 + }, + { + "epoch": 3.0711535908668526, + "grad_norm": 0.4614748067148487, + "learning_rate": 2.422854978592535e-06, + "loss": 0.5662, + "step": 11569 + }, + { + "epoch": 3.071419089340236, + "grad_norm": 0.44742343949730995, + "learning_rate": 2.4225060176906493e-06, + "loss": 0.5652, + "step": 11570 + }, + { + "epoch": 3.07168458781362, + "grad_norm": 0.45955378399355984, + "learning_rate": 2.422157058300084e-06, + "loss": 0.5279, + "step": 11571 + }, + { + "epoch": 3.071950086287004, + "grad_norm": 0.4379411837210556, + "learning_rate": 2.4218081004276474e-06, + "loss": 0.525, + "step": 11572 + }, + { + "epoch": 3.0722155847603876, + "grad_norm": 0.4390836364029525, + "learning_rate": 2.4214591440801438e-06, + "loss": 0.5458, + "step": 11573 + }, + { + "epoch": 3.0724810832337712, + "grad_norm": 0.4630505390553413, + "learning_rate": 2.4211101892643793e-06, + "loss": 0.5316, + "step": 11574 + }, + { + "epoch": 3.0727465817071553, + "grad_norm": 0.4421934962279115, + "learning_rate": 2.4207612359871587e-06, + "loss": 0.5079, + "step": 11575 + }, + { + "epoch": 3.073012080180539, + "grad_norm": 0.43016732646785627, + "learning_rate": 2.4204122842552896e-06, + "loss": 0.4962, + "step": 11576 + }, + { + "epoch": 3.0732775786539226, + "grad_norm": 0.4426651867660987, + "learning_rate": 2.4200633340755745e-06, + "loss": 0.5519, + "step": 11577 + }, + { + "epoch": 3.0735430771273067, + "grad_norm": 0.4498533546661004, + "learning_rate": 2.419714385454821e-06, + "loss": 0.5424, + "step": 11578 + }, + { + "epoch": 3.0738085756006903, + "grad_norm": 0.4484563663286284, + "learning_rate": 2.419365438399833e-06, + "loss": 0.5365, + "step": 11579 + }, + { + "epoch": 3.074074074074074, + "grad_norm": 0.43730048726453674, + "learning_rate": 2.419016492917416e-06, + "loss": 0.559, + "step": 11580 + }, + { + "epoch": 3.074339572547458, + "grad_norm": 0.4633450421778798, + "learning_rate": 2.418667549014376e-06, + "loss": 0.5554, + "step": 11581 + }, + { + "epoch": 3.0746050710208417, + "grad_norm": 0.45532692397865465, + "learning_rate": 2.4183186066975182e-06, + "loss": 0.5612, + "step": 11582 + }, + { + "epoch": 3.0748705694942253, + "grad_norm": 0.4566625176032235, + "learning_rate": 2.4179696659736474e-06, + "loss": 0.5285, + "step": 11583 + }, + { + "epoch": 3.075136067967609, + "grad_norm": 0.4431252456370272, + "learning_rate": 2.417620726849569e-06, + "loss": 0.5742, + "step": 11584 + }, + { + "epoch": 3.075401566440993, + "grad_norm": 0.4505934205915547, + "learning_rate": 2.417271789332088e-06, + "loss": 0.5351, + "step": 11585 + }, + { + "epoch": 3.0756670649143767, + "grad_norm": 0.4500751514309848, + "learning_rate": 2.4169228534280094e-06, + "loss": 0.5317, + "step": 11586 + }, + { + "epoch": 3.0759325633877603, + "grad_norm": 0.4659284138188159, + "learning_rate": 2.416573919144139e-06, + "loss": 0.5175, + "step": 11587 + }, + { + "epoch": 3.0761980618611444, + "grad_norm": 0.4493922908947943, + "learning_rate": 2.416224986487282e-06, + "loss": 0.58, + "step": 11588 + }, + { + "epoch": 3.076463560334528, + "grad_norm": 0.4667893481498818, + "learning_rate": 2.4158760554642423e-06, + "loss": 0.6034, + "step": 11589 + }, + { + "epoch": 3.0767290588079117, + "grad_norm": 0.44960921373662366, + "learning_rate": 2.4155271260818254e-06, + "loss": 0.5262, + "step": 11590 + }, + { + "epoch": 3.076994557281296, + "grad_norm": 0.47163116028064456, + "learning_rate": 2.415178198346836e-06, + "loss": 0.5805, + "step": 11591 + }, + { + "epoch": 3.0772600557546794, + "grad_norm": 0.45691547889692313, + "learning_rate": 2.4148292722660798e-06, + "loss": 0.5693, + "step": 11592 + }, + { + "epoch": 3.077525554228063, + "grad_norm": 0.4543799342822913, + "learning_rate": 2.414480347846362e-06, + "loss": 0.5139, + "step": 11593 + }, + { + "epoch": 3.077791052701447, + "grad_norm": 0.44919058844425375, + "learning_rate": 2.414131425094486e-06, + "loss": 0.5377, + "step": 11594 + }, + { + "epoch": 3.078056551174831, + "grad_norm": 0.45385467004272423, + "learning_rate": 2.413782504017257e-06, + "loss": 0.5333, + "step": 11595 + }, + { + "epoch": 3.0783220496482144, + "grad_norm": 0.456646600906358, + "learning_rate": 2.413433584621481e-06, + "loss": 0.5529, + "step": 11596 + }, + { + "epoch": 3.0785875481215985, + "grad_norm": 0.4628491841440177, + "learning_rate": 2.413084666913962e-06, + "loss": 0.552, + "step": 11597 + }, + { + "epoch": 3.078853046594982, + "grad_norm": 0.43762217548632976, + "learning_rate": 2.4127357509015045e-06, + "loss": 0.5567, + "step": 11598 + }, + { + "epoch": 3.079118545068366, + "grad_norm": 0.442194338312714, + "learning_rate": 2.412386836590914e-06, + "loss": 0.536, + "step": 11599 + }, + { + "epoch": 3.0793840435417494, + "grad_norm": 0.43828667231465074, + "learning_rate": 2.4120379239889944e-06, + "loss": 0.5367, + "step": 11600 + }, + { + "epoch": 3.0796495420151335, + "grad_norm": 0.44867218626389876, + "learning_rate": 2.4116890131025503e-06, + "loss": 0.5749, + "step": 11601 + }, + { + "epoch": 3.079915040488517, + "grad_norm": 0.4601512494034246, + "learning_rate": 2.4113401039383867e-06, + "loss": 0.5599, + "step": 11602 + }, + { + "epoch": 3.080180538961901, + "grad_norm": 0.43663244776695664, + "learning_rate": 2.410991196503309e-06, + "loss": 0.5661, + "step": 11603 + }, + { + "epoch": 3.080446037435285, + "grad_norm": 0.4479055628569839, + "learning_rate": 2.41064229080412e-06, + "loss": 0.6069, + "step": 11604 + }, + { + "epoch": 3.0807115359086685, + "grad_norm": 0.4575099913725818, + "learning_rate": 2.4102933868476263e-06, + "loss": 0.562, + "step": 11605 + }, + { + "epoch": 3.080977034382052, + "grad_norm": 0.45618847925682265, + "learning_rate": 2.4099444846406305e-06, + "loss": 0.5676, + "step": 11606 + }, + { + "epoch": 3.0812425328554363, + "grad_norm": 0.4532304945237393, + "learning_rate": 2.4095955841899372e-06, + "loss": 0.5424, + "step": 11607 + }, + { + "epoch": 3.08150803132882, + "grad_norm": 0.4529452313010965, + "learning_rate": 2.409246685502352e-06, + "loss": 0.544, + "step": 11608 + }, + { + "epoch": 3.0817735298022035, + "grad_norm": 0.4481273594910094, + "learning_rate": 2.408897788584678e-06, + "loss": 0.5476, + "step": 11609 + }, + { + "epoch": 3.0820390282755876, + "grad_norm": 0.4424810980465838, + "learning_rate": 2.408548893443721e-06, + "loss": 0.5383, + "step": 11610 + }, + { + "epoch": 3.0823045267489713, + "grad_norm": 0.4502165308519758, + "learning_rate": 2.408200000086284e-06, + "loss": 0.5416, + "step": 11611 + }, + { + "epoch": 3.082570025222355, + "grad_norm": 0.44556179145103497, + "learning_rate": 2.407851108519172e-06, + "loss": 0.5431, + "step": 11612 + }, + { + "epoch": 3.0828355236957385, + "grad_norm": 0.4523120791059323, + "learning_rate": 2.407502218749189e-06, + "loss": 0.5523, + "step": 11613 + }, + { + "epoch": 3.0831010221691226, + "grad_norm": 0.4489360115967645, + "learning_rate": 2.4071533307831386e-06, + "loss": 0.5901, + "step": 11614 + }, + { + "epoch": 3.0833665206425063, + "grad_norm": 0.4602028790946236, + "learning_rate": 2.406804444627826e-06, + "loss": 0.5387, + "step": 11615 + }, + { + "epoch": 3.08363201911589, + "grad_norm": 0.4352331176447486, + "learning_rate": 2.4064555602900557e-06, + "loss": 0.5346, + "step": 11616 + }, + { + "epoch": 3.083897517589274, + "grad_norm": 0.4685939320709742, + "learning_rate": 2.4061066777766302e-06, + "loss": 0.5455, + "step": 11617 + }, + { + "epoch": 3.0841630160626576, + "grad_norm": 0.4553900045941404, + "learning_rate": 2.4057577970943544e-06, + "loss": 0.5548, + "step": 11618 + }, + { + "epoch": 3.0844285145360413, + "grad_norm": 0.45309101627219217, + "learning_rate": 2.4054089182500325e-06, + "loss": 0.539, + "step": 11619 + }, + { + "epoch": 3.0846940130094254, + "grad_norm": 0.4555084307680555, + "learning_rate": 2.405060041250467e-06, + "loss": 0.5423, + "step": 11620 + }, + { + "epoch": 3.084959511482809, + "grad_norm": 0.44265387932136907, + "learning_rate": 2.4047111661024644e-06, + "loss": 0.5475, + "step": 11621 + }, + { + "epoch": 3.0852250099561926, + "grad_norm": 0.4573892909316085, + "learning_rate": 2.4043622928128283e-06, + "loss": 0.5436, + "step": 11622 + }, + { + "epoch": 3.0854905084295767, + "grad_norm": 0.46787178157115983, + "learning_rate": 2.4040134213883604e-06, + "loss": 0.6086, + "step": 11623 + }, + { + "epoch": 3.0857560069029604, + "grad_norm": 0.44104109923536905, + "learning_rate": 2.403664551835866e-06, + "loss": 0.5712, + "step": 11624 + }, + { + "epoch": 3.086021505376344, + "grad_norm": 0.45532908072829414, + "learning_rate": 2.4033156841621484e-06, + "loss": 0.559, + "step": 11625 + }, + { + "epoch": 3.0862870038497277, + "grad_norm": 0.4511078639346571, + "learning_rate": 2.402966818374012e-06, + "loss": 0.5619, + "step": 11626 + }, + { + "epoch": 3.0865525023231117, + "grad_norm": 0.45810234006875333, + "learning_rate": 2.40261795447826e-06, + "loss": 0.5606, + "step": 11627 + }, + { + "epoch": 3.0868180007964954, + "grad_norm": 0.45580471688257507, + "learning_rate": 2.402269092481697e-06, + "loss": 0.5184, + "step": 11628 + }, + { + "epoch": 3.087083499269879, + "grad_norm": 0.4471135425014241, + "learning_rate": 2.4019202323911255e-06, + "loss": 0.5235, + "step": 11629 + }, + { + "epoch": 3.087348997743263, + "grad_norm": 0.4528342444019298, + "learning_rate": 2.4015713742133495e-06, + "loss": 0.5489, + "step": 11630 + }, + { + "epoch": 3.0876144962166467, + "grad_norm": 0.47491126730620253, + "learning_rate": 2.4012225179551726e-06, + "loss": 0.5817, + "step": 11631 + }, + { + "epoch": 3.0878799946900304, + "grad_norm": 0.436825252540165, + "learning_rate": 2.4008736636233985e-06, + "loss": 0.532, + "step": 11632 + }, + { + "epoch": 3.0881454931634145, + "grad_norm": 0.4537553387947287, + "learning_rate": 2.400524811224831e-06, + "loss": 0.5413, + "step": 11633 + }, + { + "epoch": 3.088410991636798, + "grad_norm": 0.45491351212053394, + "learning_rate": 2.400175960766274e-06, + "loss": 0.5718, + "step": 11634 + }, + { + "epoch": 3.0886764901101817, + "grad_norm": 0.4725039325195927, + "learning_rate": 2.3998271122545287e-06, + "loss": 0.5829, + "step": 11635 + }, + { + "epoch": 3.088941988583566, + "grad_norm": 0.45135817280529056, + "learning_rate": 2.3994782656964007e-06, + "loss": 0.5453, + "step": 11636 + }, + { + "epoch": 3.0892074870569495, + "grad_norm": 0.44006000910393317, + "learning_rate": 2.3991294210986924e-06, + "loss": 0.5502, + "step": 11637 + }, + { + "epoch": 3.089472985530333, + "grad_norm": 0.4545602481533639, + "learning_rate": 2.398780578468207e-06, + "loss": 0.5488, + "step": 11638 + }, + { + "epoch": 3.0897384840037168, + "grad_norm": 0.44864899029729133, + "learning_rate": 2.398431737811749e-06, + "loss": 0.5697, + "step": 11639 + }, + { + "epoch": 3.090003982477101, + "grad_norm": 0.45269264897709593, + "learning_rate": 2.3980828991361206e-06, + "loss": 0.5524, + "step": 11640 + }, + { + "epoch": 3.0902694809504845, + "grad_norm": 0.4386700168310677, + "learning_rate": 2.3977340624481245e-06, + "loss": 0.5677, + "step": 11641 + }, + { + "epoch": 3.090534979423868, + "grad_norm": 0.4546090687187948, + "learning_rate": 2.397385227754565e-06, + "loss": 0.5476, + "step": 11642 + }, + { + "epoch": 3.090800477897252, + "grad_norm": 0.44710709300459756, + "learning_rate": 2.397036395062245e-06, + "loss": 0.5243, + "step": 11643 + }, + { + "epoch": 3.091065976370636, + "grad_norm": 0.4458780256927925, + "learning_rate": 2.396687564377967e-06, + "loss": 0.5855, + "step": 11644 + }, + { + "epoch": 3.0913314748440195, + "grad_norm": 0.46020534861082846, + "learning_rate": 2.3963387357085353e-06, + "loss": 0.5749, + "step": 11645 + }, + { + "epoch": 3.0915969733174036, + "grad_norm": 0.4543410836247079, + "learning_rate": 2.395989909060751e-06, + "loss": 0.5385, + "step": 11646 + }, + { + "epoch": 3.091862471790787, + "grad_norm": 0.4629066868969013, + "learning_rate": 2.3956410844414185e-06, + "loss": 0.5425, + "step": 11647 + }, + { + "epoch": 3.092127970264171, + "grad_norm": 0.44968546688436994, + "learning_rate": 2.3952922618573404e-06, + "loss": 0.5397, + "step": 11648 + }, + { + "epoch": 3.092393468737555, + "grad_norm": 0.4644782799429297, + "learning_rate": 2.3949434413153195e-06, + "loss": 0.5276, + "step": 11649 + }, + { + "epoch": 3.0926589672109386, + "grad_norm": 0.44689705541425445, + "learning_rate": 2.394594622822159e-06, + "loss": 0.542, + "step": 11650 + }, + { + "epoch": 3.092924465684322, + "grad_norm": 0.46193158441804544, + "learning_rate": 2.394245806384662e-06, + "loss": 0.544, + "step": 11651 + }, + { + "epoch": 3.0931899641577063, + "grad_norm": 0.4511477857575874, + "learning_rate": 2.39389699200963e-06, + "loss": 0.5545, + "step": 11652 + }, + { + "epoch": 3.09345546263109, + "grad_norm": 0.45913925280558493, + "learning_rate": 2.393548179703866e-06, + "loss": 0.5307, + "step": 11653 + }, + { + "epoch": 3.0937209611044736, + "grad_norm": 0.4584177238231391, + "learning_rate": 2.3931993694741737e-06, + "loss": 0.5492, + "step": 11654 + }, + { + "epoch": 3.0939864595778572, + "grad_norm": 0.44236769846587864, + "learning_rate": 2.3928505613273546e-06, + "loss": 0.5414, + "step": 11655 + }, + { + "epoch": 3.0942519580512413, + "grad_norm": 0.46533226353690077, + "learning_rate": 2.3925017552702125e-06, + "loss": 0.525, + "step": 11656 + }, + { + "epoch": 3.094517456524625, + "grad_norm": 0.44556117700918196, + "learning_rate": 2.3921529513095497e-06, + "loss": 0.5265, + "step": 11657 + }, + { + "epoch": 3.0947829549980086, + "grad_norm": 0.46380813219517747, + "learning_rate": 2.3918041494521676e-06, + "loss": 0.5134, + "step": 11658 + }, + { + "epoch": 3.0950484534713927, + "grad_norm": 0.4439846358567936, + "learning_rate": 2.39145534970487e-06, + "loss": 0.5587, + "step": 11659 + }, + { + "epoch": 3.0953139519447763, + "grad_norm": 0.4755854379493961, + "learning_rate": 2.3911065520744583e-06, + "loss": 0.5653, + "step": 11660 + }, + { + "epoch": 3.09557945041816, + "grad_norm": 0.45121226205263937, + "learning_rate": 2.390757756567736e-06, + "loss": 0.5548, + "step": 11661 + }, + { + "epoch": 3.095844948891544, + "grad_norm": 0.45490569864651714, + "learning_rate": 2.3904089631915044e-06, + "loss": 0.5551, + "step": 11662 + }, + { + "epoch": 3.0961104473649277, + "grad_norm": 0.4496392386047114, + "learning_rate": 2.3900601719525676e-06, + "loss": 0.5316, + "step": 11663 + }, + { + "epoch": 3.0963759458383113, + "grad_norm": 0.46330358960693546, + "learning_rate": 2.3897113828577256e-06, + "loss": 0.5392, + "step": 11664 + }, + { + "epoch": 3.0966414443116954, + "grad_norm": 0.46344527653492346, + "learning_rate": 2.3893625959137816e-06, + "loss": 0.5749, + "step": 11665 + }, + { + "epoch": 3.096906942785079, + "grad_norm": 0.4533480070358089, + "learning_rate": 2.3890138111275375e-06, + "loss": 0.5191, + "step": 11666 + }, + { + "epoch": 3.0971724412584627, + "grad_norm": 0.4546032152339237, + "learning_rate": 2.3886650285057963e-06, + "loss": 0.5772, + "step": 11667 + }, + { + "epoch": 3.0974379397318463, + "grad_norm": 0.4460242801885309, + "learning_rate": 2.3883162480553605e-06, + "loss": 0.5293, + "step": 11668 + }, + { + "epoch": 3.0977034382052304, + "grad_norm": 0.45026723526569723, + "learning_rate": 2.38796746978303e-06, + "loss": 0.5721, + "step": 11669 + }, + { + "epoch": 3.097968936678614, + "grad_norm": 0.44958993970359756, + "learning_rate": 2.387618693695609e-06, + "loss": 0.5762, + "step": 11670 + }, + { + "epoch": 3.0982344351519977, + "grad_norm": 0.45386416511552574, + "learning_rate": 2.387269919799898e-06, + "loss": 0.5449, + "step": 11671 + }, + { + "epoch": 3.098499933625382, + "grad_norm": 0.467705826561258, + "learning_rate": 2.3869211481027e-06, + "loss": 0.5568, + "step": 11672 + }, + { + "epoch": 3.0987654320987654, + "grad_norm": 0.45373042329394825, + "learning_rate": 2.386572378610816e-06, + "loss": 0.5748, + "step": 11673 + }, + { + "epoch": 3.099030930572149, + "grad_norm": 0.46178207720684394, + "learning_rate": 2.386223611331049e-06, + "loss": 0.56, + "step": 11674 + }, + { + "epoch": 3.099296429045533, + "grad_norm": 0.46482844879008234, + "learning_rate": 2.3858748462701996e-06, + "loss": 0.6261, + "step": 11675 + }, + { + "epoch": 3.099561927518917, + "grad_norm": 0.45341897629162803, + "learning_rate": 2.38552608343507e-06, + "loss": 0.5616, + "step": 11676 + }, + { + "epoch": 3.0998274259923004, + "grad_norm": 0.4503636359842079, + "learning_rate": 2.3851773228324624e-06, + "loss": 0.5285, + "step": 11677 + }, + { + "epoch": 3.1000929244656845, + "grad_norm": 0.4501312289403777, + "learning_rate": 2.384828564469178e-06, + "loss": 0.5199, + "step": 11678 + }, + { + "epoch": 3.100358422939068, + "grad_norm": 0.4509428598065902, + "learning_rate": 2.3844798083520187e-06, + "loss": 0.535, + "step": 11679 + }, + { + "epoch": 3.100623921412452, + "grad_norm": 0.4575445265556578, + "learning_rate": 2.384131054487787e-06, + "loss": 0.5311, + "step": 11680 + }, + { + "epoch": 3.100889419885836, + "grad_norm": 0.4549972502640145, + "learning_rate": 2.3837823028832825e-06, + "loss": 0.5376, + "step": 11681 + }, + { + "epoch": 3.1011549183592195, + "grad_norm": 0.46139631713274615, + "learning_rate": 2.3834335535453075e-06, + "loss": 0.5504, + "step": 11682 + }, + { + "epoch": 3.101420416832603, + "grad_norm": 0.4474409976447581, + "learning_rate": 2.3830848064806635e-06, + "loss": 0.5078, + "step": 11683 + }, + { + "epoch": 3.101685915305987, + "grad_norm": 0.4629258899056781, + "learning_rate": 2.382736061696152e-06, + "loss": 0.5145, + "step": 11684 + }, + { + "epoch": 3.101951413779371, + "grad_norm": 0.43832233207832816, + "learning_rate": 2.3823873191985743e-06, + "loss": 0.5674, + "step": 11685 + }, + { + "epoch": 3.1022169122527545, + "grad_norm": 0.45596779082584954, + "learning_rate": 2.382038578994733e-06, + "loss": 0.5566, + "step": 11686 + }, + { + "epoch": 3.102482410726138, + "grad_norm": 0.44157542877360395, + "learning_rate": 2.381689841091427e-06, + "loss": 0.5312, + "step": 11687 + }, + { + "epoch": 3.1027479091995223, + "grad_norm": 0.4470285227041901, + "learning_rate": 2.381341105495459e-06, + "loss": 0.5649, + "step": 11688 + }, + { + "epoch": 3.103013407672906, + "grad_norm": 0.46329320383695966, + "learning_rate": 2.3809923722136298e-06, + "loss": 0.5778, + "step": 11689 + }, + { + "epoch": 3.1032789061462895, + "grad_norm": 0.4692033651479321, + "learning_rate": 2.380643641252741e-06, + "loss": 0.5229, + "step": 11690 + }, + { + "epoch": 3.1035444046196736, + "grad_norm": 0.44616414578282565, + "learning_rate": 2.380294912619594e-06, + "loss": 0.5341, + "step": 11691 + }, + { + "epoch": 3.1038099030930573, + "grad_norm": 0.45234813987575695, + "learning_rate": 2.379946186320989e-06, + "loss": 0.5205, + "step": 11692 + }, + { + "epoch": 3.104075401566441, + "grad_norm": 0.43463918132220514, + "learning_rate": 2.3795974623637265e-06, + "loss": 0.5556, + "step": 11693 + }, + { + "epoch": 3.1043409000398245, + "grad_norm": 0.44083089330814906, + "learning_rate": 2.3792487407546087e-06, + "loss": 0.5355, + "step": 11694 + }, + { + "epoch": 3.1046063985132086, + "grad_norm": 0.4530164680787931, + "learning_rate": 2.3789000215004366e-06, + "loss": 0.562, + "step": 11695 + }, + { + "epoch": 3.1048718969865923, + "grad_norm": 0.45947491042120703, + "learning_rate": 2.37855130460801e-06, + "loss": 0.5215, + "step": 11696 + }, + { + "epoch": 3.105137395459976, + "grad_norm": 0.44575068308308474, + "learning_rate": 2.3782025900841314e-06, + "loss": 0.5885, + "step": 11697 + }, + { + "epoch": 3.10540289393336, + "grad_norm": 0.451986042189683, + "learning_rate": 2.3778538779355996e-06, + "loss": 0.5491, + "step": 11698 + }, + { + "epoch": 3.1056683924067436, + "grad_norm": 0.45402515184793113, + "learning_rate": 2.3775051681692162e-06, + "loss": 0.5935, + "step": 11699 + }, + { + "epoch": 3.1059338908801273, + "grad_norm": 0.4751543136792409, + "learning_rate": 2.377156460791782e-06, + "loss": 0.5564, + "step": 11700 + }, + { + "epoch": 3.1061993893535114, + "grad_norm": 0.45009078362303184, + "learning_rate": 2.3768077558100974e-06, + "loss": 0.5379, + "step": 11701 + }, + { + "epoch": 3.106464887826895, + "grad_norm": 0.4570427369997015, + "learning_rate": 2.3764590532309635e-06, + "loss": 0.5175, + "step": 11702 + }, + { + "epoch": 3.1067303863002786, + "grad_norm": 0.45386640858526156, + "learning_rate": 2.376110353061181e-06, + "loss": 0.5548, + "step": 11703 + }, + { + "epoch": 3.1069958847736627, + "grad_norm": 0.4518641702102767, + "learning_rate": 2.3757616553075493e-06, + "loss": 0.5569, + "step": 11704 + }, + { + "epoch": 3.1072613832470464, + "grad_norm": 0.45041484996366055, + "learning_rate": 2.3754129599768694e-06, + "loss": 0.5692, + "step": 11705 + }, + { + "epoch": 3.10752688172043, + "grad_norm": 0.4554978315785331, + "learning_rate": 2.375064267075942e-06, + "loss": 0.5716, + "step": 11706 + }, + { + "epoch": 3.107792380193814, + "grad_norm": 0.44741418343448375, + "learning_rate": 2.374715576611567e-06, + "loss": 0.4959, + "step": 11707 + }, + { + "epoch": 3.1080578786671977, + "grad_norm": 0.45503747099599895, + "learning_rate": 2.3743668885905457e-06, + "loss": 0.5518, + "step": 11708 + }, + { + "epoch": 3.1083233771405814, + "grad_norm": 0.4431368452438896, + "learning_rate": 2.374018203019678e-06, + "loss": 0.5308, + "step": 11709 + }, + { + "epoch": 3.108588875613965, + "grad_norm": 0.4458652674102593, + "learning_rate": 2.373669519905763e-06, + "loss": 0.582, + "step": 11710 + }, + { + "epoch": 3.108854374087349, + "grad_norm": 0.4692365630070526, + "learning_rate": 2.373320839255601e-06, + "loss": 0.5427, + "step": 11711 + }, + { + "epoch": 3.1091198725607327, + "grad_norm": 0.4600862992202426, + "learning_rate": 2.3729721610759938e-06, + "loss": 0.5286, + "step": 11712 + }, + { + "epoch": 3.1093853710341164, + "grad_norm": 0.4695259293918339, + "learning_rate": 2.37262348537374e-06, + "loss": 0.5427, + "step": 11713 + }, + { + "epoch": 3.1096508695075005, + "grad_norm": 0.4514257706064871, + "learning_rate": 2.3722748121556413e-06, + "loss": 0.5582, + "step": 11714 + }, + { + "epoch": 3.109916367980884, + "grad_norm": 0.4403443261707449, + "learning_rate": 2.3719261414284957e-06, + "loss": 0.5484, + "step": 11715 + }, + { + "epoch": 3.1101818664542678, + "grad_norm": 0.45837862873268126, + "learning_rate": 2.3715774731991034e-06, + "loss": 0.5788, + "step": 11716 + }, + { + "epoch": 3.110447364927652, + "grad_norm": 0.4473199203319095, + "learning_rate": 2.371228807474265e-06, + "loss": 0.502, + "step": 11717 + }, + { + "epoch": 3.1107128634010355, + "grad_norm": 0.45132572744699595, + "learning_rate": 2.3708801442607803e-06, + "loss": 0.5407, + "step": 11718 + }, + { + "epoch": 3.110978361874419, + "grad_norm": 0.44424377391294806, + "learning_rate": 2.3705314835654487e-06, + "loss": 0.5395, + "step": 11719 + }, + { + "epoch": 3.111243860347803, + "grad_norm": 0.4457680201596601, + "learning_rate": 2.370182825395071e-06, + "loss": 0.5301, + "step": 11720 + }, + { + "epoch": 3.111509358821187, + "grad_norm": 0.44869578356096657, + "learning_rate": 2.3698341697564452e-06, + "loss": 0.5489, + "step": 11721 + }, + { + "epoch": 3.1117748572945705, + "grad_norm": 0.45168581768490124, + "learning_rate": 2.369485516656372e-06, + "loss": 0.5523, + "step": 11722 + }, + { + "epoch": 3.112040355767954, + "grad_norm": 0.45681395350376636, + "learning_rate": 2.3691368661016507e-06, + "loss": 0.5215, + "step": 11723 + }, + { + "epoch": 3.112305854241338, + "grad_norm": 0.43744741076446336, + "learning_rate": 2.368788218099081e-06, + "loss": 0.5263, + "step": 11724 + }, + { + "epoch": 3.112571352714722, + "grad_norm": 0.44594978697550103, + "learning_rate": 2.3684395726554625e-06, + "loss": 0.5318, + "step": 11725 + }, + { + "epoch": 3.1128368511881055, + "grad_norm": 0.45331967052763983, + "learning_rate": 2.368090929777595e-06, + "loss": 0.5381, + "step": 11726 + }, + { + "epoch": 3.1131023496614896, + "grad_norm": 0.4694669563525516, + "learning_rate": 2.367742289472277e-06, + "loss": 0.559, + "step": 11727 + }, + { + "epoch": 3.113367848134873, + "grad_norm": 0.44937769449546033, + "learning_rate": 2.3673936517463075e-06, + "loss": 0.5295, + "step": 11728 + }, + { + "epoch": 3.113633346608257, + "grad_norm": 0.4563437266288898, + "learning_rate": 2.367045016606487e-06, + "loss": 0.5389, + "step": 11729 + }, + { + "epoch": 3.113898845081641, + "grad_norm": 0.4668787617468006, + "learning_rate": 2.3666963840596135e-06, + "loss": 0.5418, + "step": 11730 + }, + { + "epoch": 3.1141643435550246, + "grad_norm": 0.4474620015251689, + "learning_rate": 2.3663477541124876e-06, + "loss": 0.5515, + "step": 11731 + }, + { + "epoch": 3.1144298420284082, + "grad_norm": 0.45798040602451795, + "learning_rate": 2.3659991267719078e-06, + "loss": 0.5885, + "step": 11732 + }, + { + "epoch": 3.1146953405017923, + "grad_norm": 0.4669521626151058, + "learning_rate": 2.365650502044673e-06, + "loss": 0.5374, + "step": 11733 + }, + { + "epoch": 3.114960838975176, + "grad_norm": 0.45737778115290334, + "learning_rate": 2.365301879937582e-06, + "loss": 0.5623, + "step": 11734 + }, + { + "epoch": 3.1152263374485596, + "grad_norm": 0.4586176356981947, + "learning_rate": 2.364953260457434e-06, + "loss": 0.5475, + "step": 11735 + }, + { + "epoch": 3.1154918359219437, + "grad_norm": 0.4476921469239407, + "learning_rate": 2.364604643611028e-06, + "loss": 0.5353, + "step": 11736 + }, + { + "epoch": 3.1157573343953273, + "grad_norm": 0.4801789942492191, + "learning_rate": 2.3642560294051643e-06, + "loss": 0.5566, + "step": 11737 + }, + { + "epoch": 3.116022832868711, + "grad_norm": 0.4470597776203245, + "learning_rate": 2.363907417846639e-06, + "loss": 0.5745, + "step": 11738 + }, + { + "epoch": 3.1162883313420946, + "grad_norm": 0.44458031074757975, + "learning_rate": 2.3635588089422528e-06, + "loss": 0.5473, + "step": 11739 + }, + { + "epoch": 3.1165538298154787, + "grad_norm": 0.4935263470663776, + "learning_rate": 2.3632102026988035e-06, + "loss": 0.4716, + "step": 11740 + }, + { + "epoch": 3.1168193282888623, + "grad_norm": 0.45781509398117604, + "learning_rate": 2.3628615991230904e-06, + "loss": 0.5663, + "step": 11741 + }, + { + "epoch": 3.117084826762246, + "grad_norm": 0.4407026604343912, + "learning_rate": 2.362512998221912e-06, + "loss": 0.5541, + "step": 11742 + }, + { + "epoch": 3.11735032523563, + "grad_norm": 0.43370469008587437, + "learning_rate": 2.3621644000020677e-06, + "loss": 0.5501, + "step": 11743 + }, + { + "epoch": 3.1176158237090137, + "grad_norm": 0.44236715688911166, + "learning_rate": 2.361815804470354e-06, + "loss": 0.5239, + "step": 11744 + }, + { + "epoch": 3.1178813221823973, + "grad_norm": 0.4454365951175594, + "learning_rate": 2.3614672116335704e-06, + "loss": 0.5402, + "step": 11745 + }, + { + "epoch": 3.1181468206557814, + "grad_norm": 0.46649211530743734, + "learning_rate": 2.3611186214985156e-06, + "loss": 0.5819, + "step": 11746 + }, + { + "epoch": 3.118412319129165, + "grad_norm": 0.4489057380501878, + "learning_rate": 2.3607700340719875e-06, + "loss": 0.5584, + "step": 11747 + }, + { + "epoch": 3.1186778176025487, + "grad_norm": 0.4783414489956911, + "learning_rate": 2.3604214493607844e-06, + "loss": 0.5485, + "step": 11748 + }, + { + "epoch": 3.1189433160759323, + "grad_norm": 0.450927168301951, + "learning_rate": 2.3600728673717056e-06, + "loss": 0.5643, + "step": 11749 + }, + { + "epoch": 3.1192088145493164, + "grad_norm": 0.4522232063980048, + "learning_rate": 2.359724288111548e-06, + "loss": 0.5298, + "step": 11750 + }, + { + "epoch": 3.1194743130227, + "grad_norm": 0.4469001558039488, + "learning_rate": 2.35937571158711e-06, + "loss": 0.5477, + "step": 11751 + }, + { + "epoch": 3.1197398114960837, + "grad_norm": 0.44597692997654864, + "learning_rate": 2.3590271378051906e-06, + "loss": 0.5575, + "step": 11752 + }, + { + "epoch": 3.120005309969468, + "grad_norm": 0.45973556407944144, + "learning_rate": 2.3586785667725863e-06, + "loss": 0.5439, + "step": 11753 + }, + { + "epoch": 3.1202708084428514, + "grad_norm": 0.46937625765053004, + "learning_rate": 2.3583299984960963e-06, + "loss": 0.511, + "step": 11754 + }, + { + "epoch": 3.120536306916235, + "grad_norm": 0.46156198421908556, + "learning_rate": 2.3579814329825198e-06, + "loss": 0.5314, + "step": 11755 + }, + { + "epoch": 3.120801805389619, + "grad_norm": 0.4572328838983525, + "learning_rate": 2.3576328702386512e-06, + "loss": 0.5738, + "step": 11756 + }, + { + "epoch": 3.121067303863003, + "grad_norm": 0.4744917205346979, + "learning_rate": 2.35728431027129e-06, + "loss": 0.5675, + "step": 11757 + }, + { + "epoch": 3.1213328023363864, + "grad_norm": 0.4734380900001336, + "learning_rate": 2.356935753087235e-06, + "loss": 0.5861, + "step": 11758 + }, + { + "epoch": 3.1215983008097705, + "grad_norm": 0.4795260387337211, + "learning_rate": 2.3565871986932824e-06, + "loss": 0.5706, + "step": 11759 + }, + { + "epoch": 3.121863799283154, + "grad_norm": 0.48310742282946234, + "learning_rate": 2.356238647096231e-06, + "loss": 0.5081, + "step": 11760 + }, + { + "epoch": 3.122129297756538, + "grad_norm": 0.43683266659066383, + "learning_rate": 2.355890098302879e-06, + "loss": 0.5224, + "step": 11761 + }, + { + "epoch": 3.122394796229922, + "grad_norm": 0.4588944730846631, + "learning_rate": 2.355541552320022e-06, + "loss": 0.5437, + "step": 11762 + }, + { + "epoch": 3.1226602947033055, + "grad_norm": 0.4788938720972009, + "learning_rate": 2.355193009154458e-06, + "loss": 0.5722, + "step": 11763 + }, + { + "epoch": 3.122925793176689, + "grad_norm": 0.47877834442730755, + "learning_rate": 2.354844468812985e-06, + "loss": 0.5642, + "step": 11764 + }, + { + "epoch": 3.123191291650073, + "grad_norm": 0.45124919597007257, + "learning_rate": 2.3544959313024007e-06, + "loss": 0.556, + "step": 11765 + }, + { + "epoch": 3.123456790123457, + "grad_norm": 0.44213716337862324, + "learning_rate": 2.354147396629502e-06, + "loss": 0.5342, + "step": 11766 + }, + { + "epoch": 3.1237222885968405, + "grad_norm": 0.45120079869185686, + "learning_rate": 2.353798864801086e-06, + "loss": 0.5411, + "step": 11767 + }, + { + "epoch": 3.123987787070224, + "grad_norm": 0.45287146730197203, + "learning_rate": 2.35345033582395e-06, + "loss": 0.5491, + "step": 11768 + }, + { + "epoch": 3.1242532855436083, + "grad_norm": 0.44696996381347803, + "learning_rate": 2.353101809704891e-06, + "loss": 0.5317, + "step": 11769 + }, + { + "epoch": 3.124518784016992, + "grad_norm": 0.45924444509524265, + "learning_rate": 2.352753286450707e-06, + "loss": 0.5786, + "step": 11770 + }, + { + "epoch": 3.1247842824903755, + "grad_norm": 0.4626433083968456, + "learning_rate": 2.352404766068194e-06, + "loss": 0.5561, + "step": 11771 + }, + { + "epoch": 3.1250497809637596, + "grad_norm": 0.45703495688418977, + "learning_rate": 2.3520562485641506e-06, + "loss": 0.5668, + "step": 11772 + }, + { + "epoch": 3.1253152794371433, + "grad_norm": 0.4588134340702698, + "learning_rate": 2.3517077339453716e-06, + "loss": 0.5526, + "step": 11773 + }, + { + "epoch": 3.125580777910527, + "grad_norm": 0.4583810498174728, + "learning_rate": 2.3513592222186545e-06, + "loss": 0.5599, + "step": 11774 + }, + { + "epoch": 3.125846276383911, + "grad_norm": 0.45117006327058934, + "learning_rate": 2.3510107133907967e-06, + "loss": 0.5376, + "step": 11775 + }, + { + "epoch": 3.1261117748572946, + "grad_norm": 0.44717385572374685, + "learning_rate": 2.3506622074685947e-06, + "loss": 0.5809, + "step": 11776 + }, + { + "epoch": 3.1263772733306783, + "grad_norm": 0.45564658491177, + "learning_rate": 2.3503137044588457e-06, + "loss": 0.5615, + "step": 11777 + }, + { + "epoch": 3.126642771804062, + "grad_norm": 0.4578593928636026, + "learning_rate": 2.349965204368346e-06, + "loss": 0.5679, + "step": 11778 + }, + { + "epoch": 3.126908270277446, + "grad_norm": 0.4496146029151605, + "learning_rate": 2.349616707203892e-06, + "loss": 0.5511, + "step": 11779 + }, + { + "epoch": 3.1271737687508296, + "grad_norm": 0.449285539507093, + "learning_rate": 2.3492682129722803e-06, + "loss": 0.5444, + "step": 11780 + }, + { + "epoch": 3.1274392672242133, + "grad_norm": 0.44869662148393685, + "learning_rate": 2.348919721680307e-06, + "loss": 0.5083, + "step": 11781 + }, + { + "epoch": 3.1277047656975974, + "grad_norm": 0.43883607143646797, + "learning_rate": 2.34857123333477e-06, + "loss": 0.501, + "step": 11782 + }, + { + "epoch": 3.127970264170981, + "grad_norm": 0.4514988200857329, + "learning_rate": 2.348222747942464e-06, + "loss": 0.5332, + "step": 11783 + }, + { + "epoch": 3.1282357626443646, + "grad_norm": 0.4532026232902584, + "learning_rate": 2.3478742655101867e-06, + "loss": 0.5571, + "step": 11784 + }, + { + "epoch": 3.1285012611177487, + "grad_norm": 0.46522718198984775, + "learning_rate": 2.347525786044733e-06, + "loss": 0.5219, + "step": 11785 + }, + { + "epoch": 3.1287667595911324, + "grad_norm": 0.4618598216065151, + "learning_rate": 2.3471773095529002e-06, + "loss": 0.5596, + "step": 11786 + }, + { + "epoch": 3.129032258064516, + "grad_norm": 0.4479095073007572, + "learning_rate": 2.346828836041484e-06, + "loss": 0.5762, + "step": 11787 + }, + { + "epoch": 3.1292977565379, + "grad_norm": 0.4710935356662311, + "learning_rate": 2.34648036551728e-06, + "loss": 0.5192, + "step": 11788 + }, + { + "epoch": 3.1295632550112837, + "grad_norm": 0.44054884582373033, + "learning_rate": 2.3461318979870863e-06, + "loss": 0.5481, + "step": 11789 + }, + { + "epoch": 3.1298287534846674, + "grad_norm": 0.48327267587012185, + "learning_rate": 2.3457834334576964e-06, + "loss": 0.5776, + "step": 11790 + }, + { + "epoch": 3.1300942519580515, + "grad_norm": 0.46704537426248677, + "learning_rate": 2.345434971935907e-06, + "loss": 0.5239, + "step": 11791 + }, + { + "epoch": 3.130359750431435, + "grad_norm": 0.4510464443700734, + "learning_rate": 2.345086513428514e-06, + "loss": 0.5439, + "step": 11792 + }, + { + "epoch": 3.1306252489048187, + "grad_norm": 0.4427174169988707, + "learning_rate": 2.344738057942313e-06, + "loss": 0.5515, + "step": 11793 + }, + { + "epoch": 3.1308907473782024, + "grad_norm": 0.45321403456527054, + "learning_rate": 2.3443896054841003e-06, + "loss": 0.5542, + "step": 11794 + }, + { + "epoch": 3.1311562458515865, + "grad_norm": 0.45340279915072434, + "learning_rate": 2.344041156060672e-06, + "loss": 0.5313, + "step": 11795 + }, + { + "epoch": 3.13142174432497, + "grad_norm": 0.4680933198061559, + "learning_rate": 2.343692709678822e-06, + "loss": 0.5099, + "step": 11796 + }, + { + "epoch": 3.1316872427983538, + "grad_norm": 0.4481888361268329, + "learning_rate": 2.3433442663453475e-06, + "loss": 0.4944, + "step": 11797 + }, + { + "epoch": 3.131952741271738, + "grad_norm": 0.445128295739164, + "learning_rate": 2.342995826067043e-06, + "loss": 0.5205, + "step": 11798 + }, + { + "epoch": 3.1322182397451215, + "grad_norm": 0.45366238247168494, + "learning_rate": 2.342647388850704e-06, + "loss": 0.565, + "step": 11799 + }, + { + "epoch": 3.132483738218505, + "grad_norm": 0.45470504689544455, + "learning_rate": 2.3422989547031265e-06, + "loss": 0.5202, + "step": 11800 + }, + { + "epoch": 3.132749236691889, + "grad_norm": 0.4470652866631795, + "learning_rate": 2.3419505236311064e-06, + "loss": 0.5642, + "step": 11801 + }, + { + "epoch": 3.133014735165273, + "grad_norm": 0.4642963466438548, + "learning_rate": 2.341602095641437e-06, + "loss": 0.5258, + "step": 11802 + }, + { + "epoch": 3.1332802336386565, + "grad_norm": 0.4485459058918683, + "learning_rate": 2.3412536707409143e-06, + "loss": 0.5121, + "step": 11803 + }, + { + "epoch": 3.13354573211204, + "grad_norm": 0.44115598330684735, + "learning_rate": 2.3409052489363342e-06, + "loss": 0.5578, + "step": 11804 + }, + { + "epoch": 3.133811230585424, + "grad_norm": 0.4703540551522825, + "learning_rate": 2.3405568302344913e-06, + "loss": 0.5415, + "step": 11805 + }, + { + "epoch": 3.134076729058808, + "grad_norm": 0.4509277986609646, + "learning_rate": 2.3402084146421804e-06, + "loss": 0.5525, + "step": 11806 + }, + { + "epoch": 3.1343422275321915, + "grad_norm": 0.4514718874244348, + "learning_rate": 2.339860002166198e-06, + "loss": 0.5354, + "step": 11807 + }, + { + "epoch": 3.1346077260055756, + "grad_norm": 0.4600614937927662, + "learning_rate": 2.339511592813336e-06, + "loss": 0.5406, + "step": 11808 + }, + { + "epoch": 3.134873224478959, + "grad_norm": 0.45846880663391903, + "learning_rate": 2.3391631865903918e-06, + "loss": 0.5323, + "step": 11809 + }, + { + "epoch": 3.135138722952343, + "grad_norm": 0.46783416479276835, + "learning_rate": 2.3388147835041586e-06, + "loss": 0.5707, + "step": 11810 + }, + { + "epoch": 3.135404221425727, + "grad_norm": 0.45068343291927604, + "learning_rate": 2.338466383561432e-06, + "loss": 0.5625, + "step": 11811 + }, + { + "epoch": 3.1356697198991106, + "grad_norm": 0.4591908076813028, + "learning_rate": 2.3381179867690067e-06, + "loss": 0.5471, + "step": 11812 + }, + { + "epoch": 3.1359352183724942, + "grad_norm": 0.4599059088040292, + "learning_rate": 2.3377695931336766e-06, + "loss": 0.5612, + "step": 11813 + }, + { + "epoch": 3.1362007168458783, + "grad_norm": 0.45690702686973267, + "learning_rate": 2.3374212026622365e-06, + "loss": 0.5755, + "step": 11814 + }, + { + "epoch": 3.136466215319262, + "grad_norm": 0.4621955302218684, + "learning_rate": 2.3370728153614813e-06, + "loss": 0.5441, + "step": 11815 + }, + { + "epoch": 3.1367317137926456, + "grad_norm": 0.45919592802179554, + "learning_rate": 2.336724431238205e-06, + "loss": 0.5474, + "step": 11816 + }, + { + "epoch": 3.1369972122660297, + "grad_norm": 0.45810490096345674, + "learning_rate": 2.336376050299202e-06, + "loss": 0.5599, + "step": 11817 + }, + { + "epoch": 3.1372627107394133, + "grad_norm": 0.44726843871709215, + "learning_rate": 2.3360276725512683e-06, + "loss": 0.564, + "step": 11818 + }, + { + "epoch": 3.137528209212797, + "grad_norm": 0.4566162827304913, + "learning_rate": 2.3356792980011947e-06, + "loss": 0.5357, + "step": 11819 + }, + { + "epoch": 3.1377937076861806, + "grad_norm": 0.4556921610849757, + "learning_rate": 2.335330926655777e-06, + "loss": 0.5206, + "step": 11820 + }, + { + "epoch": 3.1380592061595647, + "grad_norm": 0.46243462376451944, + "learning_rate": 2.33498255852181e-06, + "loss": 0.5623, + "step": 11821 + }, + { + "epoch": 3.1383247046329483, + "grad_norm": 0.4519184831768744, + "learning_rate": 2.3346341936060867e-06, + "loss": 0.5405, + "step": 11822 + }, + { + "epoch": 3.138590203106332, + "grad_norm": 0.4619948634004656, + "learning_rate": 2.334285831915401e-06, + "loss": 0.5318, + "step": 11823 + }, + { + "epoch": 3.138855701579716, + "grad_norm": 0.4575298450110613, + "learning_rate": 2.333937473456549e-06, + "loss": 0.5668, + "step": 11824 + }, + { + "epoch": 3.1391212000530997, + "grad_norm": 0.4496015943208066, + "learning_rate": 2.333589118236322e-06, + "loss": 0.552, + "step": 11825 + }, + { + "epoch": 3.1393866985264833, + "grad_norm": 0.44877558260701894, + "learning_rate": 2.3332407662615145e-06, + "loss": 0.5255, + "step": 11826 + }, + { + "epoch": 3.1396521969998674, + "grad_norm": 0.453504290556328, + "learning_rate": 2.33289241753892e-06, + "loss": 0.5534, + "step": 11827 + }, + { + "epoch": 3.139917695473251, + "grad_norm": 0.4506235391189544, + "learning_rate": 2.332544072075333e-06, + "loss": 0.5512, + "step": 11828 + }, + { + "epoch": 3.1401831939466347, + "grad_norm": 0.45145279435136004, + "learning_rate": 2.3321957298775464e-06, + "loss": 0.5583, + "step": 11829 + }, + { + "epoch": 3.140448692420019, + "grad_norm": 0.452387072937862, + "learning_rate": 2.3318473909523543e-06, + "loss": 0.5604, + "step": 11830 + }, + { + "epoch": 3.1407141908934024, + "grad_norm": 0.4607830426546223, + "learning_rate": 2.3314990553065494e-06, + "loss": 0.5708, + "step": 11831 + }, + { + "epoch": 3.140979689366786, + "grad_norm": 0.4568216243191098, + "learning_rate": 2.3311507229469256e-06, + "loss": 0.5591, + "step": 11832 + }, + { + "epoch": 3.1412451878401697, + "grad_norm": 0.45178472430567024, + "learning_rate": 2.330802393880276e-06, + "loss": 0.5148, + "step": 11833 + }, + { + "epoch": 3.141510686313554, + "grad_norm": 0.44573715501904415, + "learning_rate": 2.330454068113394e-06, + "loss": 0.5067, + "step": 11834 + }, + { + "epoch": 3.1417761847869374, + "grad_norm": 0.4438082651575303, + "learning_rate": 2.330105745653074e-06, + "loss": 0.5616, + "step": 11835 + }, + { + "epoch": 3.142041683260321, + "grad_norm": 0.4572405008364656, + "learning_rate": 2.329757426506107e-06, + "loss": 0.5788, + "step": 11836 + }, + { + "epoch": 3.142307181733705, + "grad_norm": 0.4464227669642761, + "learning_rate": 2.3294091106792864e-06, + "loss": 0.5119, + "step": 11837 + }, + { + "epoch": 3.142572680207089, + "grad_norm": 0.43968926664032654, + "learning_rate": 2.329060798179406e-06, + "loss": 0.5545, + "step": 11838 + }, + { + "epoch": 3.1428381786804724, + "grad_norm": 0.4587224890881845, + "learning_rate": 2.328712489013259e-06, + "loss": 0.528, + "step": 11839 + }, + { + "epoch": 3.1431036771538565, + "grad_norm": 0.463554552097399, + "learning_rate": 2.3283641831876375e-06, + "loss": 0.5584, + "step": 11840 + }, + { + "epoch": 3.14336917562724, + "grad_norm": 0.44545713975558804, + "learning_rate": 2.3280158807093355e-06, + "loss": 0.5176, + "step": 11841 + }, + { + "epoch": 3.143634674100624, + "grad_norm": 0.4662438381352694, + "learning_rate": 2.327667581585144e-06, + "loss": 0.5348, + "step": 11842 + }, + { + "epoch": 3.143900172574008, + "grad_norm": 0.45100909553468216, + "learning_rate": 2.3273192858218567e-06, + "loss": 0.551, + "step": 11843 + }, + { + "epoch": 3.1441656710473915, + "grad_norm": 0.4476004829756491, + "learning_rate": 2.3269709934262664e-06, + "loss": 0.5576, + "step": 11844 + }, + { + "epoch": 3.144431169520775, + "grad_norm": 0.4647501023873188, + "learning_rate": 2.3266227044051647e-06, + "loss": 0.5554, + "step": 11845 + }, + { + "epoch": 3.1446966679941593, + "grad_norm": 0.4546178945372103, + "learning_rate": 2.3262744187653453e-06, + "loss": 0.5793, + "step": 11846 + }, + { + "epoch": 3.144962166467543, + "grad_norm": 0.4634833018000255, + "learning_rate": 2.3259261365136014e-06, + "loss": 0.554, + "step": 11847 + }, + { + "epoch": 3.1452276649409265, + "grad_norm": 0.45095087246713256, + "learning_rate": 2.325577857656722e-06, + "loss": 0.5694, + "step": 11848 + }, + { + "epoch": 3.14549316341431, + "grad_norm": 0.4532292387845422, + "learning_rate": 2.325229582201502e-06, + "loss": 0.5149, + "step": 11849 + }, + { + "epoch": 3.1457586618876943, + "grad_norm": 0.4460049402496013, + "learning_rate": 2.324881310154733e-06, + "loss": 0.546, + "step": 11850 + }, + { + "epoch": 3.146024160361078, + "grad_norm": 0.45422293174408374, + "learning_rate": 2.3245330415232074e-06, + "loss": 0.5702, + "step": 11851 + }, + { + "epoch": 3.1462896588344615, + "grad_norm": 0.43826609261206567, + "learning_rate": 2.3241847763137172e-06, + "loss": 0.5775, + "step": 11852 + }, + { + "epoch": 3.1465551573078456, + "grad_norm": 0.465867842377378, + "learning_rate": 2.3238365145330554e-06, + "loss": 0.5611, + "step": 11853 + }, + { + "epoch": 3.1468206557812293, + "grad_norm": 0.45578392589909705, + "learning_rate": 2.3234882561880117e-06, + "loss": 0.5068, + "step": 11854 + }, + { + "epoch": 3.147086154254613, + "grad_norm": 0.4512131423831718, + "learning_rate": 2.323140001285379e-06, + "loss": 0.5383, + "step": 11855 + }, + { + "epoch": 3.147351652727997, + "grad_norm": 0.4536569472563547, + "learning_rate": 2.3227917498319495e-06, + "loss": 0.5741, + "step": 11856 + }, + { + "epoch": 3.1476171512013806, + "grad_norm": 0.4592691910167352, + "learning_rate": 2.3224435018345148e-06, + "loss": 0.5286, + "step": 11857 + }, + { + "epoch": 3.1478826496747643, + "grad_norm": 0.44847520456716194, + "learning_rate": 2.322095257299867e-06, + "loss": 0.5921, + "step": 11858 + }, + { + "epoch": 3.148148148148148, + "grad_norm": 0.4615011128530479, + "learning_rate": 2.3217470162347967e-06, + "loss": 0.5438, + "step": 11859 + }, + { + "epoch": 3.148413646621532, + "grad_norm": 0.45434002064169093, + "learning_rate": 2.3213987786460963e-06, + "loss": 0.5749, + "step": 11860 + }, + { + "epoch": 3.1486791450949156, + "grad_norm": 0.4574653987830054, + "learning_rate": 2.3210505445405564e-06, + "loss": 0.5549, + "step": 11861 + }, + { + "epoch": 3.1489446435682993, + "grad_norm": 0.458277267074342, + "learning_rate": 2.3207023139249696e-06, + "loss": 0.5327, + "step": 11862 + }, + { + "epoch": 3.1492101420416834, + "grad_norm": 0.4553064141982108, + "learning_rate": 2.3203540868061263e-06, + "loss": 0.5675, + "step": 11863 + }, + { + "epoch": 3.149475640515067, + "grad_norm": 0.46381709315182235, + "learning_rate": 2.3200058631908196e-06, + "loss": 0.6012, + "step": 11864 + }, + { + "epoch": 3.1497411389884507, + "grad_norm": 0.45828934432132007, + "learning_rate": 2.319657643085838e-06, + "loss": 0.5772, + "step": 11865 + }, + { + "epoch": 3.1500066374618347, + "grad_norm": 0.45868509950179476, + "learning_rate": 2.3193094264979737e-06, + "loss": 0.5499, + "step": 11866 + }, + { + "epoch": 3.1502721359352184, + "grad_norm": 0.47561676424653343, + "learning_rate": 2.3189612134340184e-06, + "loss": 0.5544, + "step": 11867 + }, + { + "epoch": 3.150537634408602, + "grad_norm": 0.44259374677351376, + "learning_rate": 2.3186130039007628e-06, + "loss": 0.5387, + "step": 11868 + }, + { + "epoch": 3.150803132881986, + "grad_norm": 0.46992140656105286, + "learning_rate": 2.3182647979049967e-06, + "loss": 0.5647, + "step": 11869 + }, + { + "epoch": 3.1510686313553697, + "grad_norm": 0.4384444378142557, + "learning_rate": 2.3179165954535138e-06, + "loss": 0.543, + "step": 11870 + }, + { + "epoch": 3.1513341298287534, + "grad_norm": 0.447322539832735, + "learning_rate": 2.317568396553102e-06, + "loss": 0.5354, + "step": 11871 + }, + { + "epoch": 3.1515996283021375, + "grad_norm": 0.45522410185966244, + "learning_rate": 2.3172202012105533e-06, + "loss": 0.553, + "step": 11872 + }, + { + "epoch": 3.151865126775521, + "grad_norm": 0.47185336038782544, + "learning_rate": 2.316872009432658e-06, + "loss": 0.5499, + "step": 11873 + }, + { + "epoch": 3.1521306252489047, + "grad_norm": 0.46740893731854743, + "learning_rate": 2.316523821226207e-06, + "loss": 0.5442, + "step": 11874 + }, + { + "epoch": 3.1523961237222884, + "grad_norm": 0.4538978349820548, + "learning_rate": 2.3161756365979905e-06, + "loss": 0.5106, + "step": 11875 + }, + { + "epoch": 3.1526616221956725, + "grad_norm": 0.4383725636056833, + "learning_rate": 2.3158274555548e-06, + "loss": 0.5369, + "step": 11876 + }, + { + "epoch": 3.152927120669056, + "grad_norm": 0.458289464802042, + "learning_rate": 2.3154792781034245e-06, + "loss": 0.5317, + "step": 11877 + }, + { + "epoch": 3.1531926191424398, + "grad_norm": 0.4485180957545061, + "learning_rate": 2.3151311042506547e-06, + "loss": 0.5455, + "step": 11878 + }, + { + "epoch": 3.153458117615824, + "grad_norm": 0.4424529529518882, + "learning_rate": 2.314782934003281e-06, + "loss": 0.577, + "step": 11879 + }, + { + "epoch": 3.1537236160892075, + "grad_norm": 0.48368952294590534, + "learning_rate": 2.3144347673680937e-06, + "loss": 0.5554, + "step": 11880 + }, + { + "epoch": 3.153989114562591, + "grad_norm": 0.4468828892820441, + "learning_rate": 2.314086604351883e-06, + "loss": 0.5401, + "step": 11881 + }, + { + "epoch": 3.154254613035975, + "grad_norm": 0.4534676232194772, + "learning_rate": 2.3137384449614392e-06, + "loss": 0.5468, + "step": 11882 + }, + { + "epoch": 3.154520111509359, + "grad_norm": 0.4617975977696021, + "learning_rate": 2.313390289203551e-06, + "loss": 0.5633, + "step": 11883 + }, + { + "epoch": 3.1547856099827425, + "grad_norm": 0.45961329025233716, + "learning_rate": 2.313042137085009e-06, + "loss": 0.5466, + "step": 11884 + }, + { + "epoch": 3.1550511084561266, + "grad_norm": 0.46140884465785875, + "learning_rate": 2.312693988612603e-06, + "loss": 0.5235, + "step": 11885 + }, + { + "epoch": 3.15531660692951, + "grad_norm": 0.46017972046761896, + "learning_rate": 2.3123458437931234e-06, + "loss": 0.5257, + "step": 11886 + }, + { + "epoch": 3.155582105402894, + "grad_norm": 0.45782551115668935, + "learning_rate": 2.311997702633359e-06, + "loss": 0.5486, + "step": 11887 + }, + { + "epoch": 3.1558476038762775, + "grad_norm": 0.4556464378648574, + "learning_rate": 2.3116495651401e-06, + "loss": 0.5691, + "step": 11888 + }, + { + "epoch": 3.1561131023496616, + "grad_norm": 0.4527791128096492, + "learning_rate": 2.3113014313201353e-06, + "loss": 0.589, + "step": 11889 + }, + { + "epoch": 3.156378600823045, + "grad_norm": 0.4694596305845738, + "learning_rate": 2.3109533011802544e-06, + "loss": 0.5637, + "step": 11890 + }, + { + "epoch": 3.156644099296429, + "grad_norm": 0.44273057471544064, + "learning_rate": 2.310605174727247e-06, + "loss": 0.5154, + "step": 11891 + }, + { + "epoch": 3.156909597769813, + "grad_norm": 0.4619852478624283, + "learning_rate": 2.3102570519679027e-06, + "loss": 0.5636, + "step": 11892 + }, + { + "epoch": 3.1571750962431966, + "grad_norm": 0.4646091603101284, + "learning_rate": 2.3099089329090115e-06, + "loss": 0.5556, + "step": 11893 + }, + { + "epoch": 3.1574405947165802, + "grad_norm": 0.4668777010606396, + "learning_rate": 2.30956081755736e-06, + "loss": 0.5325, + "step": 11894 + }, + { + "epoch": 3.1577060931899643, + "grad_norm": 0.45927350078000767, + "learning_rate": 2.3092127059197394e-06, + "loss": 0.5736, + "step": 11895 + }, + { + "epoch": 3.157971591663348, + "grad_norm": 0.46308138195763815, + "learning_rate": 2.3088645980029377e-06, + "loss": 0.5999, + "step": 11896 + }, + { + "epoch": 3.1582370901367316, + "grad_norm": 0.46950608585504244, + "learning_rate": 2.3085164938137446e-06, + "loss": 0.5709, + "step": 11897 + }, + { + "epoch": 3.1585025886101157, + "grad_norm": 0.45039099700353363, + "learning_rate": 2.3081683933589486e-06, + "loss": 0.5408, + "step": 11898 + }, + { + "epoch": 3.1587680870834993, + "grad_norm": 0.4364745188609442, + "learning_rate": 2.30782029664534e-06, + "loss": 0.5558, + "step": 11899 + }, + { + "epoch": 3.159033585556883, + "grad_norm": 0.45798365668270336, + "learning_rate": 2.307472203679705e-06, + "loss": 0.5464, + "step": 11900 + }, + { + "epoch": 3.159299084030267, + "grad_norm": 0.4462187590629579, + "learning_rate": 2.3071241144688335e-06, + "loss": 0.5425, + "step": 11901 + }, + { + "epoch": 3.1595645825036507, + "grad_norm": 0.4475251677886612, + "learning_rate": 2.306776029019514e-06, + "loss": 0.5647, + "step": 11902 + }, + { + "epoch": 3.1598300809770343, + "grad_norm": 0.44981184194941526, + "learning_rate": 2.3064279473385344e-06, + "loss": 0.5285, + "step": 11903 + }, + { + "epoch": 3.160095579450418, + "grad_norm": 0.4682206800698678, + "learning_rate": 2.3060798694326843e-06, + "loss": 0.5745, + "step": 11904 + }, + { + "epoch": 3.160361077923802, + "grad_norm": 0.44524995744002105, + "learning_rate": 2.305731795308752e-06, + "loss": 0.5622, + "step": 11905 + }, + { + "epoch": 3.1606265763971857, + "grad_norm": 0.4558035053055899, + "learning_rate": 2.305383724973525e-06, + "loss": 0.5555, + "step": 11906 + }, + { + "epoch": 3.1608920748705693, + "grad_norm": 0.4721828951515266, + "learning_rate": 2.3050356584337917e-06, + "loss": 0.5416, + "step": 11907 + }, + { + "epoch": 3.1611575733439534, + "grad_norm": 0.4446206220869704, + "learning_rate": 2.30468759569634e-06, + "loss": 0.5602, + "step": 11908 + }, + { + "epoch": 3.161423071817337, + "grad_norm": 0.46952691845740646, + "learning_rate": 2.304339536767959e-06, + "loss": 0.5496, + "step": 11909 + }, + { + "epoch": 3.1616885702907207, + "grad_norm": 0.46082072633497934, + "learning_rate": 2.3039914816554373e-06, + "loss": 0.5308, + "step": 11910 + }, + { + "epoch": 3.161954068764105, + "grad_norm": 0.45829078660720957, + "learning_rate": 2.30364343036556e-06, + "loss": 0.5559, + "step": 11911 + }, + { + "epoch": 3.1622195672374884, + "grad_norm": 0.46516840904427664, + "learning_rate": 2.3032953829051166e-06, + "loss": 0.5579, + "step": 11912 + }, + { + "epoch": 3.162485065710872, + "grad_norm": 0.4456581336622198, + "learning_rate": 2.302947339280895e-06, + "loss": 0.5396, + "step": 11913 + }, + { + "epoch": 3.1627505641842557, + "grad_norm": 0.4596621465567285, + "learning_rate": 2.302599299499683e-06, + "loss": 0.5486, + "step": 11914 + }, + { + "epoch": 3.16301606265764, + "grad_norm": 0.44672630278292447, + "learning_rate": 2.302251263568267e-06, + "loss": 0.5362, + "step": 11915 + }, + { + "epoch": 3.1632815611310234, + "grad_norm": 0.4548573150208644, + "learning_rate": 2.3019032314934377e-06, + "loss": 0.5362, + "step": 11916 + }, + { + "epoch": 3.163547059604407, + "grad_norm": 0.4757949569485824, + "learning_rate": 2.301555203281979e-06, + "loss": 0.5432, + "step": 11917 + }, + { + "epoch": 3.163812558077791, + "grad_norm": 0.450711984322392, + "learning_rate": 2.3012071789406797e-06, + "loss": 0.5365, + "step": 11918 + }, + { + "epoch": 3.164078056551175, + "grad_norm": 0.4711714557958494, + "learning_rate": 2.3008591584763266e-06, + "loss": 0.5494, + "step": 11919 + }, + { + "epoch": 3.1643435550245584, + "grad_norm": 0.45993654962098, + "learning_rate": 2.3005111418957084e-06, + "loss": 0.5151, + "step": 11920 + }, + { + "epoch": 3.1646090534979425, + "grad_norm": 0.44092050017653484, + "learning_rate": 2.300163129205611e-06, + "loss": 0.5517, + "step": 11921 + }, + { + "epoch": 3.164874551971326, + "grad_norm": 0.4577916406260054, + "learning_rate": 2.2998151204128225e-06, + "loss": 0.5346, + "step": 11922 + }, + { + "epoch": 3.16514005044471, + "grad_norm": 0.4566487614485163, + "learning_rate": 2.2994671155241287e-06, + "loss": 0.5512, + "step": 11923 + }, + { + "epoch": 3.165405548918094, + "grad_norm": 0.4586303785217844, + "learning_rate": 2.299119114546317e-06, + "loss": 0.5074, + "step": 11924 + }, + { + "epoch": 3.1656710473914775, + "grad_norm": 0.4431922571512932, + "learning_rate": 2.2987711174861745e-06, + "loss": 0.5307, + "step": 11925 + }, + { + "epoch": 3.165936545864861, + "grad_norm": 0.4465445116872388, + "learning_rate": 2.2984231243504877e-06, + "loss": 0.5395, + "step": 11926 + }, + { + "epoch": 3.1662020443382453, + "grad_norm": 0.47166576253612597, + "learning_rate": 2.298075135146044e-06, + "loss": 0.567, + "step": 11927 + }, + { + "epoch": 3.166467542811629, + "grad_norm": 0.45857533208259, + "learning_rate": 2.2977271498796303e-06, + "loss": 0.5617, + "step": 11928 + }, + { + "epoch": 3.1667330412850125, + "grad_norm": 0.44057361651831567, + "learning_rate": 2.2973791685580313e-06, + "loss": 0.5344, + "step": 11929 + }, + { + "epoch": 3.1669985397583966, + "grad_norm": 0.46064935425687686, + "learning_rate": 2.2970311911880345e-06, + "loss": 0.5892, + "step": 11930 + }, + { + "epoch": 3.1672640382317803, + "grad_norm": 0.45689291662741055, + "learning_rate": 2.2966832177764268e-06, + "loss": 0.579, + "step": 11931 + }, + { + "epoch": 3.167529536705164, + "grad_norm": 0.45590590724984836, + "learning_rate": 2.296335248329994e-06, + "loss": 0.5589, + "step": 11932 + }, + { + "epoch": 3.1677950351785475, + "grad_norm": 0.4432010792154784, + "learning_rate": 2.295987282855523e-06, + "loss": 0.5517, + "step": 11933 + }, + { + "epoch": 3.1680605336519316, + "grad_norm": 0.4535460304201819, + "learning_rate": 2.2956393213597996e-06, + "loss": 0.5575, + "step": 11934 + }, + { + "epoch": 3.1683260321253153, + "grad_norm": 0.4472429511601056, + "learning_rate": 2.2952913638496088e-06, + "loss": 0.5666, + "step": 11935 + }, + { + "epoch": 3.168591530598699, + "grad_norm": 0.45519272895729573, + "learning_rate": 2.294943410331738e-06, + "loss": 0.5492, + "step": 11936 + }, + { + "epoch": 3.168857029072083, + "grad_norm": 0.4496165437786142, + "learning_rate": 2.294595460812973e-06, + "loss": 0.5577, + "step": 11937 + }, + { + "epoch": 3.1691225275454666, + "grad_norm": 0.4567582927424457, + "learning_rate": 2.294247515300099e-06, + "loss": 0.5308, + "step": 11938 + }, + { + "epoch": 3.1693880260188503, + "grad_norm": 0.4559478049438848, + "learning_rate": 2.293899573799904e-06, + "loss": 0.5481, + "step": 11939 + }, + { + "epoch": 3.1696535244922344, + "grad_norm": 0.4580583642804652, + "learning_rate": 2.2935516363191695e-06, + "loss": 0.5803, + "step": 11940 + }, + { + "epoch": 3.169919022965618, + "grad_norm": 0.4512815847150975, + "learning_rate": 2.2932037028646843e-06, + "loss": 0.5643, + "step": 11941 + }, + { + "epoch": 3.1701845214390016, + "grad_norm": 0.4395173673345484, + "learning_rate": 2.2928557734432334e-06, + "loss": 0.5134, + "step": 11942 + }, + { + "epoch": 3.1704500199123853, + "grad_norm": 0.45927137400986795, + "learning_rate": 2.2925078480616023e-06, + "loss": 0.5538, + "step": 11943 + }, + { + "epoch": 3.1707155183857694, + "grad_norm": 0.4648944791227291, + "learning_rate": 2.292159926726576e-06, + "loss": 0.5378, + "step": 11944 + }, + { + "epoch": 3.170981016859153, + "grad_norm": 0.4488916888498538, + "learning_rate": 2.291812009444941e-06, + "loss": 0.5304, + "step": 11945 + }, + { + "epoch": 3.1712465153325367, + "grad_norm": 0.44352560345299413, + "learning_rate": 2.2914640962234805e-06, + "loss": 0.5136, + "step": 11946 + }, + { + "epoch": 3.1715120138059207, + "grad_norm": 0.4708600371657463, + "learning_rate": 2.2911161870689806e-06, + "loss": 0.5465, + "step": 11947 + }, + { + "epoch": 3.1717775122793044, + "grad_norm": 0.45775028611192936, + "learning_rate": 2.2907682819882263e-06, + "loss": 0.5737, + "step": 11948 + }, + { + "epoch": 3.172043010752688, + "grad_norm": 0.46017534142323596, + "learning_rate": 2.2904203809880033e-06, + "loss": 0.55, + "step": 11949 + }, + { + "epoch": 3.172308509226072, + "grad_norm": 0.448756471154226, + "learning_rate": 2.290072484075096e-06, + "loss": 0.5707, + "step": 11950 + }, + { + "epoch": 3.1725740076994557, + "grad_norm": 0.4634043266146457, + "learning_rate": 2.2897245912562895e-06, + "loss": 0.5513, + "step": 11951 + }, + { + "epoch": 3.1728395061728394, + "grad_norm": 0.44983267361748785, + "learning_rate": 2.289376702538368e-06, + "loss": 0.5596, + "step": 11952 + }, + { + "epoch": 3.1731050046462235, + "grad_norm": 0.4472643215390135, + "learning_rate": 2.2890288179281163e-06, + "loss": 0.527, + "step": 11953 + }, + { + "epoch": 3.173370503119607, + "grad_norm": 0.45415175539460245, + "learning_rate": 2.2886809374323193e-06, + "loss": 0.5296, + "step": 11954 + }, + { + "epoch": 3.1736360015929908, + "grad_norm": 0.45180212643048523, + "learning_rate": 2.288333061057761e-06, + "loss": 0.5674, + "step": 11955 + }, + { + "epoch": 3.173901500066375, + "grad_norm": 0.482653627030646, + "learning_rate": 2.287985188811228e-06, + "loss": 0.5568, + "step": 11956 + }, + { + "epoch": 3.1741669985397585, + "grad_norm": 0.4682150997849175, + "learning_rate": 2.2876373206995016e-06, + "loss": 0.5556, + "step": 11957 + }, + { + "epoch": 3.174432497013142, + "grad_norm": 0.46590560439215517, + "learning_rate": 2.2872894567293673e-06, + "loss": 0.6059, + "step": 11958 + }, + { + "epoch": 3.1746979954865258, + "grad_norm": 0.46556796630195274, + "learning_rate": 2.2869415969076095e-06, + "loss": 0.5665, + "step": 11959 + }, + { + "epoch": 3.17496349395991, + "grad_norm": 0.4424825426775279, + "learning_rate": 2.2865937412410117e-06, + "loss": 0.5079, + "step": 11960 + }, + { + "epoch": 3.1752289924332935, + "grad_norm": 0.45887644809287786, + "learning_rate": 2.286245889736359e-06, + "loss": 0.5498, + "step": 11961 + }, + { + "epoch": 3.175494490906677, + "grad_norm": 0.4409957756866072, + "learning_rate": 2.2858980424004356e-06, + "loss": 0.5537, + "step": 11962 + }, + { + "epoch": 3.175759989380061, + "grad_norm": 0.46018503733655214, + "learning_rate": 2.2855501992400238e-06, + "loss": 0.5522, + "step": 11963 + }, + { + "epoch": 3.176025487853445, + "grad_norm": 0.4478207696342093, + "learning_rate": 2.285202360261908e-06, + "loss": 0.5561, + "step": 11964 + }, + { + "epoch": 3.1762909863268285, + "grad_norm": 0.4509626756394883, + "learning_rate": 2.2848545254728723e-06, + "loss": 0.5937, + "step": 11965 + }, + { + "epoch": 3.1765564848002126, + "grad_norm": 0.4735172319303348, + "learning_rate": 2.2845066948797003e-06, + "loss": 0.532, + "step": 11966 + }, + { + "epoch": 3.176821983273596, + "grad_norm": 0.45510264104461606, + "learning_rate": 2.2841588684891746e-06, + "loss": 0.5762, + "step": 11967 + }, + { + "epoch": 3.17708748174698, + "grad_norm": 0.45940511443012727, + "learning_rate": 2.283811046308081e-06, + "loss": 0.5882, + "step": 11968 + }, + { + "epoch": 3.1773529802203635, + "grad_norm": 0.4593209916670848, + "learning_rate": 2.2834632283432e-06, + "loss": 0.561, + "step": 11969 + }, + { + "epoch": 3.1776184786937476, + "grad_norm": 0.4677320326489966, + "learning_rate": 2.2831154146013163e-06, + "loss": 0.5745, + "step": 11970 + }, + { + "epoch": 3.1778839771671312, + "grad_norm": 0.44025127110192513, + "learning_rate": 2.282767605089213e-06, + "loss": 0.5404, + "step": 11971 + }, + { + "epoch": 3.178149475640515, + "grad_norm": 0.459600201265043, + "learning_rate": 2.2824197998136737e-06, + "loss": 0.5345, + "step": 11972 + }, + { + "epoch": 3.178414974113899, + "grad_norm": 0.4510852089507729, + "learning_rate": 2.2820719987814806e-06, + "loss": 0.5756, + "step": 11973 + }, + { + "epoch": 3.1786804725872826, + "grad_norm": 0.45612439516926667, + "learning_rate": 2.2817242019994185e-06, + "loss": 0.5756, + "step": 11974 + }, + { + "epoch": 3.1789459710606662, + "grad_norm": 0.47673808738057594, + "learning_rate": 2.281376409474268e-06, + "loss": 0.562, + "step": 11975 + }, + { + "epoch": 3.1792114695340503, + "grad_norm": 0.46393072132264174, + "learning_rate": 2.281028621212812e-06, + "loss": 0.5119, + "step": 11976 + }, + { + "epoch": 3.179476968007434, + "grad_norm": 0.474091952795938, + "learning_rate": 2.280680837221835e-06, + "loss": 0.5173, + "step": 11977 + }, + { + "epoch": 3.1797424664808176, + "grad_norm": 0.4626248482084982, + "learning_rate": 2.2803330575081177e-06, + "loss": 0.5518, + "step": 11978 + }, + { + "epoch": 3.1800079649542017, + "grad_norm": 0.46396505374532665, + "learning_rate": 2.279985282078445e-06, + "loss": 0.4993, + "step": 11979 + }, + { + "epoch": 3.1802734634275853, + "grad_norm": 0.4577769150827353, + "learning_rate": 2.2796375109395968e-06, + "loss": 0.5705, + "step": 11980 + }, + { + "epoch": 3.180538961900969, + "grad_norm": 0.4560043970137087, + "learning_rate": 2.279289744098357e-06, + "loss": 0.5355, + "step": 11981 + }, + { + "epoch": 3.180804460374353, + "grad_norm": 0.4401055290037387, + "learning_rate": 2.2789419815615075e-06, + "loss": 0.5225, + "step": 11982 + }, + { + "epoch": 3.1810699588477367, + "grad_norm": 0.4482367031330957, + "learning_rate": 2.2785942233358308e-06, + "loss": 0.5499, + "step": 11983 + }, + { + "epoch": 3.1813354573211203, + "grad_norm": 0.45572598590679947, + "learning_rate": 2.2782464694281086e-06, + "loss": 0.5251, + "step": 11984 + }, + { + "epoch": 3.1816009557945044, + "grad_norm": 0.4667275212883273, + "learning_rate": 2.2778987198451237e-06, + "loss": 0.5705, + "step": 11985 + }, + { + "epoch": 3.181866454267888, + "grad_norm": 0.44958010361997885, + "learning_rate": 2.2775509745936573e-06, + "loss": 0.5558, + "step": 11986 + }, + { + "epoch": 3.1821319527412717, + "grad_norm": 0.46626592902557124, + "learning_rate": 2.277203233680491e-06, + "loss": 0.5574, + "step": 11987 + }, + { + "epoch": 3.1823974512146553, + "grad_norm": 0.4635906252070708, + "learning_rate": 2.276855497112408e-06, + "loss": 0.5213, + "step": 11988 + }, + { + "epoch": 3.1826629496880394, + "grad_norm": 0.45264797469297147, + "learning_rate": 2.2765077648961887e-06, + "loss": 0.5401, + "step": 11989 + }, + { + "epoch": 3.182928448161423, + "grad_norm": 0.4399546623244452, + "learning_rate": 2.276160037038615e-06, + "loss": 0.5237, + "step": 11990 + }, + { + "epoch": 3.1831939466348067, + "grad_norm": 0.4487082587186952, + "learning_rate": 2.27581231354647e-06, + "loss": 0.5294, + "step": 11991 + }, + { + "epoch": 3.183459445108191, + "grad_norm": 0.47512218392202166, + "learning_rate": 2.275464594426533e-06, + "loss": 0.5897, + "step": 11992 + }, + { + "epoch": 3.1837249435815744, + "grad_norm": 0.468212533986608, + "learning_rate": 2.2751168796855855e-06, + "loss": 0.5088, + "step": 11993 + }, + { + "epoch": 3.183990442054958, + "grad_norm": 0.46094414792627186, + "learning_rate": 2.27476916933041e-06, + "loss": 0.5116, + "step": 11994 + }, + { + "epoch": 3.184255940528342, + "grad_norm": 0.44229883939908693, + "learning_rate": 2.274421463367786e-06, + "loss": 0.547, + "step": 11995 + }, + { + "epoch": 3.184521439001726, + "grad_norm": 0.46093167367687804, + "learning_rate": 2.274073761804497e-06, + "loss": 0.543, + "step": 11996 + }, + { + "epoch": 3.1847869374751094, + "grad_norm": 0.4487065111767353, + "learning_rate": 2.273726064647323e-06, + "loss": 0.5426, + "step": 11997 + }, + { + "epoch": 3.185052435948493, + "grad_norm": 0.45691013144107645, + "learning_rate": 2.2733783719030444e-06, + "loss": 0.5303, + "step": 11998 + }, + { + "epoch": 3.185317934421877, + "grad_norm": 0.45419276777520756, + "learning_rate": 2.2730306835784423e-06, + "loss": 0.5156, + "step": 11999 + }, + { + "epoch": 3.185583432895261, + "grad_norm": 0.45450884051682744, + "learning_rate": 2.272682999680297e-06, + "loss": 0.5085, + "step": 12000 + }, + { + "epoch": 3.1858489313686444, + "grad_norm": 0.463686702003172, + "learning_rate": 2.2723353202153908e-06, + "loss": 0.5728, + "step": 12001 + }, + { + "epoch": 3.1861144298420285, + "grad_norm": 0.4612983885435976, + "learning_rate": 2.2719876451905028e-06, + "loss": 0.5512, + "step": 12002 + }, + { + "epoch": 3.186379928315412, + "grad_norm": 0.4450596182726371, + "learning_rate": 2.271639974612415e-06, + "loss": 0.5655, + "step": 12003 + }, + { + "epoch": 3.186645426788796, + "grad_norm": 0.466999257292668, + "learning_rate": 2.2712923084879056e-06, + "loss": 0.5431, + "step": 12004 + }, + { + "epoch": 3.18691092526218, + "grad_norm": 0.4555153611578263, + "learning_rate": 2.2709446468237566e-06, + "loss": 0.5498, + "step": 12005 + }, + { + "epoch": 3.1871764237355635, + "grad_norm": 0.4629705146813975, + "learning_rate": 2.2705969896267467e-06, + "loss": 0.5468, + "step": 12006 + }, + { + "epoch": 3.187441922208947, + "grad_norm": 0.4509704544541768, + "learning_rate": 2.270249336903658e-06, + "loss": 0.5719, + "step": 12007 + }, + { + "epoch": 3.1877074206823313, + "grad_norm": 0.46999624999682965, + "learning_rate": 2.269901688661271e-06, + "loss": 0.5387, + "step": 12008 + }, + { + "epoch": 3.187972919155715, + "grad_norm": 0.44971157740260614, + "learning_rate": 2.2695540449063634e-06, + "loss": 0.5787, + "step": 12009 + }, + { + "epoch": 3.1882384176290985, + "grad_norm": 0.45668394990435407, + "learning_rate": 2.2692064056457157e-06, + "loss": 0.558, + "step": 12010 + }, + { + "epoch": 3.1885039161024826, + "grad_norm": 0.4496143677955816, + "learning_rate": 2.2688587708861084e-06, + "loss": 0.5545, + "step": 12011 + }, + { + "epoch": 3.1887694145758663, + "grad_norm": 0.440637091969325, + "learning_rate": 2.268511140634321e-06, + "loss": 0.5616, + "step": 12012 + }, + { + "epoch": 3.18903491304925, + "grad_norm": 0.4660644754839236, + "learning_rate": 2.2681635148971334e-06, + "loss": 0.5393, + "step": 12013 + }, + { + "epoch": 3.1893004115226335, + "grad_norm": 0.44462407578478047, + "learning_rate": 2.267815893681325e-06, + "loss": 0.552, + "step": 12014 + }, + { + "epoch": 3.1895659099960176, + "grad_norm": 0.46355666669317747, + "learning_rate": 2.267468276993675e-06, + "loss": 0.5576, + "step": 12015 + }, + { + "epoch": 3.1898314084694013, + "grad_norm": 0.4581048296855899, + "learning_rate": 2.2671206648409625e-06, + "loss": 0.5872, + "step": 12016 + }, + { + "epoch": 3.190096906942785, + "grad_norm": 0.46121080803605136, + "learning_rate": 2.266773057229968e-06, + "loss": 0.5531, + "step": 12017 + }, + { + "epoch": 3.190362405416169, + "grad_norm": 0.4608905259243993, + "learning_rate": 2.2664254541674692e-06, + "loss": 0.5465, + "step": 12018 + }, + { + "epoch": 3.1906279038895526, + "grad_norm": 0.4618892315378303, + "learning_rate": 2.266077855660246e-06, + "loss": 0.5309, + "step": 12019 + }, + { + "epoch": 3.1908934023629363, + "grad_norm": 0.4577611693930053, + "learning_rate": 2.2657302617150785e-06, + "loss": 0.5607, + "step": 12020 + }, + { + "epoch": 3.1911589008363204, + "grad_norm": 0.4388856593475079, + "learning_rate": 2.265382672338744e-06, + "loss": 0.4986, + "step": 12021 + }, + { + "epoch": 3.191424399309704, + "grad_norm": 0.4535276687953131, + "learning_rate": 2.265035087538021e-06, + "loss": 0.5137, + "step": 12022 + }, + { + "epoch": 3.1916898977830876, + "grad_norm": 0.43577116403381694, + "learning_rate": 2.2646875073196895e-06, + "loss": 0.5543, + "step": 12023 + }, + { + "epoch": 3.1919553962564717, + "grad_norm": 0.4639696857394536, + "learning_rate": 2.2643399316905277e-06, + "loss": 0.5468, + "step": 12024 + }, + { + "epoch": 3.1922208947298554, + "grad_norm": 0.4434211391947143, + "learning_rate": 2.263992360657314e-06, + "loss": 0.5343, + "step": 12025 + }, + { + "epoch": 3.192486393203239, + "grad_norm": 0.4590767927260959, + "learning_rate": 2.263644794226828e-06, + "loss": 0.5257, + "step": 12026 + }, + { + "epoch": 3.1927518916766227, + "grad_norm": 0.45200571658006, + "learning_rate": 2.2632972324058462e-06, + "loss": 0.5569, + "step": 12027 + }, + { + "epoch": 3.1930173901500067, + "grad_norm": 0.4707935313823779, + "learning_rate": 2.2629496752011486e-06, + "loss": 0.5768, + "step": 12028 + }, + { + "epoch": 3.1932828886233904, + "grad_norm": 0.4614747272850468, + "learning_rate": 2.2626021226195123e-06, + "loss": 0.5615, + "step": 12029 + }, + { + "epoch": 3.193548387096774, + "grad_norm": 0.4548660363541643, + "learning_rate": 2.2622545746677157e-06, + "loss": 0.5441, + "step": 12030 + }, + { + "epoch": 3.193813885570158, + "grad_norm": 0.44517187875612163, + "learning_rate": 2.2619070313525377e-06, + "loss": 0.4936, + "step": 12031 + }, + { + "epoch": 3.1940793840435417, + "grad_norm": 0.44847273092111056, + "learning_rate": 2.2615594926807554e-06, + "loss": 0.5438, + "step": 12032 + }, + { + "epoch": 3.1943448825169254, + "grad_norm": 0.4582845204066536, + "learning_rate": 2.2612119586591462e-06, + "loss": 0.5158, + "step": 12033 + }, + { + "epoch": 3.1946103809903095, + "grad_norm": 0.4691001153907748, + "learning_rate": 2.2608644292944886e-06, + "loss": 0.5714, + "step": 12034 + }, + { + "epoch": 3.194875879463693, + "grad_norm": 0.45743358215071744, + "learning_rate": 2.2605169045935606e-06, + "loss": 0.5613, + "step": 12035 + }, + { + "epoch": 3.1951413779370768, + "grad_norm": 0.4341302331623997, + "learning_rate": 2.2601693845631387e-06, + "loss": 0.5389, + "step": 12036 + }, + { + "epoch": 3.195406876410461, + "grad_norm": 0.4727388039673577, + "learning_rate": 2.2598218692100023e-06, + "loss": 0.565, + "step": 12037 + }, + { + "epoch": 3.1956723748838445, + "grad_norm": 0.4592950393171834, + "learning_rate": 2.2594743585409263e-06, + "loss": 0.5123, + "step": 12038 + }, + { + "epoch": 3.195937873357228, + "grad_norm": 0.4360063807145816, + "learning_rate": 2.2591268525626896e-06, + "loss": 0.5354, + "step": 12039 + }, + { + "epoch": 3.196203371830612, + "grad_norm": 0.44070428224772124, + "learning_rate": 2.2587793512820688e-06, + "loss": 0.5511, + "step": 12040 + }, + { + "epoch": 3.196468870303996, + "grad_norm": 0.4527837138948097, + "learning_rate": 2.2584318547058414e-06, + "loss": 0.5429, + "step": 12041 + }, + { + "epoch": 3.1967343687773795, + "grad_norm": 0.46156954611492307, + "learning_rate": 2.2580843628407842e-06, + "loss": 0.5968, + "step": 12042 + }, + { + "epoch": 3.196999867250763, + "grad_norm": 0.4600964713946351, + "learning_rate": 2.2577368756936747e-06, + "loss": 0.5519, + "step": 12043 + }, + { + "epoch": 3.197265365724147, + "grad_norm": 0.4588229331576669, + "learning_rate": 2.257389393271289e-06, + "loss": 0.5397, + "step": 12044 + }, + { + "epoch": 3.197530864197531, + "grad_norm": 0.4508866314953257, + "learning_rate": 2.2570419155804038e-06, + "loss": 0.5408, + "step": 12045 + }, + { + "epoch": 3.1977963626709145, + "grad_norm": 0.4645064039383363, + "learning_rate": 2.256694442627796e-06, + "loss": 0.5216, + "step": 12046 + }, + { + "epoch": 3.1980618611442986, + "grad_norm": 0.4564802153939025, + "learning_rate": 2.256346974420243e-06, + "loss": 0.5128, + "step": 12047 + }, + { + "epoch": 3.198327359617682, + "grad_norm": 0.44992453751388645, + "learning_rate": 2.25599951096452e-06, + "loss": 0.5775, + "step": 12048 + }, + { + "epoch": 3.198592858091066, + "grad_norm": 0.45272866581134596, + "learning_rate": 2.255652052267405e-06, + "loss": 0.5654, + "step": 12049 + }, + { + "epoch": 3.19885835656445, + "grad_norm": 0.46225153529222474, + "learning_rate": 2.2553045983356726e-06, + "loss": 0.5701, + "step": 12050 + }, + { + "epoch": 3.1991238550378336, + "grad_norm": 0.4535628174109866, + "learning_rate": 2.2549571491760985e-06, + "loss": 0.5458, + "step": 12051 + }, + { + "epoch": 3.1993893535112172, + "grad_norm": 0.4607797671906974, + "learning_rate": 2.254609704795461e-06, + "loss": 0.5467, + "step": 12052 + }, + { + "epoch": 3.199654851984601, + "grad_norm": 0.470816052538922, + "learning_rate": 2.2542622652005346e-06, + "loss": 0.5469, + "step": 12053 + }, + { + "epoch": 3.199920350457985, + "grad_norm": 0.45565485148084167, + "learning_rate": 2.2539148303980974e-06, + "loss": 0.5171, + "step": 12054 + }, + { + "epoch": 3.2001858489313686, + "grad_norm": 0.4479978505775746, + "learning_rate": 2.253567400394922e-06, + "loss": 0.5481, + "step": 12055 + }, + { + "epoch": 3.2004513474047522, + "grad_norm": 0.44181825140960695, + "learning_rate": 2.2532199751977856e-06, + "loss": 0.5608, + "step": 12056 + }, + { + "epoch": 3.2007168458781363, + "grad_norm": 0.4479570571438855, + "learning_rate": 2.252872554813464e-06, + "loss": 0.5152, + "step": 12057 + }, + { + "epoch": 3.20098234435152, + "grad_norm": 0.47002670294432924, + "learning_rate": 2.2525251392487328e-06, + "loss": 0.5261, + "step": 12058 + }, + { + "epoch": 3.2012478428249036, + "grad_norm": 0.45402092226230917, + "learning_rate": 2.2521777285103673e-06, + "loss": 0.5485, + "step": 12059 + }, + { + "epoch": 3.2015133412982877, + "grad_norm": 0.4690168015304434, + "learning_rate": 2.2518303226051435e-06, + "loss": 0.5532, + "step": 12060 + }, + { + "epoch": 3.2017788397716713, + "grad_norm": 0.4498716109355263, + "learning_rate": 2.2514829215398354e-06, + "loss": 0.5327, + "step": 12061 + }, + { + "epoch": 3.202044338245055, + "grad_norm": 0.45858012458963276, + "learning_rate": 2.2511355253212186e-06, + "loss": 0.5606, + "step": 12062 + }, + { + "epoch": 3.202309836718439, + "grad_norm": 0.4548005249086935, + "learning_rate": 2.2507881339560687e-06, + "loss": 0.5403, + "step": 12063 + }, + { + "epoch": 3.2025753351918227, + "grad_norm": 0.49008197469923637, + "learning_rate": 2.25044074745116e-06, + "loss": 0.5416, + "step": 12064 + }, + { + "epoch": 3.2028408336652063, + "grad_norm": 0.4551498377877728, + "learning_rate": 2.2500933658132683e-06, + "loss": 0.5738, + "step": 12065 + }, + { + "epoch": 3.2031063321385904, + "grad_norm": 0.4618772963686445, + "learning_rate": 2.2497459890491682e-06, + "loss": 0.5457, + "step": 12066 + }, + { + "epoch": 3.203371830611974, + "grad_norm": 0.45878121212282486, + "learning_rate": 2.2493986171656335e-06, + "loss": 0.5835, + "step": 12067 + }, + { + "epoch": 3.2036373290853577, + "grad_norm": 0.4752351285463351, + "learning_rate": 2.2490512501694394e-06, + "loss": 0.5295, + "step": 12068 + }, + { + "epoch": 3.2039028275587413, + "grad_norm": 0.44611285960299535, + "learning_rate": 2.2487038880673597e-06, + "loss": 0.5754, + "step": 12069 + }, + { + "epoch": 3.2041683260321254, + "grad_norm": 0.4709357865274885, + "learning_rate": 2.24835653086617e-06, + "loss": 0.5422, + "step": 12070 + }, + { + "epoch": 3.204433824505509, + "grad_norm": 0.4610817227608981, + "learning_rate": 2.248009178572644e-06, + "loss": 0.5233, + "step": 12071 + }, + { + "epoch": 3.2046993229788927, + "grad_norm": 0.4528930371804233, + "learning_rate": 2.2476618311935566e-06, + "loss": 0.508, + "step": 12072 + }, + { + "epoch": 3.204964821452277, + "grad_norm": 0.4481102857538323, + "learning_rate": 2.247314488735681e-06, + "loss": 0.5432, + "step": 12073 + }, + { + "epoch": 3.2052303199256604, + "grad_norm": 0.4671262988786407, + "learning_rate": 2.2469671512057907e-06, + "loss": 0.5736, + "step": 12074 + }, + { + "epoch": 3.205495818399044, + "grad_norm": 0.45117701472613003, + "learning_rate": 2.246619818610661e-06, + "loss": 0.5501, + "step": 12075 + }, + { + "epoch": 3.205761316872428, + "grad_norm": 0.4890338961575178, + "learning_rate": 2.2462724909570654e-06, + "loss": 0.4999, + "step": 12076 + }, + { + "epoch": 3.206026815345812, + "grad_norm": 0.4547605036692314, + "learning_rate": 2.2459251682517778e-06, + "loss": 0.5612, + "step": 12077 + }, + { + "epoch": 3.2062923138191954, + "grad_norm": 0.44653337837380486, + "learning_rate": 2.245577850501571e-06, + "loss": 0.5576, + "step": 12078 + }, + { + "epoch": 3.2065578122925795, + "grad_norm": 0.4652059954447466, + "learning_rate": 2.245230537713219e-06, + "loss": 0.5774, + "step": 12079 + }, + { + "epoch": 3.206823310765963, + "grad_norm": 0.4501068914291192, + "learning_rate": 2.244883229893495e-06, + "loss": 0.5703, + "step": 12080 + }, + { + "epoch": 3.207088809239347, + "grad_norm": 0.46878727246919116, + "learning_rate": 2.2445359270491735e-06, + "loss": 0.5747, + "step": 12081 + }, + { + "epoch": 3.2073543077127304, + "grad_norm": 0.4729406387605314, + "learning_rate": 2.2441886291870263e-06, + "loss": 0.5677, + "step": 12082 + }, + { + "epoch": 3.2076198061861145, + "grad_norm": 0.45966063067065105, + "learning_rate": 2.2438413363138283e-06, + "loss": 0.5436, + "step": 12083 + }, + { + "epoch": 3.207885304659498, + "grad_norm": 0.45546426349227226, + "learning_rate": 2.2434940484363505e-06, + "loss": 0.5318, + "step": 12084 + }, + { + "epoch": 3.208150803132882, + "grad_norm": 0.4549677523565809, + "learning_rate": 2.2431467655613667e-06, + "loss": 0.5372, + "step": 12085 + }, + { + "epoch": 3.208416301606266, + "grad_norm": 0.45277210155815134, + "learning_rate": 2.24279948769565e-06, + "loss": 0.5494, + "step": 12086 + }, + { + "epoch": 3.2086818000796495, + "grad_norm": 0.4627747034900131, + "learning_rate": 2.2424522148459727e-06, + "loss": 0.5635, + "step": 12087 + }, + { + "epoch": 3.208947298553033, + "grad_norm": 0.4510628916255789, + "learning_rate": 2.2421049470191077e-06, + "loss": 0.557, + "step": 12088 + }, + { + "epoch": 3.2092127970264173, + "grad_norm": 0.4533828423471941, + "learning_rate": 2.2417576842218287e-06, + "loss": 0.5344, + "step": 12089 + }, + { + "epoch": 3.209478295499801, + "grad_norm": 0.4669724652502168, + "learning_rate": 2.2414104264609064e-06, + "loss": 0.5167, + "step": 12090 + }, + { + "epoch": 3.2097437939731845, + "grad_norm": 0.45594452700812144, + "learning_rate": 2.2410631737431136e-06, + "loss": 0.5792, + "step": 12091 + }, + { + "epoch": 3.2100092924465686, + "grad_norm": 0.46395962574191985, + "learning_rate": 2.2407159260752233e-06, + "loss": 0.554, + "step": 12092 + }, + { + "epoch": 3.2102747909199523, + "grad_norm": 0.46931571697257185, + "learning_rate": 2.2403686834640074e-06, + "loss": 0.5514, + "step": 12093 + }, + { + "epoch": 3.210540289393336, + "grad_norm": 0.4848476102512686, + "learning_rate": 2.2400214459162376e-06, + "loss": 0.5647, + "step": 12094 + }, + { + "epoch": 3.21080578786672, + "grad_norm": 0.46571891452389336, + "learning_rate": 2.239674213438687e-06, + "loss": 0.5742, + "step": 12095 + }, + { + "epoch": 3.2110712863401036, + "grad_norm": 0.4656596644272797, + "learning_rate": 2.239326986038126e-06, + "loss": 0.5357, + "step": 12096 + }, + { + "epoch": 3.2113367848134873, + "grad_norm": 0.4763818268153712, + "learning_rate": 2.238979763721326e-06, + "loss": 0.5257, + "step": 12097 + }, + { + "epoch": 3.211602283286871, + "grad_norm": 0.46889928409729764, + "learning_rate": 2.2386325464950604e-06, + "loss": 0.5586, + "step": 12098 + }, + { + "epoch": 3.211867781760255, + "grad_norm": 0.446437680670925, + "learning_rate": 2.2382853343661005e-06, + "loss": 0.5754, + "step": 12099 + }, + { + "epoch": 3.2121332802336386, + "grad_norm": 0.4754970797630485, + "learning_rate": 2.237938127341218e-06, + "loss": 0.5612, + "step": 12100 + }, + { + "epoch": 3.2123987787070223, + "grad_norm": 0.4677208564964637, + "learning_rate": 2.237590925427183e-06, + "loss": 0.5391, + "step": 12101 + }, + { + "epoch": 3.2126642771804064, + "grad_norm": 0.4695931236951604, + "learning_rate": 2.237243728630767e-06, + "loss": 0.5193, + "step": 12102 + }, + { + "epoch": 3.21292977565379, + "grad_norm": 0.4497338233579656, + "learning_rate": 2.2368965369587416e-06, + "loss": 0.5175, + "step": 12103 + }, + { + "epoch": 3.2131952741271737, + "grad_norm": 0.451723849156023, + "learning_rate": 2.2365493504178784e-06, + "loss": 0.5267, + "step": 12104 + }, + { + "epoch": 3.2134607726005577, + "grad_norm": 0.45168296666022734, + "learning_rate": 2.2362021690149476e-06, + "loss": 0.52, + "step": 12105 + }, + { + "epoch": 3.2137262710739414, + "grad_norm": 0.45246909625047804, + "learning_rate": 2.235854992756721e-06, + "loss": 0.559, + "step": 12106 + }, + { + "epoch": 3.213991769547325, + "grad_norm": 0.4573653635617974, + "learning_rate": 2.235507821649968e-06, + "loss": 0.5096, + "step": 12107 + }, + { + "epoch": 3.2142572680207087, + "grad_norm": 0.45368240343468075, + "learning_rate": 2.23516065570146e-06, + "loss": 0.5518, + "step": 12108 + }, + { + "epoch": 3.2145227664940927, + "grad_norm": 0.46948971125596406, + "learning_rate": 2.234813494917968e-06, + "loss": 0.5315, + "step": 12109 + }, + { + "epoch": 3.2147882649674764, + "grad_norm": 0.4585606700071447, + "learning_rate": 2.234466339306262e-06, + "loss": 0.4933, + "step": 12110 + }, + { + "epoch": 3.21505376344086, + "grad_norm": 0.4703784538853916, + "learning_rate": 2.2341191888731127e-06, + "loss": 0.5552, + "step": 12111 + }, + { + "epoch": 3.215319261914244, + "grad_norm": 0.44464686912513046, + "learning_rate": 2.233772043625291e-06, + "loss": 0.5875, + "step": 12112 + }, + { + "epoch": 3.2155847603876277, + "grad_norm": 0.46274246587299905, + "learning_rate": 2.2334249035695656e-06, + "loss": 0.5572, + "step": 12113 + }, + { + "epoch": 3.2158502588610114, + "grad_norm": 0.44512400348257325, + "learning_rate": 2.233077768712707e-06, + "loss": 0.5539, + "step": 12114 + }, + { + "epoch": 3.2161157573343955, + "grad_norm": 0.44128150992307696, + "learning_rate": 2.2327306390614854e-06, + "loss": 0.5681, + "step": 12115 + }, + { + "epoch": 3.216381255807779, + "grad_norm": 0.4529521159573287, + "learning_rate": 2.2323835146226706e-06, + "loss": 0.5112, + "step": 12116 + }, + { + "epoch": 3.2166467542811628, + "grad_norm": 0.47760805388944544, + "learning_rate": 2.2320363954030326e-06, + "loss": 0.5559, + "step": 12117 + }, + { + "epoch": 3.216912252754547, + "grad_norm": 0.46417438150451695, + "learning_rate": 2.231689281409342e-06, + "loss": 0.5112, + "step": 12118 + }, + { + "epoch": 3.2171777512279305, + "grad_norm": 0.4699795409075221, + "learning_rate": 2.2313421726483663e-06, + "loss": 0.5391, + "step": 12119 + }, + { + "epoch": 3.217443249701314, + "grad_norm": 0.45310331280150884, + "learning_rate": 2.2309950691268763e-06, + "loss": 0.5514, + "step": 12120 + }, + { + "epoch": 3.217708748174698, + "grad_norm": 0.46829186434105485, + "learning_rate": 2.230647970851641e-06, + "loss": 0.508, + "step": 12121 + }, + { + "epoch": 3.217974246648082, + "grad_norm": 0.4718360330198163, + "learning_rate": 2.2303008778294292e-06, + "loss": 0.5316, + "step": 12122 + }, + { + "epoch": 3.2182397451214655, + "grad_norm": 0.4647252295897281, + "learning_rate": 2.2299537900670114e-06, + "loss": 0.5433, + "step": 12123 + }, + { + "epoch": 3.218505243594849, + "grad_norm": 0.4578173347754867, + "learning_rate": 2.229606707571156e-06, + "loss": 0.5778, + "step": 12124 + }, + { + "epoch": 3.218770742068233, + "grad_norm": 0.4511035043832162, + "learning_rate": 2.2292596303486314e-06, + "loss": 0.5274, + "step": 12125 + }, + { + "epoch": 3.219036240541617, + "grad_norm": 0.47448842514494627, + "learning_rate": 2.2289125584062073e-06, + "loss": 0.53, + "step": 12126 + }, + { + "epoch": 3.2193017390150005, + "grad_norm": 0.4639869128370032, + "learning_rate": 2.228565491750651e-06, + "loss": 0.5299, + "step": 12127 + }, + { + "epoch": 3.2195672374883846, + "grad_norm": 0.4584662461537314, + "learning_rate": 2.2282184303887333e-06, + "loss": 0.5007, + "step": 12128 + }, + { + "epoch": 3.219832735961768, + "grad_norm": 0.4663110405042132, + "learning_rate": 2.2278713743272224e-06, + "loss": 0.564, + "step": 12129 + }, + { + "epoch": 3.220098234435152, + "grad_norm": 0.4673406841538499, + "learning_rate": 2.2275243235728845e-06, + "loss": 0.5731, + "step": 12130 + }, + { + "epoch": 3.220363732908536, + "grad_norm": 0.47030636110289986, + "learning_rate": 2.22717727813249e-06, + "loss": 0.5801, + "step": 12131 + }, + { + "epoch": 3.2206292313819196, + "grad_norm": 0.46900073011499105, + "learning_rate": 2.226830238012806e-06, + "loss": 0.5217, + "step": 12132 + }, + { + "epoch": 3.2208947298553032, + "grad_norm": 0.4674387077100141, + "learning_rate": 2.2264832032206016e-06, + "loss": 0.5449, + "step": 12133 + }, + { + "epoch": 3.2211602283286873, + "grad_norm": 0.46153847590659663, + "learning_rate": 2.2261361737626443e-06, + "loss": 0.5246, + "step": 12134 + }, + { + "epoch": 3.221425726802071, + "grad_norm": 0.4336889807400292, + "learning_rate": 2.225789149645703e-06, + "loss": 0.4993, + "step": 12135 + }, + { + "epoch": 3.2216912252754546, + "grad_norm": 0.4697712684557118, + "learning_rate": 2.2254421308765436e-06, + "loss": 0.552, + "step": 12136 + }, + { + "epoch": 3.2219567237488382, + "grad_norm": 0.440115187230645, + "learning_rate": 2.225095117461935e-06, + "loss": 0.5416, + "step": 12137 + }, + { + "epoch": 3.2222222222222223, + "grad_norm": 0.4806838936062625, + "learning_rate": 2.2247481094086447e-06, + "loss": 0.5607, + "step": 12138 + }, + { + "epoch": 3.222487720695606, + "grad_norm": 0.4579858849906208, + "learning_rate": 2.2244011067234405e-06, + "loss": 0.577, + "step": 12139 + }, + { + "epoch": 3.2227532191689896, + "grad_norm": 0.4432900058184422, + "learning_rate": 2.224054109413089e-06, + "loss": 0.5323, + "step": 12140 + }, + { + "epoch": 3.2230187176423737, + "grad_norm": 0.45549097321417753, + "learning_rate": 2.2237071174843596e-06, + "loss": 0.5833, + "step": 12141 + }, + { + "epoch": 3.2232842161157573, + "grad_norm": 0.45396196674628586, + "learning_rate": 2.2233601309440163e-06, + "loss": 0.5568, + "step": 12142 + }, + { + "epoch": 3.223549714589141, + "grad_norm": 0.4614920207590504, + "learning_rate": 2.2230131497988278e-06, + "loss": 0.5603, + "step": 12143 + }, + { + "epoch": 3.223815213062525, + "grad_norm": 0.44942017586443134, + "learning_rate": 2.2226661740555614e-06, + "loss": 0.539, + "step": 12144 + }, + { + "epoch": 3.2240807115359087, + "grad_norm": 0.44521133851088185, + "learning_rate": 2.2223192037209836e-06, + "loss": 0.522, + "step": 12145 + }, + { + "epoch": 3.2243462100092923, + "grad_norm": 0.4521787414758281, + "learning_rate": 2.221972238801861e-06, + "loss": 0.5386, + "step": 12146 + }, + { + "epoch": 3.2246117084826764, + "grad_norm": 0.4477524357178312, + "learning_rate": 2.221625279304962e-06, + "loss": 0.5244, + "step": 12147 + }, + { + "epoch": 3.22487720695606, + "grad_norm": 0.4592824521022462, + "learning_rate": 2.2212783252370496e-06, + "loss": 0.5676, + "step": 12148 + }, + { + "epoch": 3.2251427054294437, + "grad_norm": 0.4417146126638557, + "learning_rate": 2.2209313766048934e-06, + "loss": 0.5403, + "step": 12149 + }, + { + "epoch": 3.225408203902828, + "grad_norm": 0.4366688082949209, + "learning_rate": 2.220584433415258e-06, + "loss": 0.5253, + "step": 12150 + }, + { + "epoch": 3.2256737023762114, + "grad_norm": 0.4560865805538416, + "learning_rate": 2.2202374956749105e-06, + "loss": 0.5396, + "step": 12151 + }, + { + "epoch": 3.225939200849595, + "grad_norm": 0.45554731995836184, + "learning_rate": 2.219890563390617e-06, + "loss": 0.5596, + "step": 12152 + }, + { + "epoch": 3.2262046993229787, + "grad_norm": 0.45145979357005994, + "learning_rate": 2.2195436365691434e-06, + "loss": 0.5193, + "step": 12153 + }, + { + "epoch": 3.226470197796363, + "grad_norm": 0.4580316692631737, + "learning_rate": 2.2191967152172554e-06, + "loss": 0.5433, + "step": 12154 + }, + { + "epoch": 3.2267356962697464, + "grad_norm": 0.4748272990049663, + "learning_rate": 2.2188497993417187e-06, + "loss": 0.5716, + "step": 12155 + }, + { + "epoch": 3.22700119474313, + "grad_norm": 0.46202573047756074, + "learning_rate": 2.2185028889493e-06, + "loss": 0.5847, + "step": 12156 + }, + { + "epoch": 3.227266693216514, + "grad_norm": 0.44798805376898804, + "learning_rate": 2.218155984046763e-06, + "loss": 0.538, + "step": 12157 + }, + { + "epoch": 3.227532191689898, + "grad_norm": 0.4660034709747994, + "learning_rate": 2.2178090846408762e-06, + "loss": 0.587, + "step": 12158 + }, + { + "epoch": 3.2277976901632814, + "grad_norm": 0.48147163212559496, + "learning_rate": 2.217462190738402e-06, + "loss": 0.5597, + "step": 12159 + }, + { + "epoch": 3.2280631886366655, + "grad_norm": 0.4586045285283479, + "learning_rate": 2.2171153023461067e-06, + "loss": 0.5515, + "step": 12160 + }, + { + "epoch": 3.228328687110049, + "grad_norm": 0.46502968493999525, + "learning_rate": 2.216768419470756e-06, + "loss": 0.5858, + "step": 12161 + }, + { + "epoch": 3.228594185583433, + "grad_norm": 0.4591106631592454, + "learning_rate": 2.2164215421191145e-06, + "loss": 0.5454, + "step": 12162 + }, + { + "epoch": 3.2288596840568164, + "grad_norm": 0.4958019888662801, + "learning_rate": 2.2160746702979465e-06, + "loss": 0.5402, + "step": 12163 + }, + { + "epoch": 3.2291251825302005, + "grad_norm": 0.4849185985974037, + "learning_rate": 2.215727804014019e-06, + "loss": 0.5369, + "step": 12164 + }, + { + "epoch": 3.229390681003584, + "grad_norm": 0.47333501783178134, + "learning_rate": 2.2153809432740948e-06, + "loss": 0.5616, + "step": 12165 + }, + { + "epoch": 3.229656179476968, + "grad_norm": 0.4592387052920761, + "learning_rate": 2.2150340880849387e-06, + "loss": 0.5512, + "step": 12166 + }, + { + "epoch": 3.229921677950352, + "grad_norm": 0.467101843241378, + "learning_rate": 2.2146872384533153e-06, + "loss": 0.4867, + "step": 12167 + }, + { + "epoch": 3.2301871764237355, + "grad_norm": 0.4529533947825373, + "learning_rate": 2.2143403943859903e-06, + "loss": 0.5482, + "step": 12168 + }, + { + "epoch": 3.230452674897119, + "grad_norm": 0.47249892257166426, + "learning_rate": 2.2139935558897263e-06, + "loss": 0.5489, + "step": 12169 + }, + { + "epoch": 3.2307181733705033, + "grad_norm": 0.44970666156245037, + "learning_rate": 2.2136467229712886e-06, + "loss": 0.5353, + "step": 12170 + }, + { + "epoch": 3.230983671843887, + "grad_norm": 0.46488357509370176, + "learning_rate": 2.213299895637441e-06, + "loss": 0.5431, + "step": 12171 + }, + { + "epoch": 3.2312491703172705, + "grad_norm": 0.4534898071054505, + "learning_rate": 2.2129530738949472e-06, + "loss": 0.5546, + "step": 12172 + }, + { + "epoch": 3.2315146687906546, + "grad_norm": 0.44740152800151645, + "learning_rate": 2.212606257750571e-06, + "loss": 0.5242, + "step": 12173 + }, + { + "epoch": 3.2317801672640383, + "grad_norm": 0.4649213048231864, + "learning_rate": 2.2122594472110763e-06, + "loss": 0.5471, + "step": 12174 + }, + { + "epoch": 3.232045665737422, + "grad_norm": 0.4820295704321057, + "learning_rate": 2.2119126422832286e-06, + "loss": 0.5454, + "step": 12175 + }, + { + "epoch": 3.232311164210806, + "grad_norm": 0.46720009037869287, + "learning_rate": 2.211565842973788e-06, + "loss": 0.5599, + "step": 12176 + }, + { + "epoch": 3.2325766626841896, + "grad_norm": 0.45121648978633006, + "learning_rate": 2.21121904928952e-06, + "loss": 0.558, + "step": 12177 + }, + { + "epoch": 3.2328421611575733, + "grad_norm": 0.46940860421368497, + "learning_rate": 2.210872261237188e-06, + "loss": 0.5634, + "step": 12178 + }, + { + "epoch": 3.233107659630957, + "grad_norm": 0.44614122509916604, + "learning_rate": 2.210525478823554e-06, + "loss": 0.5404, + "step": 12179 + }, + { + "epoch": 3.233373158104341, + "grad_norm": 0.4528417382414176, + "learning_rate": 2.2101787020553825e-06, + "loss": 0.5412, + "step": 12180 + }, + { + "epoch": 3.2336386565777246, + "grad_norm": 0.45937324115672057, + "learning_rate": 2.2098319309394366e-06, + "loss": 0.5466, + "step": 12181 + }, + { + "epoch": 3.2339041550511083, + "grad_norm": 0.4569451644713486, + "learning_rate": 2.2094851654824772e-06, + "loss": 0.5546, + "step": 12182 + }, + { + "epoch": 3.2341696535244924, + "grad_norm": 0.4481938655920306, + "learning_rate": 2.209138405691269e-06, + "loss": 0.5378, + "step": 12183 + }, + { + "epoch": 3.234435151997876, + "grad_norm": 0.4512656372869046, + "learning_rate": 2.2087916515725738e-06, + "loss": 0.557, + "step": 12184 + }, + { + "epoch": 3.2347006504712597, + "grad_norm": 0.46281960152815793, + "learning_rate": 2.2084449031331547e-06, + "loss": 0.559, + "step": 12185 + }, + { + "epoch": 3.2349661489446437, + "grad_norm": 0.44736310895191644, + "learning_rate": 2.2080981603797733e-06, + "loss": 0.5648, + "step": 12186 + }, + { + "epoch": 3.2352316474180274, + "grad_norm": 0.4543565784586833, + "learning_rate": 2.2077514233191937e-06, + "loss": 0.5266, + "step": 12187 + }, + { + "epoch": 3.235497145891411, + "grad_norm": 0.46002096479386184, + "learning_rate": 2.2074046919581755e-06, + "loss": 0.5332, + "step": 12188 + }, + { + "epoch": 3.235762644364795, + "grad_norm": 0.4538934417673362, + "learning_rate": 2.207057966303483e-06, + "loss": 0.5735, + "step": 12189 + }, + { + "epoch": 3.2360281428381787, + "grad_norm": 0.45619657442827855, + "learning_rate": 2.2067112463618765e-06, + "loss": 0.5644, + "step": 12190 + }, + { + "epoch": 3.2362936413115624, + "grad_norm": 0.44448051857426224, + "learning_rate": 2.20636453214012e-06, + "loss": 0.5687, + "step": 12191 + }, + { + "epoch": 3.236559139784946, + "grad_norm": 0.444621750598178, + "learning_rate": 2.206017823644973e-06, + "loss": 0.5211, + "step": 12192 + }, + { + "epoch": 3.23682463825833, + "grad_norm": 0.4485008157651884, + "learning_rate": 2.2056711208831997e-06, + "loss": 0.5677, + "step": 12193 + }, + { + "epoch": 3.2370901367317138, + "grad_norm": 0.4528398790508969, + "learning_rate": 2.205324423861559e-06, + "loss": 0.5088, + "step": 12194 + }, + { + "epoch": 3.2373556352050974, + "grad_norm": 0.4471990174442139, + "learning_rate": 2.2049777325868136e-06, + "loss": 0.5256, + "step": 12195 + }, + { + "epoch": 3.2376211336784815, + "grad_norm": 0.445900984045634, + "learning_rate": 2.204631047065725e-06, + "loss": 0.5817, + "step": 12196 + }, + { + "epoch": 3.237886632151865, + "grad_norm": 0.45361077452901394, + "learning_rate": 2.204284367305054e-06, + "loss": 0.5159, + "step": 12197 + }, + { + "epoch": 3.2381521306252488, + "grad_norm": 0.452310084875701, + "learning_rate": 2.2039376933115625e-06, + "loss": 0.5282, + "step": 12198 + }, + { + "epoch": 3.238417629098633, + "grad_norm": 0.4488876666616326, + "learning_rate": 2.20359102509201e-06, + "loss": 0.5388, + "step": 12199 + }, + { + "epoch": 3.2386831275720165, + "grad_norm": 0.45576514812790636, + "learning_rate": 2.2032443626531587e-06, + "loss": 0.535, + "step": 12200 + }, + { + "epoch": 3.2389486260454, + "grad_norm": 0.462596653433408, + "learning_rate": 2.2028977060017687e-06, + "loss": 0.5768, + "step": 12201 + }, + { + "epoch": 3.239214124518784, + "grad_norm": 0.45109602400534715, + "learning_rate": 2.2025510551446007e-06, + "loss": 0.5857, + "step": 12202 + }, + { + "epoch": 3.239479622992168, + "grad_norm": 0.4595647937941383, + "learning_rate": 2.2022044100884154e-06, + "loss": 0.5234, + "step": 12203 + }, + { + "epoch": 3.2397451214655515, + "grad_norm": 0.4771367523638718, + "learning_rate": 2.2018577708399746e-06, + "loss": 0.5432, + "step": 12204 + }, + { + "epoch": 3.2400106199389356, + "grad_norm": 0.4652568836582108, + "learning_rate": 2.201511137406036e-06, + "loss": 0.5487, + "step": 12205 + }, + { + "epoch": 3.240276118412319, + "grad_norm": 0.45308743888268965, + "learning_rate": 2.2011645097933612e-06, + "loss": 0.5494, + "step": 12206 + }, + { + "epoch": 3.240541616885703, + "grad_norm": 0.4763589233900814, + "learning_rate": 2.20081788800871e-06, + "loss": 0.5645, + "step": 12207 + }, + { + "epoch": 3.2408071153590865, + "grad_norm": 0.46797291432006216, + "learning_rate": 2.2004712720588424e-06, + "loss": 0.5599, + "step": 12208 + }, + { + "epoch": 3.2410726138324706, + "grad_norm": 0.4523293511370984, + "learning_rate": 2.200124661950518e-06, + "loss": 0.5515, + "step": 12209 + }, + { + "epoch": 3.2413381123058542, + "grad_norm": 0.4784021315369009, + "learning_rate": 2.1997780576904983e-06, + "loss": 0.5567, + "step": 12210 + }, + { + "epoch": 3.241603610779238, + "grad_norm": 0.4712859001401952, + "learning_rate": 2.199431459285541e-06, + "loss": 0.5517, + "step": 12211 + }, + { + "epoch": 3.241869109252622, + "grad_norm": 0.4466948521886155, + "learning_rate": 2.1990848667424055e-06, + "loss": 0.5073, + "step": 12212 + }, + { + "epoch": 3.2421346077260056, + "grad_norm": 0.46764161057134546, + "learning_rate": 2.198738280067852e-06, + "loss": 0.5625, + "step": 12213 + }, + { + "epoch": 3.2424001061993892, + "grad_norm": 0.4732155112232539, + "learning_rate": 2.1983916992686404e-06, + "loss": 0.5553, + "step": 12214 + }, + { + "epoch": 3.2426656046727733, + "grad_norm": 0.46146798220541685, + "learning_rate": 2.198045124351528e-06, + "loss": 0.5472, + "step": 12215 + }, + { + "epoch": 3.242931103146157, + "grad_norm": 0.4473519016083044, + "learning_rate": 2.197698555323276e-06, + "loss": 0.5522, + "step": 12216 + }, + { + "epoch": 3.2431966016195406, + "grad_norm": 0.4684066269483547, + "learning_rate": 2.1973519921906424e-06, + "loss": 0.5246, + "step": 12217 + }, + { + "epoch": 3.2434621000929242, + "grad_norm": 0.4537425994399478, + "learning_rate": 2.1970054349603853e-06, + "loss": 0.548, + "step": 12218 + }, + { + "epoch": 3.2437275985663083, + "grad_norm": 0.45478074346668496, + "learning_rate": 2.196658883639264e-06, + "loss": 0.5296, + "step": 12219 + }, + { + "epoch": 3.243993097039692, + "grad_norm": 0.4513207887052805, + "learning_rate": 2.1963123382340374e-06, + "loss": 0.5484, + "step": 12220 + }, + { + "epoch": 3.2442585955130756, + "grad_norm": 0.4514299286004895, + "learning_rate": 2.195965798751465e-06, + "loss": 0.5486, + "step": 12221 + }, + { + "epoch": 3.2445240939864597, + "grad_norm": 0.4659791010550979, + "learning_rate": 2.195619265198303e-06, + "loss": 0.5422, + "step": 12222 + }, + { + "epoch": 3.2447895924598433, + "grad_norm": 0.45580487147978704, + "learning_rate": 2.19527273758131e-06, + "loss": 0.5557, + "step": 12223 + }, + { + "epoch": 3.245055090933227, + "grad_norm": 0.4468500408575031, + "learning_rate": 2.194926215907245e-06, + "loss": 0.5601, + "step": 12224 + }, + { + "epoch": 3.245320589406611, + "grad_norm": 0.4554152796638522, + "learning_rate": 2.1945797001828657e-06, + "loss": 0.5285, + "step": 12225 + }, + { + "epoch": 3.2455860878799947, + "grad_norm": 0.4523583981292327, + "learning_rate": 2.19423319041493e-06, + "loss": 0.5264, + "step": 12226 + }, + { + "epoch": 3.2458515863533783, + "grad_norm": 0.43941840350684247, + "learning_rate": 2.1938866866101962e-06, + "loss": 0.4943, + "step": 12227 + }, + { + "epoch": 3.2461170848267624, + "grad_norm": 0.4736205118896015, + "learning_rate": 2.1935401887754213e-06, + "loss": 0.5576, + "step": 12228 + }, + { + "epoch": 3.246382583300146, + "grad_norm": 0.45033360717758586, + "learning_rate": 2.193193696917363e-06, + "loss": 0.5956, + "step": 12229 + }, + { + "epoch": 3.2466480817735297, + "grad_norm": 0.4639290296005261, + "learning_rate": 2.1928472110427784e-06, + "loss": 0.5287, + "step": 12230 + }, + { + "epoch": 3.246913580246914, + "grad_norm": 0.4481895081183422, + "learning_rate": 2.1925007311584256e-06, + "loss": 0.5571, + "step": 12231 + }, + { + "epoch": 3.2471790787202974, + "grad_norm": 0.46467440781894104, + "learning_rate": 2.1921542572710615e-06, + "loss": 0.5407, + "step": 12232 + }, + { + "epoch": 3.247444577193681, + "grad_norm": 0.4626729522932544, + "learning_rate": 2.1918077893874436e-06, + "loss": 0.5786, + "step": 12233 + }, + { + "epoch": 3.2477100756670647, + "grad_norm": 0.4650606058278087, + "learning_rate": 2.1914613275143274e-06, + "loss": 0.5364, + "step": 12234 + }, + { + "epoch": 3.247975574140449, + "grad_norm": 0.45336035894428495, + "learning_rate": 2.191114871658471e-06, + "loss": 0.5641, + "step": 12235 + }, + { + "epoch": 3.2482410726138324, + "grad_norm": 0.47239233246393225, + "learning_rate": 2.190768421826631e-06, + "loss": 0.5477, + "step": 12236 + }, + { + "epoch": 3.248506571087216, + "grad_norm": 0.44701614909347615, + "learning_rate": 2.190421978025564e-06, + "loss": 0.5373, + "step": 12237 + }, + { + "epoch": 3.2487720695606, + "grad_norm": 0.4609332627294107, + "learning_rate": 2.190075540262026e-06, + "loss": 0.5384, + "step": 12238 + }, + { + "epoch": 3.249037568033984, + "grad_norm": 0.4458144543497252, + "learning_rate": 2.189729108542775e-06, + "loss": 0.5225, + "step": 12239 + }, + { + "epoch": 3.2493030665073674, + "grad_norm": 0.4413670248541004, + "learning_rate": 2.1893826828745656e-06, + "loss": 0.5428, + "step": 12240 + }, + { + "epoch": 3.2495685649807515, + "grad_norm": 0.45306350599003875, + "learning_rate": 2.1890362632641536e-06, + "loss": 0.5455, + "step": 12241 + }, + { + "epoch": 3.249834063454135, + "grad_norm": 0.45364293517903237, + "learning_rate": 2.188689849718296e-06, + "loss": 0.5685, + "step": 12242 + }, + { + "epoch": 3.250099561927519, + "grad_norm": 0.47282242529865876, + "learning_rate": 2.1883434422437487e-06, + "loss": 0.573, + "step": 12243 + }, + { + "epoch": 3.2503650604009025, + "grad_norm": 0.46429110361282744, + "learning_rate": 2.1879970408472675e-06, + "loss": 0.5483, + "step": 12244 + }, + { + "epoch": 3.2506305588742865, + "grad_norm": 0.4559969392553884, + "learning_rate": 2.1876506455356084e-06, + "loss": 0.5682, + "step": 12245 + }, + { + "epoch": 3.25089605734767, + "grad_norm": 0.45674876607865655, + "learning_rate": 2.1873042563155257e-06, + "loss": 0.5144, + "step": 12246 + }, + { + "epoch": 3.251161555821054, + "grad_norm": 0.4620448442801122, + "learning_rate": 2.186957873193776e-06, + "loss": 0.5343, + "step": 12247 + }, + { + "epoch": 3.251427054294438, + "grad_norm": 0.4640661206716123, + "learning_rate": 2.186611496177114e-06, + "loss": 0.5521, + "step": 12248 + }, + { + "epoch": 3.2516925527678215, + "grad_norm": 0.4673762151763436, + "learning_rate": 2.186265125272295e-06, + "loss": 0.5567, + "step": 12249 + }, + { + "epoch": 3.251958051241205, + "grad_norm": 0.4540230118986643, + "learning_rate": 2.1859187604860753e-06, + "loss": 0.5536, + "step": 12250 + }, + { + "epoch": 3.2522235497145893, + "grad_norm": 0.448016804956048, + "learning_rate": 2.185572401825208e-06, + "loss": 0.5524, + "step": 12251 + }, + { + "epoch": 3.252489048187973, + "grad_norm": 0.4504075288766789, + "learning_rate": 2.1852260492964486e-06, + "loss": 0.5649, + "step": 12252 + }, + { + "epoch": 3.2527545466613565, + "grad_norm": 0.45060298785596997, + "learning_rate": 2.1848797029065517e-06, + "loss": 0.5317, + "step": 12253 + }, + { + "epoch": 3.2530200451347406, + "grad_norm": 0.46116606816857464, + "learning_rate": 2.1845333626622723e-06, + "loss": 0.5562, + "step": 12254 + }, + { + "epoch": 3.2532855436081243, + "grad_norm": 0.4502536140958218, + "learning_rate": 2.184187028570364e-06, + "loss": 0.5406, + "step": 12255 + }, + { + "epoch": 3.253551042081508, + "grad_norm": 0.46030513987158267, + "learning_rate": 2.1838407006375837e-06, + "loss": 0.5609, + "step": 12256 + }, + { + "epoch": 3.253816540554892, + "grad_norm": 0.46397187384573835, + "learning_rate": 2.183494378870683e-06, + "loss": 0.5662, + "step": 12257 + }, + { + "epoch": 3.2540820390282756, + "grad_norm": 0.4581781020205727, + "learning_rate": 2.1831480632764164e-06, + "loss": 0.5493, + "step": 12258 + }, + { + "epoch": 3.2543475375016593, + "grad_norm": 0.44988742127413267, + "learning_rate": 2.182801753861539e-06, + "loss": 0.5263, + "step": 12259 + }, + { + "epoch": 3.2546130359750434, + "grad_norm": 0.4618088019976701, + "learning_rate": 2.1824554506328033e-06, + "loss": 0.5532, + "step": 12260 + }, + { + "epoch": 3.254878534448427, + "grad_norm": 0.4435286655850674, + "learning_rate": 2.182109153596964e-06, + "loss": 0.5122, + "step": 12261 + }, + { + "epoch": 3.2551440329218106, + "grad_norm": 0.45841993963219524, + "learning_rate": 2.181762862760775e-06, + "loss": 0.5734, + "step": 12262 + }, + { + "epoch": 3.2554095313951943, + "grad_norm": 0.46582912516943265, + "learning_rate": 2.181416578130989e-06, + "loss": 0.5516, + "step": 12263 + }, + { + "epoch": 3.2556750298685784, + "grad_norm": 0.4535072663127884, + "learning_rate": 2.1810702997143597e-06, + "loss": 0.5419, + "step": 12264 + }, + { + "epoch": 3.255940528341962, + "grad_norm": 0.45088452958914055, + "learning_rate": 2.1807240275176406e-06, + "loss": 0.5242, + "step": 12265 + }, + { + "epoch": 3.2562060268153457, + "grad_norm": 0.4464104589626324, + "learning_rate": 2.180377761547585e-06, + "loss": 0.5415, + "step": 12266 + }, + { + "epoch": 3.2564715252887297, + "grad_norm": 0.4489483961090341, + "learning_rate": 2.180031501810945e-06, + "loss": 0.5359, + "step": 12267 + }, + { + "epoch": 3.2567370237621134, + "grad_norm": 0.46077893288079735, + "learning_rate": 2.179685248314475e-06, + "loss": 0.5317, + "step": 12268 + }, + { + "epoch": 3.257002522235497, + "grad_norm": 0.4604300868922512, + "learning_rate": 2.1793390010649263e-06, + "loss": 0.5467, + "step": 12269 + }, + { + "epoch": 3.257268020708881, + "grad_norm": 0.44047003662387385, + "learning_rate": 2.178992760069052e-06, + "loss": 0.531, + "step": 12270 + }, + { + "epoch": 3.2575335191822647, + "grad_norm": 0.4659524432466133, + "learning_rate": 2.178646525333605e-06, + "loss": 0.5319, + "step": 12271 + }, + { + "epoch": 3.2577990176556484, + "grad_norm": 0.45961894884958276, + "learning_rate": 2.1783002968653375e-06, + "loss": 0.5161, + "step": 12272 + }, + { + "epoch": 3.258064516129032, + "grad_norm": 0.4550542980535424, + "learning_rate": 2.1779540746710028e-06, + "loss": 0.5581, + "step": 12273 + }, + { + "epoch": 3.258330014602416, + "grad_norm": 0.46943214935068683, + "learning_rate": 2.177607858757351e-06, + "loss": 0.5542, + "step": 12274 + }, + { + "epoch": 3.2585955130757998, + "grad_norm": 0.4693414891660907, + "learning_rate": 2.1772616491311355e-06, + "loss": 0.595, + "step": 12275 + }, + { + "epoch": 3.2588610115491834, + "grad_norm": 0.45634352290625546, + "learning_rate": 2.176915445799108e-06, + "loss": 0.5547, + "step": 12276 + }, + { + "epoch": 3.2591265100225675, + "grad_norm": 0.4505007710819265, + "learning_rate": 2.1765692487680207e-06, + "loss": 0.5502, + "step": 12277 + }, + { + "epoch": 3.259392008495951, + "grad_norm": 0.45089164788539504, + "learning_rate": 2.1762230580446247e-06, + "loss": 0.5391, + "step": 12278 + }, + { + "epoch": 3.2596575069693348, + "grad_norm": 0.4756201383844795, + "learning_rate": 2.175876873635673e-06, + "loss": 0.5554, + "step": 12279 + }, + { + "epoch": 3.259923005442719, + "grad_norm": 0.46986271506127986, + "learning_rate": 2.1755306955479137e-06, + "loss": 0.5679, + "step": 12280 + }, + { + "epoch": 3.2601885039161025, + "grad_norm": 0.45375843892770507, + "learning_rate": 2.1751845237881013e-06, + "loss": 0.5328, + "step": 12281 + }, + { + "epoch": 3.260454002389486, + "grad_norm": 0.45079587443096764, + "learning_rate": 2.1748383583629857e-06, + "loss": 0.5547, + "step": 12282 + }, + { + "epoch": 3.26071950086287, + "grad_norm": 0.47380479799000436, + "learning_rate": 2.1744921992793187e-06, + "loss": 0.5519, + "step": 12283 + }, + { + "epoch": 3.260984999336254, + "grad_norm": 0.4541047389290832, + "learning_rate": 2.1741460465438507e-06, + "loss": 0.5281, + "step": 12284 + }, + { + "epoch": 3.2612504978096375, + "grad_norm": 0.46714790443204585, + "learning_rate": 2.1737999001633333e-06, + "loss": 0.542, + "step": 12285 + }, + { + "epoch": 3.2615159962830216, + "grad_norm": 0.4499956143798917, + "learning_rate": 2.1734537601445157e-06, + "loss": 0.5015, + "step": 12286 + }, + { + "epoch": 3.261781494756405, + "grad_norm": 0.44120477205898767, + "learning_rate": 2.173107626494149e-06, + "loss": 0.5094, + "step": 12287 + }, + { + "epoch": 3.262046993229789, + "grad_norm": 0.4657718455642984, + "learning_rate": 2.1727614992189842e-06, + "loss": 0.5339, + "step": 12288 + }, + { + "epoch": 3.262312491703173, + "grad_norm": 0.45233398986833173, + "learning_rate": 2.1724153783257717e-06, + "loss": 0.5179, + "step": 12289 + }, + { + "epoch": 3.2625779901765566, + "grad_norm": 0.46576182783896775, + "learning_rate": 2.172069263821261e-06, + "loss": 0.5213, + "step": 12290 + }, + { + "epoch": 3.2628434886499402, + "grad_norm": 0.44764117450619784, + "learning_rate": 2.1717231557122032e-06, + "loss": 0.5296, + "step": 12291 + }, + { + "epoch": 3.263108987123324, + "grad_norm": 0.46302309626466576, + "learning_rate": 2.1713770540053474e-06, + "loss": 0.5757, + "step": 12292 + }, + { + "epoch": 3.263374485596708, + "grad_norm": 0.45631612419447926, + "learning_rate": 2.171030958707443e-06, + "loss": 0.5762, + "step": 12293 + }, + { + "epoch": 3.2636399840700916, + "grad_norm": 0.4492634451632467, + "learning_rate": 2.1706848698252408e-06, + "loss": 0.5664, + "step": 12294 + }, + { + "epoch": 3.2639054825434752, + "grad_norm": 0.4575033662401832, + "learning_rate": 2.1703387873654896e-06, + "loss": 0.5582, + "step": 12295 + }, + { + "epoch": 3.2641709810168593, + "grad_norm": 0.46534394944435503, + "learning_rate": 2.1699927113349404e-06, + "loss": 0.5315, + "step": 12296 + }, + { + "epoch": 3.264436479490243, + "grad_norm": 0.4556667307815348, + "learning_rate": 2.1696466417403407e-06, + "loss": 0.5602, + "step": 12297 + }, + { + "epoch": 3.2647019779636266, + "grad_norm": 0.4717002788198049, + "learning_rate": 2.16930057858844e-06, + "loss": 0.5417, + "step": 12298 + }, + { + "epoch": 3.2649674764370102, + "grad_norm": 0.45028413547115853, + "learning_rate": 2.1689545218859876e-06, + "loss": 0.5638, + "step": 12299 + }, + { + "epoch": 3.2652329749103943, + "grad_norm": 0.453856312188405, + "learning_rate": 2.168608471639732e-06, + "loss": 0.5384, + "step": 12300 + }, + { + "epoch": 3.265498473383778, + "grad_norm": 0.4551687062613768, + "learning_rate": 2.1682624278564234e-06, + "loss": 0.5525, + "step": 12301 + }, + { + "epoch": 3.2657639718571616, + "grad_norm": 0.4496812130929585, + "learning_rate": 2.167916390542811e-06, + "loss": 0.5314, + "step": 12302 + }, + { + "epoch": 3.2660294703305457, + "grad_norm": 0.4478350802724008, + "learning_rate": 2.1675703597056402e-06, + "loss": 0.568, + "step": 12303 + }, + { + "epoch": 3.2662949688039293, + "grad_norm": 0.46553783889200445, + "learning_rate": 2.167224335351662e-06, + "loss": 0.5301, + "step": 12304 + }, + { + "epoch": 3.266560467277313, + "grad_norm": 0.45729653131524617, + "learning_rate": 2.1668783174876233e-06, + "loss": 0.5335, + "step": 12305 + }, + { + "epoch": 3.266825965750697, + "grad_norm": 0.4561360195630808, + "learning_rate": 2.1665323061202736e-06, + "loss": 0.5668, + "step": 12306 + }, + { + "epoch": 3.2670914642240807, + "grad_norm": 0.44572420469491425, + "learning_rate": 2.1661863012563607e-06, + "loss": 0.5385, + "step": 12307 + }, + { + "epoch": 3.2673569626974643, + "grad_norm": 0.4574547912527172, + "learning_rate": 2.165840302902632e-06, + "loss": 0.5604, + "step": 12308 + }, + { + "epoch": 3.2676224611708484, + "grad_norm": 0.4789080110011661, + "learning_rate": 2.1654943110658357e-06, + "loss": 0.6085, + "step": 12309 + }, + { + "epoch": 3.267887959644232, + "grad_norm": 0.4543267047960963, + "learning_rate": 2.165148325752719e-06, + "loss": 0.5521, + "step": 12310 + }, + { + "epoch": 3.2681534581176157, + "grad_norm": 0.4496362006975845, + "learning_rate": 2.1648023469700296e-06, + "loss": 0.5577, + "step": 12311 + }, + { + "epoch": 3.268418956591, + "grad_norm": 0.4734540988237096, + "learning_rate": 2.1644563747245157e-06, + "loss": 0.575, + "step": 12312 + }, + { + "epoch": 3.2686844550643834, + "grad_norm": 0.4523182378340119, + "learning_rate": 2.164110409022924e-06, + "loss": 0.5378, + "step": 12313 + }, + { + "epoch": 3.268949953537767, + "grad_norm": 0.44522234767187285, + "learning_rate": 2.163764449872002e-06, + "loss": 0.5616, + "step": 12314 + }, + { + "epoch": 3.269215452011151, + "grad_norm": 0.457241552116792, + "learning_rate": 2.163418497278496e-06, + "loss": 0.5447, + "step": 12315 + }, + { + "epoch": 3.269480950484535, + "grad_norm": 0.4783272653819701, + "learning_rate": 2.163072551249153e-06, + "loss": 0.571, + "step": 12316 + }, + { + "epoch": 3.2697464489579184, + "grad_norm": 0.47933894285094675, + "learning_rate": 2.162726611790721e-06, + "loss": 0.5803, + "step": 12317 + }, + { + "epoch": 3.270011947431302, + "grad_norm": 0.45006221733087765, + "learning_rate": 2.1623806789099453e-06, + "loss": 0.5411, + "step": 12318 + }, + { + "epoch": 3.270277445904686, + "grad_norm": 0.4674729523320809, + "learning_rate": 2.1620347526135734e-06, + "loss": 0.5736, + "step": 12319 + }, + { + "epoch": 3.27054294437807, + "grad_norm": 0.4630387602480421, + "learning_rate": 2.161688832908351e-06, + "loss": 0.5539, + "step": 12320 + }, + { + "epoch": 3.2708084428514534, + "grad_norm": 0.46411430443083534, + "learning_rate": 2.1613429198010246e-06, + "loss": 0.5236, + "step": 12321 + }, + { + "epoch": 3.2710739413248375, + "grad_norm": 0.46004419225889487, + "learning_rate": 2.160997013298341e-06, + "loss": 0.5593, + "step": 12322 + }, + { + "epoch": 3.271339439798221, + "grad_norm": 0.4752256263664418, + "learning_rate": 2.160651113407045e-06, + "loss": 0.5583, + "step": 12323 + }, + { + "epoch": 3.271604938271605, + "grad_norm": 0.46458881658982487, + "learning_rate": 2.160305220133883e-06, + "loss": 0.5604, + "step": 12324 + }, + { + "epoch": 3.271870436744989, + "grad_norm": 0.46593997070643833, + "learning_rate": 2.159959333485602e-06, + "loss": 0.5425, + "step": 12325 + }, + { + "epoch": 3.2721359352183725, + "grad_norm": 0.46545380918963536, + "learning_rate": 2.1596134534689454e-06, + "loss": 0.545, + "step": 12326 + }, + { + "epoch": 3.272401433691756, + "grad_norm": 0.46336840608494917, + "learning_rate": 2.159267580090661e-06, + "loss": 0.5467, + "step": 12327 + }, + { + "epoch": 3.27266693216514, + "grad_norm": 0.4640023370949904, + "learning_rate": 2.158921713357492e-06, + "loss": 0.533, + "step": 12328 + }, + { + "epoch": 3.272932430638524, + "grad_norm": 0.44217133281709387, + "learning_rate": 2.1585758532761848e-06, + "loss": 0.56, + "step": 12329 + }, + { + "epoch": 3.2731979291119075, + "grad_norm": 0.45820796240795947, + "learning_rate": 2.158229999853485e-06, + "loss": 0.5412, + "step": 12330 + }, + { + "epoch": 3.273463427585291, + "grad_norm": 0.45728993190774014, + "learning_rate": 2.1578841530961375e-06, + "loss": 0.5027, + "step": 12331 + }, + { + "epoch": 3.2737289260586753, + "grad_norm": 0.45754961032599484, + "learning_rate": 2.1575383130108857e-06, + "loss": 0.5496, + "step": 12332 + }, + { + "epoch": 3.273994424532059, + "grad_norm": 0.4486056707893225, + "learning_rate": 2.1571924796044755e-06, + "loss": 0.5375, + "step": 12333 + }, + { + "epoch": 3.2742599230054426, + "grad_norm": 0.44641523693761453, + "learning_rate": 2.156846652883651e-06, + "loss": 0.5778, + "step": 12334 + }, + { + "epoch": 3.2745254214788266, + "grad_norm": 0.45246039508430874, + "learning_rate": 2.1565008328551573e-06, + "loss": 0.5506, + "step": 12335 + }, + { + "epoch": 3.2747909199522103, + "grad_norm": 0.44492815619577564, + "learning_rate": 2.156155019525738e-06, + "loss": 0.4808, + "step": 12336 + }, + { + "epoch": 3.275056418425594, + "grad_norm": 0.4732053512559555, + "learning_rate": 2.1558092129021385e-06, + "loss": 0.5433, + "step": 12337 + }, + { + "epoch": 3.275321916898978, + "grad_norm": 0.43960027974845506, + "learning_rate": 2.1554634129911016e-06, + "loss": 0.5592, + "step": 12338 + }, + { + "epoch": 3.2755874153723616, + "grad_norm": 0.46746031254095655, + "learning_rate": 2.1551176197993717e-06, + "loss": 0.5444, + "step": 12339 + }, + { + "epoch": 3.2758529138457453, + "grad_norm": 0.45197983128030766, + "learning_rate": 2.1547718333336923e-06, + "loss": 0.5453, + "step": 12340 + }, + { + "epoch": 3.2761184123191294, + "grad_norm": 0.4415563336430487, + "learning_rate": 2.1544260536008077e-06, + "loss": 0.5269, + "step": 12341 + }, + { + "epoch": 3.276383910792513, + "grad_norm": 0.4593169665518242, + "learning_rate": 2.154080280607462e-06, + "loss": 0.5413, + "step": 12342 + }, + { + "epoch": 3.2766494092658967, + "grad_norm": 0.4484731636262464, + "learning_rate": 2.1537345143603967e-06, + "loss": 0.4824, + "step": 12343 + }, + { + "epoch": 3.2769149077392807, + "grad_norm": 0.455571153332722, + "learning_rate": 2.1533887548663564e-06, + "loss": 0.5691, + "step": 12344 + }, + { + "epoch": 3.2771804062126644, + "grad_norm": 0.4404313283931506, + "learning_rate": 2.153043002132084e-06, + "loss": 0.5478, + "step": 12345 + }, + { + "epoch": 3.277445904686048, + "grad_norm": 0.48317361280282894, + "learning_rate": 2.1526972561643222e-06, + "loss": 0.5583, + "step": 12346 + }, + { + "epoch": 3.2777114031594317, + "grad_norm": 0.4410181208477213, + "learning_rate": 2.1523515169698144e-06, + "loss": 0.5816, + "step": 12347 + }, + { + "epoch": 3.2779769016328157, + "grad_norm": 0.4576468337877864, + "learning_rate": 2.1520057845553048e-06, + "loss": 0.5535, + "step": 12348 + }, + { + "epoch": 3.2782424001061994, + "grad_norm": 0.4434151125089868, + "learning_rate": 2.151660058927533e-06, + "loss": 0.5479, + "step": 12349 + }, + { + "epoch": 3.278507898579583, + "grad_norm": 0.44108424075470404, + "learning_rate": 2.151314340093243e-06, + "loss": 0.566, + "step": 12350 + }, + { + "epoch": 3.278773397052967, + "grad_norm": 0.4632574225230179, + "learning_rate": 2.1509686280591766e-06, + "loss": 0.5614, + "step": 12351 + }, + { + "epoch": 3.2790388955263507, + "grad_norm": 0.46406939028470656, + "learning_rate": 2.1506229228320775e-06, + "loss": 0.55, + "step": 12352 + }, + { + "epoch": 3.2793043939997344, + "grad_norm": 0.46178228423300816, + "learning_rate": 2.1502772244186862e-06, + "loss": 0.5917, + "step": 12353 + }, + { + "epoch": 3.279569892473118, + "grad_norm": 0.47012841625942065, + "learning_rate": 2.1499315328257457e-06, + "loss": 0.564, + "step": 12354 + }, + { + "epoch": 3.279835390946502, + "grad_norm": 0.47144097405157837, + "learning_rate": 2.1495858480599975e-06, + "loss": 0.5553, + "step": 12355 + }, + { + "epoch": 3.2801008894198858, + "grad_norm": 0.46739273924940306, + "learning_rate": 2.1492401701281827e-06, + "loss": 0.5712, + "step": 12356 + }, + { + "epoch": 3.2803663878932694, + "grad_norm": 0.4535229787918185, + "learning_rate": 2.1488944990370436e-06, + "loss": 0.509, + "step": 12357 + }, + { + "epoch": 3.2806318863666535, + "grad_norm": 0.4518493623382024, + "learning_rate": 2.1485488347933215e-06, + "loss": 0.5353, + "step": 12358 + }, + { + "epoch": 3.280897384840037, + "grad_norm": 0.468178091379063, + "learning_rate": 2.1482031774037573e-06, + "loss": 0.5526, + "step": 12359 + }, + { + "epoch": 3.2811628833134208, + "grad_norm": 0.4765956333059461, + "learning_rate": 2.147857526875094e-06, + "loss": 0.5643, + "step": 12360 + }, + { + "epoch": 3.281428381786805, + "grad_norm": 0.4496352903577852, + "learning_rate": 2.1475118832140696e-06, + "loss": 0.5364, + "step": 12361 + }, + { + "epoch": 3.2816938802601885, + "grad_norm": 0.44899248191256785, + "learning_rate": 2.147166246427427e-06, + "loss": 0.5069, + "step": 12362 + }, + { + "epoch": 3.281959378733572, + "grad_norm": 0.44507611563600624, + "learning_rate": 2.146820616521906e-06, + "loss": 0.5579, + "step": 12363 + }, + { + "epoch": 3.282224877206956, + "grad_norm": 0.45456431956357113, + "learning_rate": 2.146474993504248e-06, + "loss": 0.5426, + "step": 12364 + }, + { + "epoch": 3.28249037568034, + "grad_norm": 0.4635750203014093, + "learning_rate": 2.146129377381193e-06, + "loss": 0.5378, + "step": 12365 + }, + { + "epoch": 3.2827558741537235, + "grad_norm": 0.46148378318825967, + "learning_rate": 2.145783768159482e-06, + "loss": 0.5935, + "step": 12366 + }, + { + "epoch": 3.2830213726271076, + "grad_norm": 0.45021015544327986, + "learning_rate": 2.145438165845855e-06, + "loss": 0.4956, + "step": 12367 + }, + { + "epoch": 3.2832868711004912, + "grad_norm": 0.4580421117606796, + "learning_rate": 2.145092570447051e-06, + "loss": 0.5642, + "step": 12368 + }, + { + "epoch": 3.283552369573875, + "grad_norm": 0.46712077048895817, + "learning_rate": 2.1447469819698113e-06, + "loss": 0.5637, + "step": 12369 + }, + { + "epoch": 3.283817868047259, + "grad_norm": 0.4573299099565559, + "learning_rate": 2.144401400420875e-06, + "loss": 0.5648, + "step": 12370 + }, + { + "epoch": 3.2840833665206426, + "grad_norm": 0.46309959404780165, + "learning_rate": 2.1440558258069828e-06, + "loss": 0.5246, + "step": 12371 + }, + { + "epoch": 3.2843488649940262, + "grad_norm": 0.4608769272595653, + "learning_rate": 2.1437102581348727e-06, + "loss": 0.5532, + "step": 12372 + }, + { + "epoch": 3.28461436346741, + "grad_norm": 0.4266641255972511, + "learning_rate": 2.1433646974112856e-06, + "loss": 0.5374, + "step": 12373 + }, + { + "epoch": 3.284879861940794, + "grad_norm": 0.4676102793528825, + "learning_rate": 2.1430191436429596e-06, + "loss": 0.5509, + "step": 12374 + }, + { + "epoch": 3.2851453604141776, + "grad_norm": 0.4521447846602556, + "learning_rate": 2.142673596836634e-06, + "loss": 0.5623, + "step": 12375 + }, + { + "epoch": 3.2854108588875612, + "grad_norm": 0.4696830646397409, + "learning_rate": 2.1423280569990486e-06, + "loss": 0.5313, + "step": 12376 + }, + { + "epoch": 3.2856763573609453, + "grad_norm": 0.46817297306686456, + "learning_rate": 2.141982524136943e-06, + "loss": 0.5511, + "step": 12377 + }, + { + "epoch": 3.285941855834329, + "grad_norm": 0.46625064409427197, + "learning_rate": 2.1416369982570535e-06, + "loss": 0.5161, + "step": 12378 + }, + { + "epoch": 3.2862073543077126, + "grad_norm": 0.4470850947765853, + "learning_rate": 2.14129147936612e-06, + "loss": 0.5119, + "step": 12379 + }, + { + "epoch": 3.2864728527810967, + "grad_norm": 0.45845483365353, + "learning_rate": 2.1409459674708805e-06, + "loss": 0.5653, + "step": 12380 + }, + { + "epoch": 3.2867383512544803, + "grad_norm": 0.463464445182446, + "learning_rate": 2.1406004625780742e-06, + "loss": 0.5326, + "step": 12381 + }, + { + "epoch": 3.287003849727864, + "grad_norm": 0.47213776118015705, + "learning_rate": 2.140254964694439e-06, + "loss": 0.556, + "step": 12382 + }, + { + "epoch": 3.2872693482012476, + "grad_norm": 0.4557688333004908, + "learning_rate": 2.139909473826713e-06, + "loss": 0.5626, + "step": 12383 + }, + { + "epoch": 3.2875348466746317, + "grad_norm": 0.4658287046462975, + "learning_rate": 2.139563989981633e-06, + "loss": 0.5272, + "step": 12384 + }, + { + "epoch": 3.2878003451480153, + "grad_norm": 0.45081826173860234, + "learning_rate": 2.1392185131659388e-06, + "loss": 0.5702, + "step": 12385 + }, + { + "epoch": 3.288065843621399, + "grad_norm": 0.4639014895647045, + "learning_rate": 2.138873043386366e-06, + "loss": 0.5292, + "step": 12386 + }, + { + "epoch": 3.288331342094783, + "grad_norm": 0.45025599929965054, + "learning_rate": 2.1385275806496536e-06, + "loss": 0.5231, + "step": 12387 + }, + { + "epoch": 3.2885968405681667, + "grad_norm": 0.4619551742318092, + "learning_rate": 2.1381821249625383e-06, + "loss": 0.5505, + "step": 12388 + }, + { + "epoch": 3.2888623390415503, + "grad_norm": 0.47476434652582916, + "learning_rate": 2.137836676331758e-06, + "loss": 0.5299, + "step": 12389 + }, + { + "epoch": 3.2891278375149344, + "grad_norm": 0.45406518245314403, + "learning_rate": 2.1374912347640487e-06, + "loss": 0.5534, + "step": 12390 + }, + { + "epoch": 3.289393335988318, + "grad_norm": 0.46029059240639586, + "learning_rate": 2.137145800266148e-06, + "loss": 0.5172, + "step": 12391 + }, + { + "epoch": 3.2896588344617017, + "grad_norm": 0.47365373366549757, + "learning_rate": 2.136800372844792e-06, + "loss": 0.5642, + "step": 12392 + }, + { + "epoch": 3.289924332935086, + "grad_norm": 0.4468839273599769, + "learning_rate": 2.136454952506718e-06, + "loss": 0.5498, + "step": 12393 + }, + { + "epoch": 3.2901898314084694, + "grad_norm": 0.44770564727568374, + "learning_rate": 2.136109539258664e-06, + "loss": 0.5434, + "step": 12394 + }, + { + "epoch": 3.290455329881853, + "grad_norm": 0.47233595705624315, + "learning_rate": 2.1357641331073637e-06, + "loss": 0.5676, + "step": 12395 + }, + { + "epoch": 3.290720828355237, + "grad_norm": 0.475650800074712, + "learning_rate": 2.1354187340595548e-06, + "loss": 0.5907, + "step": 12396 + }, + { + "epoch": 3.290986326828621, + "grad_norm": 0.46681056752671507, + "learning_rate": 2.1350733421219733e-06, + "loss": 0.5241, + "step": 12397 + }, + { + "epoch": 3.2912518253020044, + "grad_norm": 0.4481327864639219, + "learning_rate": 2.134727957301355e-06, + "loss": 0.5539, + "step": 12398 + }, + { + "epoch": 3.2915173237753885, + "grad_norm": 0.4669374564227783, + "learning_rate": 2.134382579604436e-06, + "loss": 0.5578, + "step": 12399 + }, + { + "epoch": 3.291782822248772, + "grad_norm": 0.4504344209920806, + "learning_rate": 2.1340372090379517e-06, + "loss": 0.5469, + "step": 12400 + }, + { + "epoch": 3.292048320722156, + "grad_norm": 0.4589140818451969, + "learning_rate": 2.133691845608638e-06, + "loss": 0.5457, + "step": 12401 + }, + { + "epoch": 3.2923138191955394, + "grad_norm": 0.4624496461005644, + "learning_rate": 2.1333464893232303e-06, + "loss": 0.5519, + "step": 12402 + }, + { + "epoch": 3.2925793176689235, + "grad_norm": 0.44704220602180317, + "learning_rate": 2.1330011401884636e-06, + "loss": 0.5732, + "step": 12403 + }, + { + "epoch": 3.292844816142307, + "grad_norm": 0.46443935285081467, + "learning_rate": 2.1326557982110733e-06, + "loss": 0.5525, + "step": 12404 + }, + { + "epoch": 3.293110314615691, + "grad_norm": 0.4535438882521344, + "learning_rate": 2.132310463397794e-06, + "loss": 0.5578, + "step": 12405 + }, + { + "epoch": 3.293375813089075, + "grad_norm": 0.4606368317492769, + "learning_rate": 2.1319651357553623e-06, + "loss": 0.5499, + "step": 12406 + }, + { + "epoch": 3.2936413115624585, + "grad_norm": 0.4572589082842307, + "learning_rate": 2.1316198152905103e-06, + "loss": 0.5517, + "step": 12407 + }, + { + "epoch": 3.293906810035842, + "grad_norm": 0.46150367572548606, + "learning_rate": 2.131274502009974e-06, + "loss": 0.5591, + "step": 12408 + }, + { + "epoch": 3.2941723085092263, + "grad_norm": 0.4576877179941248, + "learning_rate": 2.130929195920488e-06, + "loss": 0.5428, + "step": 12409 + }, + { + "epoch": 3.29443780698261, + "grad_norm": 0.4465836768067627, + "learning_rate": 2.130583897028786e-06, + "loss": 0.5316, + "step": 12410 + }, + { + "epoch": 3.2947033054559935, + "grad_norm": 0.46352521000381525, + "learning_rate": 2.1302386053416026e-06, + "loss": 0.5043, + "step": 12411 + }, + { + "epoch": 3.294968803929377, + "grad_norm": 0.45032463938605893, + "learning_rate": 2.129893320865672e-06, + "loss": 0.5519, + "step": 12412 + }, + { + "epoch": 3.2952343024027613, + "grad_norm": 0.4709078747874101, + "learning_rate": 2.1295480436077275e-06, + "loss": 0.5451, + "step": 12413 + }, + { + "epoch": 3.295499800876145, + "grad_norm": 0.4457063021855977, + "learning_rate": 2.129202773574503e-06, + "loss": 0.5635, + "step": 12414 + }, + { + "epoch": 3.2957652993495286, + "grad_norm": 0.4616322554486769, + "learning_rate": 2.128857510772733e-06, + "loss": 0.5392, + "step": 12415 + }, + { + "epoch": 3.2960307978229126, + "grad_norm": 0.45572101045975255, + "learning_rate": 2.128512255209149e-06, + "loss": 0.5647, + "step": 12416 + }, + { + "epoch": 3.2962962962962963, + "grad_norm": 0.4604240898712047, + "learning_rate": 2.1281670068904868e-06, + "loss": 0.5801, + "step": 12417 + }, + { + "epoch": 3.29656179476968, + "grad_norm": 0.4743002024590024, + "learning_rate": 2.127821765823478e-06, + "loss": 0.529, + "step": 12418 + }, + { + "epoch": 3.296827293243064, + "grad_norm": 0.4813014068520655, + "learning_rate": 2.127476532014856e-06, + "loss": 0.5521, + "step": 12419 + }, + { + "epoch": 3.2970927917164476, + "grad_norm": 0.4561602439983453, + "learning_rate": 2.1271313054713534e-06, + "loss": 0.5148, + "step": 12420 + }, + { + "epoch": 3.2973582901898313, + "grad_norm": 0.4517895152320192, + "learning_rate": 2.1267860861997035e-06, + "loss": 0.5633, + "step": 12421 + }, + { + "epoch": 3.2976237886632154, + "grad_norm": 0.4658096977668192, + "learning_rate": 2.1264408742066387e-06, + "loss": 0.5595, + "step": 12422 + }, + { + "epoch": 3.297889287136599, + "grad_norm": 0.44924424446352795, + "learning_rate": 2.1260956694988924e-06, + "loss": 0.5867, + "step": 12423 + }, + { + "epoch": 3.2981547856099827, + "grad_norm": 0.4650390134535376, + "learning_rate": 2.125750472083195e-06, + "loss": 0.5902, + "step": 12424 + }, + { + "epoch": 3.2984202840833667, + "grad_norm": 0.47204397022329564, + "learning_rate": 2.12540528196628e-06, + "loss": 0.5762, + "step": 12425 + }, + { + "epoch": 3.2986857825567504, + "grad_norm": 0.45965819358481436, + "learning_rate": 2.125060099154879e-06, + "loss": 0.5714, + "step": 12426 + }, + { + "epoch": 3.298951281030134, + "grad_norm": 0.4755868604587048, + "learning_rate": 2.124714923655724e-06, + "loss": 0.5714, + "step": 12427 + }, + { + "epoch": 3.299216779503518, + "grad_norm": 0.451780964745843, + "learning_rate": 2.1243697554755468e-06, + "loss": 0.5461, + "step": 12428 + }, + { + "epoch": 3.2994822779769017, + "grad_norm": 0.46427767702418216, + "learning_rate": 2.1240245946210797e-06, + "loss": 0.5728, + "step": 12429 + }, + { + "epoch": 3.2997477764502854, + "grad_norm": 0.4610754211575944, + "learning_rate": 2.123679441099053e-06, + "loss": 0.592, + "step": 12430 + }, + { + "epoch": 3.300013274923669, + "grad_norm": 0.4417926175054234, + "learning_rate": 2.1233342949161984e-06, + "loss": 0.522, + "step": 12431 + }, + { + "epoch": 3.300278773397053, + "grad_norm": 0.4680247748595976, + "learning_rate": 2.1229891560792474e-06, + "loss": 0.5727, + "step": 12432 + }, + { + "epoch": 3.3005442718704368, + "grad_norm": 0.46310851191953706, + "learning_rate": 2.122644024594931e-06, + "loss": 0.5525, + "step": 12433 + }, + { + "epoch": 3.3008097703438204, + "grad_norm": 0.4560786977925917, + "learning_rate": 2.1222989004699803e-06, + "loss": 0.5463, + "step": 12434 + }, + { + "epoch": 3.3010752688172045, + "grad_norm": 0.4661549219068018, + "learning_rate": 2.1219537837111264e-06, + "loss": 0.5522, + "step": 12435 + }, + { + "epoch": 3.301340767290588, + "grad_norm": 0.4619028587425063, + "learning_rate": 2.1216086743250985e-06, + "loss": 0.5666, + "step": 12436 + }, + { + "epoch": 3.3016062657639718, + "grad_norm": 0.4516607422828077, + "learning_rate": 2.1212635723186274e-06, + "loss": 0.5122, + "step": 12437 + }, + { + "epoch": 3.3018717642373554, + "grad_norm": 0.46857206034300036, + "learning_rate": 2.120918477698444e-06, + "loss": 0.584, + "step": 12438 + }, + { + "epoch": 3.3021372627107395, + "grad_norm": 0.44527405967549133, + "learning_rate": 2.1205733904712793e-06, + "loss": 0.4977, + "step": 12439 + }, + { + "epoch": 3.302402761184123, + "grad_norm": 0.4507144902397387, + "learning_rate": 2.1202283106438632e-06, + "loss": 0.51, + "step": 12440 + }, + { + "epoch": 3.3026682596575068, + "grad_norm": 0.46250077713026594, + "learning_rate": 2.1198832382229236e-06, + "loss": 0.5389, + "step": 12441 + }, + { + "epoch": 3.302933758130891, + "grad_norm": 0.4359955068355555, + "learning_rate": 2.1195381732151918e-06, + "loss": 0.5535, + "step": 12442 + }, + { + "epoch": 3.3031992566042745, + "grad_norm": 0.4554354739491032, + "learning_rate": 2.119193115627397e-06, + "loss": 0.522, + "step": 12443 + }, + { + "epoch": 3.303464755077658, + "grad_norm": 0.47096946804051365, + "learning_rate": 2.118848065466269e-06, + "loss": 0.5244, + "step": 12444 + }, + { + "epoch": 3.303730253551042, + "grad_norm": 0.4601930551412257, + "learning_rate": 2.118503022738537e-06, + "loss": 0.558, + "step": 12445 + }, + { + "epoch": 3.303995752024426, + "grad_norm": 0.4680848282067587, + "learning_rate": 2.11815798745093e-06, + "loss": 0.5338, + "step": 12446 + }, + { + "epoch": 3.3042612504978095, + "grad_norm": 0.4636880869855386, + "learning_rate": 2.1178129596101775e-06, + "loss": 0.54, + "step": 12447 + }, + { + "epoch": 3.3045267489711936, + "grad_norm": 0.4571575295334612, + "learning_rate": 2.1174679392230076e-06, + "loss": 0.5426, + "step": 12448 + }, + { + "epoch": 3.3047922474445772, + "grad_norm": 0.45945410776255485, + "learning_rate": 2.1171229262961494e-06, + "loss": 0.567, + "step": 12449 + }, + { + "epoch": 3.305057745917961, + "grad_norm": 0.46038683212836085, + "learning_rate": 2.1167779208363317e-06, + "loss": 0.5372, + "step": 12450 + }, + { + "epoch": 3.305323244391345, + "grad_norm": 0.4410129222152262, + "learning_rate": 2.116432922850283e-06, + "loss": 0.5161, + "step": 12451 + }, + { + "epoch": 3.3055887428647286, + "grad_norm": 0.4562754830833976, + "learning_rate": 2.116087932344732e-06, + "loss": 0.5437, + "step": 12452 + }, + { + "epoch": 3.3058542413381122, + "grad_norm": 0.4714121654480041, + "learning_rate": 2.1157429493264056e-06, + "loss": 0.5461, + "step": 12453 + }, + { + "epoch": 3.3061197398114963, + "grad_norm": 0.46305007700456285, + "learning_rate": 2.1153979738020325e-06, + "loss": 0.5773, + "step": 12454 + }, + { + "epoch": 3.30638523828488, + "grad_norm": 0.4636551694201425, + "learning_rate": 2.1150530057783407e-06, + "loss": 0.5503, + "step": 12455 + }, + { + "epoch": 3.3066507367582636, + "grad_norm": 0.4600401005916624, + "learning_rate": 2.1147080452620573e-06, + "loss": 0.5429, + "step": 12456 + }, + { + "epoch": 3.3069162352316472, + "grad_norm": 0.44521919634042084, + "learning_rate": 2.1143630922599108e-06, + "loss": 0.5447, + "step": 12457 + }, + { + "epoch": 3.3071817337050313, + "grad_norm": 0.44575622568736406, + "learning_rate": 2.114018146778629e-06, + "loss": 0.5122, + "step": 12458 + }, + { + "epoch": 3.307447232178415, + "grad_norm": 0.448285534633307, + "learning_rate": 2.1136732088249372e-06, + "loss": 0.5424, + "step": 12459 + }, + { + "epoch": 3.3077127306517986, + "grad_norm": 0.46119896375268676, + "learning_rate": 2.113328278405564e-06, + "loss": 0.5752, + "step": 12460 + }, + { + "epoch": 3.3079782291251827, + "grad_norm": 0.47090997878443963, + "learning_rate": 2.1129833555272363e-06, + "loss": 0.5579, + "step": 12461 + }, + { + "epoch": 3.3082437275985663, + "grad_norm": 0.4596055827363926, + "learning_rate": 2.1126384401966806e-06, + "loss": 0.5604, + "step": 12462 + }, + { + "epoch": 3.30850922607195, + "grad_norm": 0.44433855933873495, + "learning_rate": 2.1122935324206238e-06, + "loss": 0.4976, + "step": 12463 + }, + { + "epoch": 3.308774724545334, + "grad_norm": 0.4598679614786918, + "learning_rate": 2.111948632205793e-06, + "loss": 0.5276, + "step": 12464 + }, + { + "epoch": 3.3090402230187177, + "grad_norm": 0.45450933980777275, + "learning_rate": 2.111603739558913e-06, + "loss": 0.5389, + "step": 12465 + }, + { + "epoch": 3.3093057214921013, + "grad_norm": 0.45739907359601684, + "learning_rate": 2.1112588544867115e-06, + "loss": 0.5303, + "step": 12466 + }, + { + "epoch": 3.309571219965485, + "grad_norm": 0.459642658666993, + "learning_rate": 2.110913976995914e-06, + "loss": 0.4873, + "step": 12467 + }, + { + "epoch": 3.309836718438869, + "grad_norm": 0.46011984312522497, + "learning_rate": 2.1105691070932465e-06, + "loss": 0.5357, + "step": 12468 + }, + { + "epoch": 3.3101022169122527, + "grad_norm": 0.46828927507083296, + "learning_rate": 2.110224244785436e-06, + "loss": 0.5498, + "step": 12469 + }, + { + "epoch": 3.3103677153856363, + "grad_norm": 0.45531443583831677, + "learning_rate": 2.109879390079206e-06, + "loss": 0.5362, + "step": 12470 + }, + { + "epoch": 3.3106332138590204, + "grad_norm": 0.4410326958464635, + "learning_rate": 2.1095345429812835e-06, + "loss": 0.5203, + "step": 12471 + }, + { + "epoch": 3.310898712332404, + "grad_norm": 0.4405471853967866, + "learning_rate": 2.1091897034983926e-06, + "loss": 0.5727, + "step": 12472 + }, + { + "epoch": 3.3111642108057877, + "grad_norm": 0.46935547032205094, + "learning_rate": 2.10884487163726e-06, + "loss": 0.5242, + "step": 12473 + }, + { + "epoch": 3.311429709279172, + "grad_norm": 0.44328272263231056, + "learning_rate": 2.10850004740461e-06, + "loss": 0.5545, + "step": 12474 + }, + { + "epoch": 3.3116952077525554, + "grad_norm": 0.45478070066380205, + "learning_rate": 2.1081552308071678e-06, + "loss": 0.5245, + "step": 12475 + }, + { + "epoch": 3.311960706225939, + "grad_norm": 0.45699553552185684, + "learning_rate": 2.1078104218516577e-06, + "loss": 0.552, + "step": 12476 + }, + { + "epoch": 3.312226204699323, + "grad_norm": 0.4458374547274046, + "learning_rate": 2.1074656205448048e-06, + "loss": 0.5542, + "step": 12477 + }, + { + "epoch": 3.312491703172707, + "grad_norm": 0.4638519657956804, + "learning_rate": 2.1071208268933335e-06, + "loss": 0.5587, + "step": 12478 + }, + { + "epoch": 3.3127572016460904, + "grad_norm": 0.4642410948923572, + "learning_rate": 2.1067760409039678e-06, + "loss": 0.5879, + "step": 12479 + }, + { + "epoch": 3.3130227001194745, + "grad_norm": 0.45855673658755025, + "learning_rate": 2.106431262583432e-06, + "loss": 0.5178, + "step": 12480 + }, + { + "epoch": 3.313288198592858, + "grad_norm": 0.4560886432338761, + "learning_rate": 2.1060864919384514e-06, + "loss": 0.5511, + "step": 12481 + }, + { + "epoch": 3.313553697066242, + "grad_norm": 0.4479522484387603, + "learning_rate": 2.1057417289757474e-06, + "loss": 0.538, + "step": 12482 + }, + { + "epoch": 3.313819195539626, + "grad_norm": 0.46573541513302824, + "learning_rate": 2.1053969737020448e-06, + "loss": 0.5702, + "step": 12483 + }, + { + "epoch": 3.3140846940130095, + "grad_norm": 0.4611913376015197, + "learning_rate": 2.1050522261240676e-06, + "loss": 0.5117, + "step": 12484 + }, + { + "epoch": 3.314350192486393, + "grad_norm": 0.4489390341304483, + "learning_rate": 2.1047074862485387e-06, + "loss": 0.5296, + "step": 12485 + }, + { + "epoch": 3.314615690959777, + "grad_norm": 0.4607158986637834, + "learning_rate": 2.1043627540821824e-06, + "loss": 0.5705, + "step": 12486 + }, + { + "epoch": 3.314881189433161, + "grad_norm": 0.4631355032749449, + "learning_rate": 2.1040180296317216e-06, + "loss": 0.5244, + "step": 12487 + }, + { + "epoch": 3.3151466879065445, + "grad_norm": 0.4545705844970469, + "learning_rate": 2.1036733129038782e-06, + "loss": 0.5517, + "step": 12488 + }, + { + "epoch": 3.315412186379928, + "grad_norm": 0.4590455565072322, + "learning_rate": 2.1033286039053752e-06, + "loss": 0.5125, + "step": 12489 + }, + { + "epoch": 3.3156776848533123, + "grad_norm": 0.4598888840084843, + "learning_rate": 2.1029839026429356e-06, + "loss": 0.5424, + "step": 12490 + }, + { + "epoch": 3.315943183326696, + "grad_norm": 0.45589539434615006, + "learning_rate": 2.102639209123282e-06, + "loss": 0.5493, + "step": 12491 + }, + { + "epoch": 3.3162086818000795, + "grad_norm": 0.46140689487330533, + "learning_rate": 2.1022945233531373e-06, + "loss": 0.5411, + "step": 12492 + }, + { + "epoch": 3.316474180273463, + "grad_norm": 0.4649259303458875, + "learning_rate": 2.1019498453392227e-06, + "loss": 0.587, + "step": 12493 + }, + { + "epoch": 3.3167396787468473, + "grad_norm": 0.45352260855505916, + "learning_rate": 2.1016051750882606e-06, + "loss": 0.5435, + "step": 12494 + }, + { + "epoch": 3.317005177220231, + "grad_norm": 0.45600065120720623, + "learning_rate": 2.101260512606973e-06, + "loss": 0.5267, + "step": 12495 + }, + { + "epoch": 3.3172706756936146, + "grad_norm": 0.4521577010053124, + "learning_rate": 2.1009158579020815e-06, + "loss": 0.5766, + "step": 12496 + }, + { + "epoch": 3.3175361741669986, + "grad_norm": 0.4579730024523372, + "learning_rate": 2.1005712109803083e-06, + "loss": 0.5831, + "step": 12497 + }, + { + "epoch": 3.3178016726403823, + "grad_norm": 0.45822623507432075, + "learning_rate": 2.1002265718483757e-06, + "loss": 0.5538, + "step": 12498 + }, + { + "epoch": 3.318067171113766, + "grad_norm": 0.44890247009933903, + "learning_rate": 2.0998819405130023e-06, + "loss": 0.5535, + "step": 12499 + }, + { + "epoch": 3.31833266958715, + "grad_norm": 0.4619831177424958, + "learning_rate": 2.099537316980911e-06, + "loss": 0.5656, + "step": 12500 + }, + { + "epoch": 3.3185981680605336, + "grad_norm": 0.4490152863751281, + "learning_rate": 2.099192701258822e-06, + "loss": 0.5364, + "step": 12501 + }, + { + "epoch": 3.3188636665339173, + "grad_norm": 0.47080043520137294, + "learning_rate": 2.098848093353457e-06, + "loss": 0.5707, + "step": 12502 + }, + { + "epoch": 3.3191291650073014, + "grad_norm": 0.457352177522771, + "learning_rate": 2.098503493271536e-06, + "loss": 0.5744, + "step": 12503 + }, + { + "epoch": 3.319394663480685, + "grad_norm": 0.4577390034326609, + "learning_rate": 2.098158901019781e-06, + "loss": 0.5146, + "step": 12504 + }, + { + "epoch": 3.3196601619540687, + "grad_norm": 0.46068076476183023, + "learning_rate": 2.0978143166049106e-06, + "loss": 0.5452, + "step": 12505 + }, + { + "epoch": 3.3199256604274527, + "grad_norm": 0.46622527022841176, + "learning_rate": 2.0974697400336456e-06, + "loss": 0.5522, + "step": 12506 + }, + { + "epoch": 3.3201911589008364, + "grad_norm": 0.45328441817231063, + "learning_rate": 2.0971251713127065e-06, + "loss": 0.5093, + "step": 12507 + }, + { + "epoch": 3.32045665737422, + "grad_norm": 0.43375240043693214, + "learning_rate": 2.096780610448813e-06, + "loss": 0.5379, + "step": 12508 + }, + { + "epoch": 3.320722155847604, + "grad_norm": 0.4563664377394157, + "learning_rate": 2.0964360574486846e-06, + "loss": 0.5857, + "step": 12509 + }, + { + "epoch": 3.3209876543209877, + "grad_norm": 0.46269890887163606, + "learning_rate": 2.0960915123190416e-06, + "loss": 0.5665, + "step": 12510 + }, + { + "epoch": 3.3212531527943714, + "grad_norm": 0.4686226873913855, + "learning_rate": 2.0957469750666026e-06, + "loss": 0.5694, + "step": 12511 + }, + { + "epoch": 3.321518651267755, + "grad_norm": 0.4667222560680663, + "learning_rate": 2.0954024456980875e-06, + "loss": 0.5363, + "step": 12512 + }, + { + "epoch": 3.321784149741139, + "grad_norm": 0.45605912239926993, + "learning_rate": 2.0950579242202153e-06, + "loss": 0.5235, + "step": 12513 + }, + { + "epoch": 3.3220496482145228, + "grad_norm": 0.4573541374565224, + "learning_rate": 2.0947134106397053e-06, + "loss": 0.5697, + "step": 12514 + }, + { + "epoch": 3.3223151466879064, + "grad_norm": 0.45532022688939805, + "learning_rate": 2.094368904963277e-06, + "loss": 0.5369, + "step": 12515 + }, + { + "epoch": 3.3225806451612905, + "grad_norm": 0.45840015082414337, + "learning_rate": 2.0940244071976466e-06, + "loss": 0.5725, + "step": 12516 + }, + { + "epoch": 3.322846143634674, + "grad_norm": 0.45857147291040484, + "learning_rate": 2.0936799173495348e-06, + "loss": 0.5534, + "step": 12517 + }, + { + "epoch": 3.3231116421080578, + "grad_norm": 0.46554016390189934, + "learning_rate": 2.0933354354256593e-06, + "loss": 0.5241, + "step": 12518 + }, + { + "epoch": 3.323377140581442, + "grad_norm": 0.447877572767853, + "learning_rate": 2.092990961432739e-06, + "loss": 0.5598, + "step": 12519 + }, + { + "epoch": 3.3236426390548255, + "grad_norm": 0.4553507984115381, + "learning_rate": 2.0926464953774907e-06, + "loss": 0.5391, + "step": 12520 + }, + { + "epoch": 3.323908137528209, + "grad_norm": 0.45434112405971894, + "learning_rate": 2.092302037266634e-06, + "loss": 0.5417, + "step": 12521 + }, + { + "epoch": 3.3241736360015928, + "grad_norm": 0.461418455961987, + "learning_rate": 2.0919575871068854e-06, + "loss": 0.5692, + "step": 12522 + }, + { + "epoch": 3.324439134474977, + "grad_norm": 0.4511552933044096, + "learning_rate": 2.0916131449049626e-06, + "loss": 0.5579, + "step": 12523 + }, + { + "epoch": 3.3247046329483605, + "grad_norm": 0.4610784963791445, + "learning_rate": 2.0912687106675835e-06, + "loss": 0.5428, + "step": 12524 + }, + { + "epoch": 3.324970131421744, + "grad_norm": 0.433307369833326, + "learning_rate": 2.0909242844014654e-06, + "loss": 0.4747, + "step": 12525 + }, + { + "epoch": 3.325235629895128, + "grad_norm": 0.43909895900227575, + "learning_rate": 2.0905798661133253e-06, + "loss": 0.5294, + "step": 12526 + }, + { + "epoch": 3.325501128368512, + "grad_norm": 0.4649584388562537, + "learning_rate": 2.0902354558098815e-06, + "loss": 0.5803, + "step": 12527 + }, + { + "epoch": 3.3257666268418955, + "grad_norm": 0.46429632614289235, + "learning_rate": 2.0898910534978476e-06, + "loss": 0.5618, + "step": 12528 + }, + { + "epoch": 3.3260321253152796, + "grad_norm": 0.47489956001679057, + "learning_rate": 2.089546659183943e-06, + "loss": 0.5304, + "step": 12529 + }, + { + "epoch": 3.3262976237886632, + "grad_norm": 0.45379297064326735, + "learning_rate": 2.0892022728748834e-06, + "loss": 0.5572, + "step": 12530 + }, + { + "epoch": 3.326563122262047, + "grad_norm": 0.46281803927806087, + "learning_rate": 2.088857894577385e-06, + "loss": 0.5619, + "step": 12531 + }, + { + "epoch": 3.326828620735431, + "grad_norm": 0.4532160774894718, + "learning_rate": 2.088513524298165e-06, + "loss": 0.5149, + "step": 12532 + }, + { + "epoch": 3.3270941192088146, + "grad_norm": 0.47249267368046766, + "learning_rate": 2.0881691620439397e-06, + "loss": 0.5494, + "step": 12533 + }, + { + "epoch": 3.3273596176821982, + "grad_norm": 0.45049153749463267, + "learning_rate": 2.0878248078214225e-06, + "loss": 0.5449, + "step": 12534 + }, + { + "epoch": 3.3276251161555823, + "grad_norm": 0.46322255618295577, + "learning_rate": 2.0874804616373314e-06, + "loss": 0.5394, + "step": 12535 + }, + { + "epoch": 3.327890614628966, + "grad_norm": 0.44444655621863416, + "learning_rate": 2.087136123498381e-06, + "loss": 0.5783, + "step": 12536 + }, + { + "epoch": 3.3281561131023496, + "grad_norm": 0.47398517757180264, + "learning_rate": 2.086791793411287e-06, + "loss": 0.5352, + "step": 12537 + }, + { + "epoch": 3.3284216115757337, + "grad_norm": 0.4790311055902139, + "learning_rate": 2.0864474713827653e-06, + "loss": 0.518, + "step": 12538 + }, + { + "epoch": 3.3286871100491173, + "grad_norm": 0.46572904239698093, + "learning_rate": 2.0861031574195296e-06, + "loss": 0.5186, + "step": 12539 + }, + { + "epoch": 3.328952608522501, + "grad_norm": 0.4519932846922998, + "learning_rate": 2.085758851528296e-06, + "loss": 0.5623, + "step": 12540 + }, + { + "epoch": 3.3292181069958846, + "grad_norm": 0.46093688790464066, + "learning_rate": 2.085414553715779e-06, + "loss": 0.5302, + "step": 12541 + }, + { + "epoch": 3.3294836054692687, + "grad_norm": 0.4639913452167748, + "learning_rate": 2.0850702639886937e-06, + "loss": 0.5798, + "step": 12542 + }, + { + "epoch": 3.3297491039426523, + "grad_norm": 0.458542296916341, + "learning_rate": 2.0847259823537537e-06, + "loss": 0.5601, + "step": 12543 + }, + { + "epoch": 3.330014602416036, + "grad_norm": 0.4611919718961669, + "learning_rate": 2.084381708817675e-06, + "loss": 0.5498, + "step": 12544 + }, + { + "epoch": 3.33028010088942, + "grad_norm": 0.46957024733982206, + "learning_rate": 2.0840374433871697e-06, + "loss": 0.5104, + "step": 12545 + }, + { + "epoch": 3.3305455993628037, + "grad_norm": 0.4814538441969792, + "learning_rate": 2.0836931860689524e-06, + "loss": 0.5516, + "step": 12546 + }, + { + "epoch": 3.3308110978361873, + "grad_norm": 0.4684573872955521, + "learning_rate": 2.0833489368697373e-06, + "loss": 0.5508, + "step": 12547 + }, + { + "epoch": 3.331076596309571, + "grad_norm": 0.46992997876637577, + "learning_rate": 2.083004695796238e-06, + "loss": 0.5246, + "step": 12548 + }, + { + "epoch": 3.331342094782955, + "grad_norm": 0.4802963158317988, + "learning_rate": 2.082660462855168e-06, + "loss": 0.5452, + "step": 12549 + }, + { + "epoch": 3.3316075932563387, + "grad_norm": 0.46271809428080457, + "learning_rate": 2.082316238053242e-06, + "loss": 0.5644, + "step": 12550 + }, + { + "epoch": 3.3318730917297223, + "grad_norm": 0.45403915028829606, + "learning_rate": 2.081972021397171e-06, + "loss": 0.5604, + "step": 12551 + }, + { + "epoch": 3.3321385902031064, + "grad_norm": 0.45091164578003795, + "learning_rate": 2.0816278128936695e-06, + "loss": 0.5552, + "step": 12552 + }, + { + "epoch": 3.33240408867649, + "grad_norm": 0.4600388462010232, + "learning_rate": 2.0812836125494497e-06, + "loss": 0.5875, + "step": 12553 + }, + { + "epoch": 3.3326695871498737, + "grad_norm": 0.4623470080786759, + "learning_rate": 2.0809394203712246e-06, + "loss": 0.5367, + "step": 12554 + }, + { + "epoch": 3.332935085623258, + "grad_norm": 0.44988545638785415, + "learning_rate": 2.080595236365707e-06, + "loss": 0.5684, + "step": 12555 + }, + { + "epoch": 3.3332005840966414, + "grad_norm": 0.47218987534434853, + "learning_rate": 2.0802510605396097e-06, + "loss": 0.5667, + "step": 12556 + }, + { + "epoch": 3.333466082570025, + "grad_norm": 0.46985412824531037, + "learning_rate": 2.0799068928996436e-06, + "loss": 0.6013, + "step": 12557 + }, + { + "epoch": 3.333731581043409, + "grad_norm": 0.4635316492686305, + "learning_rate": 2.079562733452522e-06, + "loss": 0.5173, + "step": 12558 + }, + { + "epoch": 3.333997079516793, + "grad_norm": 0.4555583645910946, + "learning_rate": 2.0792185822049563e-06, + "loss": 0.5313, + "step": 12559 + }, + { + "epoch": 3.3342625779901764, + "grad_norm": 0.4703939992715734, + "learning_rate": 2.0788744391636585e-06, + "loss": 0.5399, + "step": 12560 + }, + { + "epoch": 3.3345280764635605, + "grad_norm": 0.46322704782253765, + "learning_rate": 2.0785303043353415e-06, + "loss": 0.5564, + "step": 12561 + }, + { + "epoch": 3.334793574936944, + "grad_norm": 0.4601696510479565, + "learning_rate": 2.0781861777267145e-06, + "loss": 0.553, + "step": 12562 + }, + { + "epoch": 3.335059073410328, + "grad_norm": 0.45087305751246654, + "learning_rate": 2.077842059344489e-06, + "loss": 0.5249, + "step": 12563 + }, + { + "epoch": 3.335324571883712, + "grad_norm": 0.4653875480917446, + "learning_rate": 2.077497949195378e-06, + "loss": 0.5416, + "step": 12564 + }, + { + "epoch": 3.3355900703570955, + "grad_norm": 0.4693265173218801, + "learning_rate": 2.0771538472860906e-06, + "loss": 0.5374, + "step": 12565 + }, + { + "epoch": 3.335855568830479, + "grad_norm": 0.4612467910279887, + "learning_rate": 2.076809753623339e-06, + "loss": 0.5907, + "step": 12566 + }, + { + "epoch": 3.336121067303863, + "grad_norm": 0.4643558252177538, + "learning_rate": 2.076465668213834e-06, + "loss": 0.5667, + "step": 12567 + }, + { + "epoch": 3.336386565777247, + "grad_norm": 0.4700801785951776, + "learning_rate": 2.0761215910642847e-06, + "loss": 0.5284, + "step": 12568 + }, + { + "epoch": 3.3366520642506305, + "grad_norm": 0.47276837622266266, + "learning_rate": 2.0757775221814024e-06, + "loss": 0.5406, + "step": 12569 + }, + { + "epoch": 3.336917562724014, + "grad_norm": 0.4653828001244092, + "learning_rate": 2.0754334615718965e-06, + "loss": 0.5761, + "step": 12570 + }, + { + "epoch": 3.3371830611973983, + "grad_norm": 0.4640501182710139, + "learning_rate": 2.0750894092424783e-06, + "loss": 0.5676, + "step": 12571 + }, + { + "epoch": 3.337448559670782, + "grad_norm": 0.4551360914591177, + "learning_rate": 2.0747453651998565e-06, + "loss": 0.5314, + "step": 12572 + }, + { + "epoch": 3.3377140581441656, + "grad_norm": 0.45851887900778243, + "learning_rate": 2.0744013294507427e-06, + "loss": 0.524, + "step": 12573 + }, + { + "epoch": 3.3379795566175496, + "grad_norm": 0.45644004033825764, + "learning_rate": 2.0740573020018432e-06, + "loss": 0.5496, + "step": 12574 + }, + { + "epoch": 3.3382450550909333, + "grad_norm": 0.47957405640277395, + "learning_rate": 2.07371328285987e-06, + "loss": 0.5531, + "step": 12575 + }, + { + "epoch": 3.338510553564317, + "grad_norm": 0.46352746089664215, + "learning_rate": 2.073369272031531e-06, + "loss": 0.5762, + "step": 12576 + }, + { + "epoch": 3.3387760520377006, + "grad_norm": 0.46957871511109023, + "learning_rate": 2.0730252695235366e-06, + "loss": 0.5829, + "step": 12577 + }, + { + "epoch": 3.3390415505110846, + "grad_norm": 0.45905547577794376, + "learning_rate": 2.072681275342594e-06, + "loss": 0.519, + "step": 12578 + }, + { + "epoch": 3.3393070489844683, + "grad_norm": 0.4705870867385918, + "learning_rate": 2.072337289495414e-06, + "loss": 0.5849, + "step": 12579 + }, + { + "epoch": 3.339572547457852, + "grad_norm": 0.4662620438832262, + "learning_rate": 2.0719933119887032e-06, + "loss": 0.5554, + "step": 12580 + }, + { + "epoch": 3.339838045931236, + "grad_norm": 0.4743917837331077, + "learning_rate": 2.071649342829171e-06, + "loss": 0.5838, + "step": 12581 + }, + { + "epoch": 3.3401035444046197, + "grad_norm": 0.4621100641668938, + "learning_rate": 2.071305382023525e-06, + "loss": 0.5826, + "step": 12582 + }, + { + "epoch": 3.3403690428780033, + "grad_norm": 0.46659278903280366, + "learning_rate": 2.0709614295784734e-06, + "loss": 0.5747, + "step": 12583 + }, + { + "epoch": 3.3406345413513874, + "grad_norm": 0.4695482153466633, + "learning_rate": 2.070617485500725e-06, + "loss": 0.5534, + "step": 12584 + }, + { + "epoch": 3.340900039824771, + "grad_norm": 0.4665102211763585, + "learning_rate": 2.0702735497969873e-06, + "loss": 0.54, + "step": 12585 + }, + { + "epoch": 3.3411655382981547, + "grad_norm": 0.447311988563106, + "learning_rate": 2.069929622473967e-06, + "loss": 0.5523, + "step": 12586 + }, + { + "epoch": 3.3414310367715387, + "grad_norm": 0.47468945482066466, + "learning_rate": 2.0695857035383716e-06, + "loss": 0.4991, + "step": 12587 + }, + { + "epoch": 3.3416965352449224, + "grad_norm": 0.4685305937007719, + "learning_rate": 2.069241792996909e-06, + "loss": 0.5696, + "step": 12588 + }, + { + "epoch": 3.341962033718306, + "grad_norm": 0.468934319344743, + "learning_rate": 2.0688978908562863e-06, + "loss": 0.5474, + "step": 12589 + }, + { + "epoch": 3.34222753219169, + "grad_norm": 0.45826752054056613, + "learning_rate": 2.0685539971232114e-06, + "loss": 0.5327, + "step": 12590 + }, + { + "epoch": 3.3424930306650737, + "grad_norm": 0.4519640110125526, + "learning_rate": 2.068210111804389e-06, + "loss": 0.5342, + "step": 12591 + }, + { + "epoch": 3.3427585291384574, + "grad_norm": 0.45240682141334926, + "learning_rate": 2.0678662349065263e-06, + "loss": 0.5556, + "step": 12592 + }, + { + "epoch": 3.3430240276118415, + "grad_norm": 0.4690611911684847, + "learning_rate": 2.06752236643633e-06, + "loss": 0.5498, + "step": 12593 + }, + { + "epoch": 3.343289526085225, + "grad_norm": 0.45762953819575786, + "learning_rate": 2.067178506400507e-06, + "loss": 0.5522, + "step": 12594 + }, + { + "epoch": 3.3435550245586088, + "grad_norm": 0.4599788801632664, + "learning_rate": 2.0668346548057617e-06, + "loss": 0.5209, + "step": 12595 + }, + { + "epoch": 3.3438205230319924, + "grad_norm": 0.4647232001682794, + "learning_rate": 2.066490811658803e-06, + "loss": 0.5506, + "step": 12596 + }, + { + "epoch": 3.3440860215053765, + "grad_norm": 0.4708194205547255, + "learning_rate": 2.0661469769663335e-06, + "loss": 0.5883, + "step": 12597 + }, + { + "epoch": 3.34435151997876, + "grad_norm": 0.4552194989903483, + "learning_rate": 2.0658031507350607e-06, + "loss": 0.5836, + "step": 12598 + }, + { + "epoch": 3.3446170184521438, + "grad_norm": 0.482861944006749, + "learning_rate": 2.0654593329716893e-06, + "loss": 0.5638, + "step": 12599 + }, + { + "epoch": 3.344882516925528, + "grad_norm": 0.4570799635466538, + "learning_rate": 2.065115523682925e-06, + "loss": 0.5326, + "step": 12600 + }, + { + "epoch": 3.3451480153989115, + "grad_norm": 0.45793396524811736, + "learning_rate": 2.0647717228754727e-06, + "loss": 0.5468, + "step": 12601 + }, + { + "epoch": 3.345413513872295, + "grad_norm": 0.4644400156413009, + "learning_rate": 2.064427930556038e-06, + "loss": 0.5302, + "step": 12602 + }, + { + "epoch": 3.3456790123456788, + "grad_norm": 0.45037040767798153, + "learning_rate": 2.0640841467313246e-06, + "loss": 0.5462, + "step": 12603 + }, + { + "epoch": 3.345944510819063, + "grad_norm": 0.4709461828124922, + "learning_rate": 2.0637403714080376e-06, + "loss": 0.5572, + "step": 12604 + }, + { + "epoch": 3.3462100092924465, + "grad_norm": 0.4508927605673675, + "learning_rate": 2.0633966045928815e-06, + "loss": 0.5316, + "step": 12605 + }, + { + "epoch": 3.34647550776583, + "grad_norm": 0.4518100503192149, + "learning_rate": 2.0630528462925606e-06, + "loss": 0.5002, + "step": 12606 + }, + { + "epoch": 3.3467410062392142, + "grad_norm": 0.4650078283910292, + "learning_rate": 2.062709096513779e-06, + "loss": 0.5247, + "step": 12607 + }, + { + "epoch": 3.347006504712598, + "grad_norm": 0.462860954048532, + "learning_rate": 2.0623653552632415e-06, + "loss": 0.5685, + "step": 12608 + }, + { + "epoch": 3.3472720031859815, + "grad_norm": 0.4581316154958781, + "learning_rate": 2.0620216225476506e-06, + "loss": 0.5382, + "step": 12609 + }, + { + "epoch": 3.3475375016593656, + "grad_norm": 0.44994723903079986, + "learning_rate": 2.0616778983737103e-06, + "loss": 0.538, + "step": 12610 + }, + { + "epoch": 3.3478030001327492, + "grad_norm": 0.47737660026916773, + "learning_rate": 2.061334182748124e-06, + "loss": 0.5579, + "step": 12611 + }, + { + "epoch": 3.348068498606133, + "grad_norm": 0.4623047795624245, + "learning_rate": 2.060990475677595e-06, + "loss": 0.5317, + "step": 12612 + }, + { + "epoch": 3.348333997079517, + "grad_norm": 0.4587593415806032, + "learning_rate": 2.060646777168827e-06, + "loss": 0.4956, + "step": 12613 + }, + { + "epoch": 3.3485994955529006, + "grad_norm": 0.43871413611112003, + "learning_rate": 2.0603030872285226e-06, + "loss": 0.5138, + "step": 12614 + }, + { + "epoch": 3.3488649940262842, + "grad_norm": 0.4621758748062588, + "learning_rate": 2.0599594058633843e-06, + "loss": 0.5273, + "step": 12615 + }, + { + "epoch": 3.3491304924996683, + "grad_norm": 0.43588792995830405, + "learning_rate": 2.059615733080115e-06, + "loss": 0.5635, + "step": 12616 + }, + { + "epoch": 3.349395990973052, + "grad_norm": 0.47872700303162763, + "learning_rate": 2.059272068885417e-06, + "loss": 0.5635, + "step": 12617 + }, + { + "epoch": 3.3496614894464356, + "grad_norm": 0.4609291878356045, + "learning_rate": 2.058928413285993e-06, + "loss": 0.5556, + "step": 12618 + }, + { + "epoch": 3.3499269879198197, + "grad_norm": 0.45478630011007515, + "learning_rate": 2.0585847662885457e-06, + "loss": 0.5721, + "step": 12619 + }, + { + "epoch": 3.3501924863932033, + "grad_norm": 0.47566391685826487, + "learning_rate": 2.058241127899775e-06, + "loss": 0.5442, + "step": 12620 + }, + { + "epoch": 3.350457984866587, + "grad_norm": 0.4753381187272507, + "learning_rate": 2.057897498126384e-06, + "loss": 0.5623, + "step": 12621 + }, + { + "epoch": 3.3507234833399706, + "grad_norm": 0.4488691617329402, + "learning_rate": 2.0575538769750742e-06, + "loss": 0.5465, + "step": 12622 + }, + { + "epoch": 3.3509889818133547, + "grad_norm": 0.45356363308536435, + "learning_rate": 2.0572102644525476e-06, + "loss": 0.5437, + "step": 12623 + }, + { + "epoch": 3.3512544802867383, + "grad_norm": 0.4501010548049581, + "learning_rate": 2.0568666605655045e-06, + "loss": 0.5594, + "step": 12624 + }, + { + "epoch": 3.351519978760122, + "grad_norm": 0.45113569038960444, + "learning_rate": 2.056523065320648e-06, + "loss": 0.5408, + "step": 12625 + }, + { + "epoch": 3.351785477233506, + "grad_norm": 0.47599124266137316, + "learning_rate": 2.0561794787246763e-06, + "loss": 0.5177, + "step": 12626 + }, + { + "epoch": 3.3520509757068897, + "grad_norm": 0.456514770114625, + "learning_rate": 2.0558359007842916e-06, + "loss": 0.5449, + "step": 12627 + }, + { + "epoch": 3.3523164741802733, + "grad_norm": 0.4715250900398764, + "learning_rate": 2.055492331506194e-06, + "loss": 0.5657, + "step": 12628 + }, + { + "epoch": 3.3525819726536574, + "grad_norm": 0.45875132714516287, + "learning_rate": 2.0551487708970842e-06, + "loss": 0.5303, + "step": 12629 + }, + { + "epoch": 3.352847471127041, + "grad_norm": 0.49091743875106414, + "learning_rate": 2.054805218963663e-06, + "loss": 0.5169, + "step": 12630 + }, + { + "epoch": 3.3531129696004247, + "grad_norm": 0.47080179680134926, + "learning_rate": 2.05446167571263e-06, + "loss": 0.5511, + "step": 12631 + }, + { + "epoch": 3.3533784680738083, + "grad_norm": 0.4697817611520423, + "learning_rate": 2.054118141150685e-06, + "loss": 0.5312, + "step": 12632 + }, + { + "epoch": 3.3536439665471924, + "grad_norm": 0.4379885173106567, + "learning_rate": 2.053774615284528e-06, + "loss": 0.5413, + "step": 12633 + }, + { + "epoch": 3.353909465020576, + "grad_norm": 0.45811651613746984, + "learning_rate": 2.053431098120859e-06, + "loss": 0.5277, + "step": 12634 + }, + { + "epoch": 3.3541749634939597, + "grad_norm": 0.4519560773256558, + "learning_rate": 2.053087589666376e-06, + "loss": 0.5568, + "step": 12635 + }, + { + "epoch": 3.354440461967344, + "grad_norm": 0.45257613853947937, + "learning_rate": 2.052744089927781e-06, + "loss": 0.5529, + "step": 12636 + }, + { + "epoch": 3.3547059604407274, + "grad_norm": 0.4505801029398686, + "learning_rate": 2.0524005989117703e-06, + "loss": 0.5526, + "step": 12637 + }, + { + "epoch": 3.354971458914111, + "grad_norm": 0.47661685066395887, + "learning_rate": 2.0520571166250435e-06, + "loss": 0.53, + "step": 12638 + }, + { + "epoch": 3.355236957387495, + "grad_norm": 0.46645999381889575, + "learning_rate": 2.0517136430742994e-06, + "loss": 0.5629, + "step": 12639 + }, + { + "epoch": 3.355502455860879, + "grad_norm": 0.4667808182590336, + "learning_rate": 2.051370178266237e-06, + "loss": 0.5686, + "step": 12640 + }, + { + "epoch": 3.3557679543342624, + "grad_norm": 0.45526729876677424, + "learning_rate": 2.0510267222075545e-06, + "loss": 0.5579, + "step": 12641 + }, + { + "epoch": 3.3560334528076465, + "grad_norm": 0.4555619777967573, + "learning_rate": 2.0506832749049516e-06, + "loss": 0.5639, + "step": 12642 + }, + { + "epoch": 3.35629895128103, + "grad_norm": 0.46278641999002956, + "learning_rate": 2.050339836365124e-06, + "loss": 0.511, + "step": 12643 + }, + { + "epoch": 3.356564449754414, + "grad_norm": 0.45812681937415933, + "learning_rate": 2.04999640659477e-06, + "loss": 0.5345, + "step": 12644 + }, + { + "epoch": 3.356829948227798, + "grad_norm": 0.46301954097363396, + "learning_rate": 2.0496529856005884e-06, + "loss": 0.5303, + "step": 12645 + }, + { + "epoch": 3.3570954467011815, + "grad_norm": 0.4555211957160118, + "learning_rate": 2.049309573389276e-06, + "loss": 0.5484, + "step": 12646 + }, + { + "epoch": 3.357360945174565, + "grad_norm": 0.4602332130468546, + "learning_rate": 2.0489661699675304e-06, + "loss": 0.5513, + "step": 12647 + }, + { + "epoch": 3.3576264436479493, + "grad_norm": 0.46493012312978776, + "learning_rate": 2.0486227753420496e-06, + "loss": 0.5313, + "step": 12648 + }, + { + "epoch": 3.357891942121333, + "grad_norm": 0.47455733903475056, + "learning_rate": 2.048279389519529e-06, + "loss": 0.5778, + "step": 12649 + }, + { + "epoch": 3.3581574405947165, + "grad_norm": 0.4653231034782884, + "learning_rate": 2.0479360125066664e-06, + "loss": 0.553, + "step": 12650 + }, + { + "epoch": 3.3584229390681, + "grad_norm": 0.4707784105566937, + "learning_rate": 2.047592644310158e-06, + "loss": 0.5125, + "step": 12651 + }, + { + "epoch": 3.3586884375414843, + "grad_norm": 0.46782438989601033, + "learning_rate": 2.0472492849367017e-06, + "loss": 0.5552, + "step": 12652 + }, + { + "epoch": 3.358953936014868, + "grad_norm": 0.47938251783339464, + "learning_rate": 2.046905934392992e-06, + "loss": 0.572, + "step": 12653 + }, + { + "epoch": 3.3592194344882516, + "grad_norm": 0.46808079929520907, + "learning_rate": 2.0465625926857272e-06, + "loss": 0.5283, + "step": 12654 + }, + { + "epoch": 3.3594849329616356, + "grad_norm": 0.456763664122745, + "learning_rate": 2.0462192598216013e-06, + "loss": 0.5387, + "step": 12655 + }, + { + "epoch": 3.3597504314350193, + "grad_norm": 0.47898908520285965, + "learning_rate": 2.0458759358073103e-06, + "loss": 0.5386, + "step": 12656 + }, + { + "epoch": 3.360015929908403, + "grad_norm": 0.4572871051451324, + "learning_rate": 2.045532620649551e-06, + "loss": 0.5593, + "step": 12657 + }, + { + "epoch": 3.3602814283817866, + "grad_norm": 0.46092080626684123, + "learning_rate": 2.0451893143550185e-06, + "loss": 0.5111, + "step": 12658 + }, + { + "epoch": 3.3605469268551706, + "grad_norm": 0.45457161365711435, + "learning_rate": 2.0448460169304076e-06, + "loss": 0.543, + "step": 12659 + }, + { + "epoch": 3.3608124253285543, + "grad_norm": 0.4577665914247044, + "learning_rate": 2.044502728382414e-06, + "loss": 0.5103, + "step": 12660 + }, + { + "epoch": 3.361077923801938, + "grad_norm": 0.44672302320097573, + "learning_rate": 2.0441594487177322e-06, + "loss": 0.5203, + "step": 12661 + }, + { + "epoch": 3.361343422275322, + "grad_norm": 0.4652005993774999, + "learning_rate": 2.043816177943057e-06, + "loss": 0.6096, + "step": 12662 + }, + { + "epoch": 3.3616089207487057, + "grad_norm": 0.4709952848217192, + "learning_rate": 2.0434729160650835e-06, + "loss": 0.5161, + "step": 12663 + }, + { + "epoch": 3.3618744192220893, + "grad_norm": 0.4558251077918749, + "learning_rate": 2.043129663090506e-06, + "loss": 0.5787, + "step": 12664 + }, + { + "epoch": 3.3621399176954734, + "grad_norm": 0.4574013331667824, + "learning_rate": 2.0427864190260193e-06, + "loss": 0.5739, + "step": 12665 + }, + { + "epoch": 3.362405416168857, + "grad_norm": 0.4552401248377977, + "learning_rate": 2.042443183878316e-06, + "loss": 0.511, + "step": 12666 + }, + { + "epoch": 3.3626709146422407, + "grad_norm": 0.4342146830621328, + "learning_rate": 2.042099957654091e-06, + "loss": 0.5048, + "step": 12667 + }, + { + "epoch": 3.3629364131156247, + "grad_norm": 0.4675184512527628, + "learning_rate": 2.041756740360038e-06, + "loss": 0.5704, + "step": 12668 + }, + { + "epoch": 3.3632019115890084, + "grad_norm": 0.46599059307563045, + "learning_rate": 2.0414135320028507e-06, + "loss": 0.5539, + "step": 12669 + }, + { + "epoch": 3.363467410062392, + "grad_norm": 0.4545448916893775, + "learning_rate": 2.041070332589222e-06, + "loss": 0.559, + "step": 12670 + }, + { + "epoch": 3.363732908535776, + "grad_norm": 0.46387004846292806, + "learning_rate": 2.040727142125847e-06, + "loss": 0.547, + "step": 12671 + }, + { + "epoch": 3.3639984070091598, + "grad_norm": 0.46405523679722044, + "learning_rate": 2.040383960619416e-06, + "loss": 0.546, + "step": 12672 + }, + { + "epoch": 3.3642639054825434, + "grad_norm": 0.45406283070999814, + "learning_rate": 2.0400407880766227e-06, + "loss": 0.5355, + "step": 12673 + }, + { + "epoch": 3.3645294039559275, + "grad_norm": 0.4553928011584363, + "learning_rate": 2.0396976245041604e-06, + "loss": 0.5368, + "step": 12674 + }, + { + "epoch": 3.364794902429311, + "grad_norm": 0.46421618356922373, + "learning_rate": 2.0393544699087217e-06, + "loss": 0.592, + "step": 12675 + }, + { + "epoch": 3.3650604009026948, + "grad_norm": 0.4699202017390662, + "learning_rate": 2.0390113242969984e-06, + "loss": 0.5088, + "step": 12676 + }, + { + "epoch": 3.3653258993760784, + "grad_norm": 0.4523131168077975, + "learning_rate": 2.0386681876756837e-06, + "loss": 0.5257, + "step": 12677 + }, + { + "epoch": 3.3655913978494625, + "grad_norm": 0.4613211736289155, + "learning_rate": 2.0383250600514686e-06, + "loss": 0.5161, + "step": 12678 + }, + { + "epoch": 3.365856896322846, + "grad_norm": 0.46259182936929705, + "learning_rate": 2.0379819414310447e-06, + "loss": 0.5228, + "step": 12679 + }, + { + "epoch": 3.3661223947962298, + "grad_norm": 0.4657565528463685, + "learning_rate": 2.037638831821104e-06, + "loss": 0.5343, + "step": 12680 + }, + { + "epoch": 3.366387893269614, + "grad_norm": 0.4671720927539018, + "learning_rate": 2.0372957312283387e-06, + "loss": 0.5561, + "step": 12681 + }, + { + "epoch": 3.3666533917429975, + "grad_norm": 0.45716590645948557, + "learning_rate": 2.0369526396594405e-06, + "loss": 0.5658, + "step": 12682 + }, + { + "epoch": 3.366918890216381, + "grad_norm": 0.4521434708036079, + "learning_rate": 2.0366095571210986e-06, + "loss": 0.5668, + "step": 12683 + }, + { + "epoch": 3.367184388689765, + "grad_norm": 0.47364572366767593, + "learning_rate": 2.0362664836200044e-06, + "loss": 0.561, + "step": 12684 + }, + { + "epoch": 3.367449887163149, + "grad_norm": 0.4682103062912439, + "learning_rate": 2.0359234191628495e-06, + "loss": 0.5766, + "step": 12685 + }, + { + "epoch": 3.3677153856365325, + "grad_norm": 0.46041100668933177, + "learning_rate": 2.035580363756324e-06, + "loss": 0.5719, + "step": 12686 + }, + { + "epoch": 3.367980884109916, + "grad_norm": 0.466542172915299, + "learning_rate": 2.035237317407118e-06, + "loss": 0.5345, + "step": 12687 + }, + { + "epoch": 3.3682463825833002, + "grad_norm": 0.4708751311317681, + "learning_rate": 2.034894280121924e-06, + "loss": 0.5268, + "step": 12688 + }, + { + "epoch": 3.368511881056684, + "grad_norm": 0.4533753567975711, + "learning_rate": 2.0345512519074296e-06, + "loss": 0.5605, + "step": 12689 + }, + { + "epoch": 3.3687773795300675, + "grad_norm": 0.4615146989112833, + "learning_rate": 2.034208232770325e-06, + "loss": 0.5503, + "step": 12690 + }, + { + "epoch": 3.3690428780034516, + "grad_norm": 0.45615675051746146, + "learning_rate": 2.0338652227173e-06, + "loss": 0.5485, + "step": 12691 + }, + { + "epoch": 3.3693083764768352, + "grad_norm": 0.45547259338486745, + "learning_rate": 2.033522221755045e-06, + "loss": 0.5412, + "step": 12692 + }, + { + "epoch": 3.369573874950219, + "grad_norm": 0.4652872244193801, + "learning_rate": 2.0331792298902488e-06, + "loss": 0.5727, + "step": 12693 + }, + { + "epoch": 3.369839373423603, + "grad_norm": 0.45665773321693587, + "learning_rate": 2.032836247129601e-06, + "loss": 0.4726, + "step": 12694 + }, + { + "epoch": 3.3701048718969866, + "grad_norm": 0.46254817676681753, + "learning_rate": 2.032493273479789e-06, + "loss": 0.5431, + "step": 12695 + }, + { + "epoch": 3.3703703703703702, + "grad_norm": 0.43649196613551833, + "learning_rate": 2.0321503089475037e-06, + "loss": 0.5207, + "step": 12696 + }, + { + "epoch": 3.3706358688437543, + "grad_norm": 0.4513365164719909, + "learning_rate": 2.0318073535394326e-06, + "loss": 0.5042, + "step": 12697 + }, + { + "epoch": 3.370901367317138, + "grad_norm": 0.4541239707611342, + "learning_rate": 2.031464407262264e-06, + "loss": 0.5562, + "step": 12698 + }, + { + "epoch": 3.3711668657905216, + "grad_norm": 0.46062478475997776, + "learning_rate": 2.0311214701226873e-06, + "loss": 0.5481, + "step": 12699 + }, + { + "epoch": 3.3714323642639057, + "grad_norm": 0.4763213131042151, + "learning_rate": 2.0307785421273908e-06, + "loss": 0.5234, + "step": 12700 + }, + { + "epoch": 3.3716978627372893, + "grad_norm": 0.44533687605477784, + "learning_rate": 2.0304356232830604e-06, + "loss": 0.5436, + "step": 12701 + }, + { + "epoch": 3.371963361210673, + "grad_norm": 0.4540547204679213, + "learning_rate": 2.0300927135963853e-06, + "loss": 0.5645, + "step": 12702 + }, + { + "epoch": 3.372228859684057, + "grad_norm": 0.4632642732790654, + "learning_rate": 2.0297498130740523e-06, + "loss": 0.5833, + "step": 12703 + }, + { + "epoch": 3.3724943581574407, + "grad_norm": 0.4563815304405746, + "learning_rate": 2.0294069217227495e-06, + "loss": 0.544, + "step": 12704 + }, + { + "epoch": 3.3727598566308243, + "grad_norm": 0.4543192415455526, + "learning_rate": 2.0290640395491644e-06, + "loss": 0.5484, + "step": 12705 + }, + { + "epoch": 3.373025355104208, + "grad_norm": 0.4583226164977842, + "learning_rate": 2.0287211665599837e-06, + "loss": 0.565, + "step": 12706 + }, + { + "epoch": 3.373290853577592, + "grad_norm": 0.47179552237435135, + "learning_rate": 2.0283783027618936e-06, + "loss": 0.5475, + "step": 12707 + }, + { + "epoch": 3.3735563520509757, + "grad_norm": 0.462000262766302, + "learning_rate": 2.0280354481615814e-06, + "loss": 0.5676, + "step": 12708 + }, + { + "epoch": 3.3738218505243593, + "grad_norm": 0.45905549356288383, + "learning_rate": 2.0276926027657336e-06, + "loss": 0.549, + "step": 12709 + }, + { + "epoch": 3.3740873489977434, + "grad_norm": 0.4676680465265721, + "learning_rate": 2.0273497665810366e-06, + "loss": 0.5845, + "step": 12710 + }, + { + "epoch": 3.374352847471127, + "grad_norm": 0.445421986051473, + "learning_rate": 2.027006939614177e-06, + "loss": 0.5281, + "step": 12711 + }, + { + "epoch": 3.3746183459445107, + "grad_norm": 0.4502513787802284, + "learning_rate": 2.0266641218718394e-06, + "loss": 0.5023, + "step": 12712 + }, + { + "epoch": 3.3748838444178944, + "grad_norm": 0.468636401900213, + "learning_rate": 2.02632131336071e-06, + "loss": 0.5382, + "step": 12713 + }, + { + "epoch": 3.3751493428912784, + "grad_norm": 0.45251351388224287, + "learning_rate": 2.0259785140874747e-06, + "loss": 0.5582, + "step": 12714 + }, + { + "epoch": 3.375414841364662, + "grad_norm": 0.46789445242809474, + "learning_rate": 2.0256357240588194e-06, + "loss": 0.5398, + "step": 12715 + }, + { + "epoch": 3.3756803398380457, + "grad_norm": 0.450838309436373, + "learning_rate": 2.025292943281429e-06, + "loss": 0.5535, + "step": 12716 + }, + { + "epoch": 3.37594583831143, + "grad_norm": 0.45656990226275346, + "learning_rate": 2.0249501717619894e-06, + "loss": 0.5375, + "step": 12717 + }, + { + "epoch": 3.3762113367848134, + "grad_norm": 0.4682129890930129, + "learning_rate": 2.0246074095071836e-06, + "loss": 0.5473, + "step": 12718 + }, + { + "epoch": 3.376476835258197, + "grad_norm": 0.46941676804578764, + "learning_rate": 2.024264656523697e-06, + "loss": 0.5514, + "step": 12719 + }, + { + "epoch": 3.376742333731581, + "grad_norm": 0.45003823293324147, + "learning_rate": 2.023921912818215e-06, + "loss": 0.5221, + "step": 12720 + }, + { + "epoch": 3.377007832204965, + "grad_norm": 0.46461030662741104, + "learning_rate": 2.0235791783974207e-06, + "loss": 0.5739, + "step": 12721 + }, + { + "epoch": 3.3772733306783485, + "grad_norm": 0.45386025062765284, + "learning_rate": 2.0232364532679995e-06, + "loss": 0.5746, + "step": 12722 + }, + { + "epoch": 3.3775388291517325, + "grad_norm": 0.4789801949002753, + "learning_rate": 2.0228937374366347e-06, + "loss": 0.5576, + "step": 12723 + }, + { + "epoch": 3.377804327625116, + "grad_norm": 0.4702227259538851, + "learning_rate": 2.0225510309100098e-06, + "loss": 0.545, + "step": 12724 + }, + { + "epoch": 3.3780698260985, + "grad_norm": 0.447527878863501, + "learning_rate": 2.0222083336948088e-06, + "loss": 0.5546, + "step": 12725 + }, + { + "epoch": 3.378335324571884, + "grad_norm": 0.4563851337757835, + "learning_rate": 2.0218656457977156e-06, + "loss": 0.5307, + "step": 12726 + }, + { + "epoch": 3.3786008230452675, + "grad_norm": 0.4524985408492862, + "learning_rate": 2.0215229672254124e-06, + "loss": 0.5521, + "step": 12727 + }, + { + "epoch": 3.378866321518651, + "grad_norm": 0.4686207721898212, + "learning_rate": 2.0211802979845833e-06, + "loss": 0.5308, + "step": 12728 + }, + { + "epoch": 3.3791318199920353, + "grad_norm": 0.47140076214538823, + "learning_rate": 2.0208376380819116e-06, + "loss": 0.5636, + "step": 12729 + }, + { + "epoch": 3.379397318465419, + "grad_norm": 0.5092787635550697, + "learning_rate": 2.020494987524078e-06, + "loss": 0.5522, + "step": 12730 + }, + { + "epoch": 3.3796628169388025, + "grad_norm": 0.4540468814921841, + "learning_rate": 2.020152346317766e-06, + "loss": 0.5034, + "step": 12731 + }, + { + "epoch": 3.379928315412186, + "grad_norm": 0.451221713730231, + "learning_rate": 2.0198097144696577e-06, + "loss": 0.5932, + "step": 12732 + }, + { + "epoch": 3.3801938138855703, + "grad_norm": 0.4770972343878027, + "learning_rate": 2.0194670919864364e-06, + "loss": 0.5505, + "step": 12733 + }, + { + "epoch": 3.380459312358954, + "grad_norm": 0.45090680707980824, + "learning_rate": 2.0191244788747842e-06, + "loss": 0.5514, + "step": 12734 + }, + { + "epoch": 3.3807248108323376, + "grad_norm": 0.4529283941542656, + "learning_rate": 2.0187818751413813e-06, + "loss": 0.5554, + "step": 12735 + }, + { + "epoch": 3.3809903093057216, + "grad_norm": 0.493989687412165, + "learning_rate": 2.01843928079291e-06, + "loss": 0.5319, + "step": 12736 + }, + { + "epoch": 3.3812558077791053, + "grad_norm": 0.4522283620390422, + "learning_rate": 2.0180966958360514e-06, + "loss": 0.512, + "step": 12737 + }, + { + "epoch": 3.381521306252489, + "grad_norm": 0.4479473287332283, + "learning_rate": 2.0177541202774876e-06, + "loss": 0.5212, + "step": 12738 + }, + { + "epoch": 3.381786804725873, + "grad_norm": 0.45242665165070406, + "learning_rate": 2.017411554123899e-06, + "loss": 0.5232, + "step": 12739 + }, + { + "epoch": 3.3820523031992566, + "grad_norm": 0.45451541064152395, + "learning_rate": 2.017068997381967e-06, + "loss": 0.5362, + "step": 12740 + }, + { + "epoch": 3.3823178016726403, + "grad_norm": 0.4460571725812792, + "learning_rate": 2.016726450058372e-06, + "loss": 0.5329, + "step": 12741 + }, + { + "epoch": 3.382583300146024, + "grad_norm": 0.46601782009726894, + "learning_rate": 2.016383912159794e-06, + "loss": 0.5566, + "step": 12742 + }, + { + "epoch": 3.382848798619408, + "grad_norm": 0.4674835143262182, + "learning_rate": 2.016041383692914e-06, + "loss": 0.5326, + "step": 12743 + }, + { + "epoch": 3.3831142970927917, + "grad_norm": 0.4883092000371992, + "learning_rate": 2.015698864664412e-06, + "loss": 0.5201, + "step": 12744 + }, + { + "epoch": 3.3833797955661753, + "grad_norm": 0.45165210404600376, + "learning_rate": 2.015356355080968e-06, + "loss": 0.5413, + "step": 12745 + }, + { + "epoch": 3.3836452940395594, + "grad_norm": 0.4646914973137166, + "learning_rate": 2.0150138549492625e-06, + "loss": 0.5855, + "step": 12746 + }, + { + "epoch": 3.383910792512943, + "grad_norm": 0.4716558753972731, + "learning_rate": 2.014671364275973e-06, + "loss": 0.5386, + "step": 12747 + }, + { + "epoch": 3.3841762909863267, + "grad_norm": 0.46898554050258334, + "learning_rate": 2.01432888306778e-06, + "loss": 0.5506, + "step": 12748 + }, + { + "epoch": 3.3844417894597107, + "grad_norm": 0.4638898678583906, + "learning_rate": 2.013986411331364e-06, + "loss": 0.5352, + "step": 12749 + }, + { + "epoch": 3.3847072879330944, + "grad_norm": 0.4696325181739473, + "learning_rate": 2.0136439490734023e-06, + "loss": 0.5275, + "step": 12750 + }, + { + "epoch": 3.384972786406478, + "grad_norm": 0.4554793897498558, + "learning_rate": 2.013301496300574e-06, + "loss": 0.5815, + "step": 12751 + }, + { + "epoch": 3.385238284879862, + "grad_norm": 0.470266714263882, + "learning_rate": 2.0129590530195593e-06, + "loss": 0.5662, + "step": 12752 + }, + { + "epoch": 3.3855037833532458, + "grad_norm": 0.45596083357900896, + "learning_rate": 2.012616619237035e-06, + "loss": 0.5521, + "step": 12753 + }, + { + "epoch": 3.3857692818266294, + "grad_norm": 0.47509611512895283, + "learning_rate": 2.01227419495968e-06, + "loss": 0.5094, + "step": 12754 + }, + { + "epoch": 3.3860347803000135, + "grad_norm": 0.45012087086726, + "learning_rate": 2.011931780194172e-06, + "loss": 0.5612, + "step": 12755 + }, + { + "epoch": 3.386300278773397, + "grad_norm": 0.4643096228219624, + "learning_rate": 2.0115893749471892e-06, + "loss": 0.5589, + "step": 12756 + }, + { + "epoch": 3.3865657772467808, + "grad_norm": 0.4614065325401749, + "learning_rate": 2.01124697922541e-06, + "loss": 0.5475, + "step": 12757 + }, + { + "epoch": 3.386831275720165, + "grad_norm": 0.4499457641025427, + "learning_rate": 2.010904593035511e-06, + "loss": 0.5424, + "step": 12758 + }, + { + "epoch": 3.3870967741935485, + "grad_norm": 0.4625461139829754, + "learning_rate": 2.01056221638417e-06, + "loss": 0.5426, + "step": 12759 + }, + { + "epoch": 3.387362272666932, + "grad_norm": 0.4594882605342509, + "learning_rate": 2.010219849278064e-06, + "loss": 0.566, + "step": 12760 + }, + { + "epoch": 3.3876277711403158, + "grad_norm": 0.4737230055503393, + "learning_rate": 2.0098774917238708e-06, + "loss": 0.5632, + "step": 12761 + }, + { + "epoch": 3.3878932696137, + "grad_norm": 0.46136477287257904, + "learning_rate": 2.0095351437282655e-06, + "loss": 0.5266, + "step": 12762 + }, + { + "epoch": 3.3881587680870835, + "grad_norm": 0.47064068519465296, + "learning_rate": 2.009192805297927e-06, + "loss": 0.5458, + "step": 12763 + }, + { + "epoch": 3.388424266560467, + "grad_norm": 0.45879778255256043, + "learning_rate": 2.00885047643953e-06, + "loss": 0.5607, + "step": 12764 + }, + { + "epoch": 3.388689765033851, + "grad_norm": 0.4744306629926466, + "learning_rate": 2.008508157159751e-06, + "loss": 0.5084, + "step": 12765 + }, + { + "epoch": 3.388955263507235, + "grad_norm": 0.4688473828309847, + "learning_rate": 2.0081658474652663e-06, + "loss": 0.5483, + "step": 12766 + }, + { + "epoch": 3.3892207619806185, + "grad_norm": 0.44937018408371066, + "learning_rate": 2.007823547362752e-06, + "loss": 0.5586, + "step": 12767 + }, + { + "epoch": 3.389486260454002, + "grad_norm": 0.47422754117108673, + "learning_rate": 2.007481256858883e-06, + "loss": 0.5158, + "step": 12768 + }, + { + "epoch": 3.3897517589273862, + "grad_norm": 0.45521373201687426, + "learning_rate": 2.0071389759603366e-06, + "loss": 0.5453, + "step": 12769 + }, + { + "epoch": 3.39001725740077, + "grad_norm": 0.46347686684427203, + "learning_rate": 2.006796704673786e-06, + "loss": 0.5724, + "step": 12770 + }, + { + "epoch": 3.3902827558741535, + "grad_norm": 0.4615950752991866, + "learning_rate": 2.006454443005907e-06, + "loss": 0.5555, + "step": 12771 + }, + { + "epoch": 3.3905482543475376, + "grad_norm": 0.46261355357107664, + "learning_rate": 2.006112190963375e-06, + "loss": 0.5579, + "step": 12772 + }, + { + "epoch": 3.3908137528209212, + "grad_norm": 0.45761945006015803, + "learning_rate": 2.005769948552865e-06, + "loss": 0.5504, + "step": 12773 + }, + { + "epoch": 3.391079251294305, + "grad_norm": 0.45213388161587553, + "learning_rate": 2.0054277157810505e-06, + "loss": 0.5432, + "step": 12774 + }, + { + "epoch": 3.391344749767689, + "grad_norm": 0.4751857821962484, + "learning_rate": 2.0050854926546078e-06, + "loss": 0.5867, + "step": 12775 + }, + { + "epoch": 3.3916102482410726, + "grad_norm": 0.4623675886448786, + "learning_rate": 2.0047432791802084e-06, + "loss": 0.5486, + "step": 12776 + }, + { + "epoch": 3.3918757467144562, + "grad_norm": 0.45858716547291906, + "learning_rate": 2.004401075364527e-06, + "loss": 0.5366, + "step": 12777 + }, + { + "epoch": 3.3921412451878403, + "grad_norm": 0.4756204065848985, + "learning_rate": 2.004058881214239e-06, + "loss": 0.5279, + "step": 12778 + }, + { + "epoch": 3.392406743661224, + "grad_norm": 0.47153198509465694, + "learning_rate": 2.0037166967360174e-06, + "loss": 0.5312, + "step": 12779 + }, + { + "epoch": 3.3926722421346076, + "grad_norm": 0.4589978915018234, + "learning_rate": 2.003374521936536e-06, + "loss": 0.5728, + "step": 12780 + }, + { + "epoch": 3.3929377406079917, + "grad_norm": 0.4814911934067407, + "learning_rate": 2.0030323568224662e-06, + "loss": 0.5221, + "step": 12781 + }, + { + "epoch": 3.3932032390813753, + "grad_norm": 0.44575684633522467, + "learning_rate": 2.002690201400482e-06, + "loss": 0.4994, + "step": 12782 + }, + { + "epoch": 3.393468737554759, + "grad_norm": 0.4414049810759403, + "learning_rate": 2.002348055677257e-06, + "loss": 0.5841, + "step": 12783 + }, + { + "epoch": 3.393734236028143, + "grad_norm": 0.46338737339316516, + "learning_rate": 2.002005919659463e-06, + "loss": 0.5258, + "step": 12784 + }, + { + "epoch": 3.3939997345015267, + "grad_norm": 0.44667358661694545, + "learning_rate": 2.001663793353773e-06, + "loss": 0.5495, + "step": 12785 + }, + { + "epoch": 3.3942652329749103, + "grad_norm": 0.4560120278022204, + "learning_rate": 2.0013216767668597e-06, + "loss": 0.5649, + "step": 12786 + }, + { + "epoch": 3.394530731448294, + "grad_norm": 0.4622811354421234, + "learning_rate": 2.0009795699053942e-06, + "loss": 0.5595, + "step": 12787 + }, + { + "epoch": 3.394796229921678, + "grad_norm": 0.4496964910300942, + "learning_rate": 2.000637472776049e-06, + "loss": 0.5578, + "step": 12788 + }, + { + "epoch": 3.3950617283950617, + "grad_norm": 0.4568951263216965, + "learning_rate": 2.0002953853854955e-06, + "loss": 0.5776, + "step": 12789 + }, + { + "epoch": 3.3953272268684453, + "grad_norm": 0.45282303515490324, + "learning_rate": 1.9999533077404055e-06, + "loss": 0.5275, + "step": 12790 + }, + { + "epoch": 3.3955927253418294, + "grad_norm": 0.44016764025507116, + "learning_rate": 1.9996112398474506e-06, + "loss": 0.5463, + "step": 12791 + }, + { + "epoch": 3.395858223815213, + "grad_norm": 0.46519713526303824, + "learning_rate": 1.9992691817133025e-06, + "loss": 0.5531, + "step": 12792 + }, + { + "epoch": 3.3961237222885967, + "grad_norm": 0.45163241962601036, + "learning_rate": 1.9989271333446303e-06, + "loss": 0.5233, + "step": 12793 + }, + { + "epoch": 3.396389220761981, + "grad_norm": 0.46807815723645896, + "learning_rate": 1.998585094748106e-06, + "loss": 0.5694, + "step": 12794 + }, + { + "epoch": 3.3966547192353644, + "grad_norm": 0.4521198123980459, + "learning_rate": 1.9982430659304e-06, + "loss": 0.5744, + "step": 12795 + }, + { + "epoch": 3.396920217708748, + "grad_norm": 0.48158550107251835, + "learning_rate": 1.997901046898183e-06, + "loss": 0.5159, + "step": 12796 + }, + { + "epoch": 3.3971857161821317, + "grad_norm": 0.46534561546315134, + "learning_rate": 1.997559037658124e-06, + "loss": 0.5612, + "step": 12797 + }, + { + "epoch": 3.397451214655516, + "grad_norm": 0.46056984890636204, + "learning_rate": 1.9972170382168954e-06, + "loss": 0.573, + "step": 12798 + }, + { + "epoch": 3.3977167131288994, + "grad_norm": 0.47148293572806876, + "learning_rate": 1.9968750485811646e-06, + "loss": 0.5287, + "step": 12799 + }, + { + "epoch": 3.397982211602283, + "grad_norm": 0.48154856205884095, + "learning_rate": 1.9965330687576023e-06, + "loss": 0.5334, + "step": 12800 + }, + { + "epoch": 3.398247710075667, + "grad_norm": 0.47307298459509983, + "learning_rate": 1.9961910987528778e-06, + "loss": 0.5813, + "step": 12801 + }, + { + "epoch": 3.398513208549051, + "grad_norm": 0.43470676471261355, + "learning_rate": 1.9958491385736604e-06, + "loss": 0.5156, + "step": 12802 + }, + { + "epoch": 3.3987787070224345, + "grad_norm": 0.4686682255237146, + "learning_rate": 1.99550718822662e-06, + "loss": 0.5589, + "step": 12803 + }, + { + "epoch": 3.3990442054958185, + "grad_norm": 0.45497913772036674, + "learning_rate": 1.995165247718424e-06, + "loss": 0.5435, + "step": 12804 + }, + { + "epoch": 3.399309703969202, + "grad_norm": 0.4732442319763158, + "learning_rate": 1.9948233170557417e-06, + "loss": 0.5388, + "step": 12805 + }, + { + "epoch": 3.399575202442586, + "grad_norm": 0.45095023470538853, + "learning_rate": 1.9944813962452414e-06, + "loss": 0.5729, + "step": 12806 + }, + { + "epoch": 3.39984070091597, + "grad_norm": 0.47412774582553213, + "learning_rate": 1.9941394852935914e-06, + "loss": 0.5553, + "step": 12807 + }, + { + "epoch": 3.4001061993893535, + "grad_norm": 0.47183938419715915, + "learning_rate": 1.9937975842074604e-06, + "loss": 0.5535, + "step": 12808 + }, + { + "epoch": 3.400371697862737, + "grad_norm": 0.4680973187737429, + "learning_rate": 1.9934556929935174e-06, + "loss": 0.5124, + "step": 12809 + }, + { + "epoch": 3.4006371963361213, + "grad_norm": 0.4621455439858813, + "learning_rate": 1.993113811658427e-06, + "loss": 0.553, + "step": 12810 + }, + { + "epoch": 3.400902694809505, + "grad_norm": 0.4599085011440135, + "learning_rate": 1.9927719402088585e-06, + "loss": 0.5355, + "step": 12811 + }, + { + "epoch": 3.4011681932828886, + "grad_norm": 0.49478933307566275, + "learning_rate": 1.9924300786514787e-06, + "loss": 0.5294, + "step": 12812 + }, + { + "epoch": 3.4014336917562726, + "grad_norm": 0.4607392388405931, + "learning_rate": 1.9920882269929554e-06, + "loss": 0.5538, + "step": 12813 + }, + { + "epoch": 3.4016991902296563, + "grad_norm": 0.451448263256135, + "learning_rate": 1.991746385239955e-06, + "loss": 0.5277, + "step": 12814 + }, + { + "epoch": 3.40196468870304, + "grad_norm": 0.44729227497842544, + "learning_rate": 1.9914045533991456e-06, + "loss": 0.548, + "step": 12815 + }, + { + "epoch": 3.4022301871764236, + "grad_norm": 0.4564741795468326, + "learning_rate": 1.991062731477192e-06, + "loss": 0.5302, + "step": 12816 + }, + { + "epoch": 3.4024956856498076, + "grad_norm": 0.4610699670531906, + "learning_rate": 1.9907209194807607e-06, + "loss": 0.5071, + "step": 12817 + }, + { + "epoch": 3.4027611841231913, + "grad_norm": 0.4690518790762982, + "learning_rate": 1.9903791174165187e-06, + "loss": 0.5551, + "step": 12818 + }, + { + "epoch": 3.403026682596575, + "grad_norm": 0.4593606997471958, + "learning_rate": 1.9900373252911318e-06, + "loss": 0.5211, + "step": 12819 + }, + { + "epoch": 3.403292181069959, + "grad_norm": 0.4532941863254376, + "learning_rate": 1.9896955431112656e-06, + "loss": 0.5791, + "step": 12820 + }, + { + "epoch": 3.4035576795433427, + "grad_norm": 0.4720703055954754, + "learning_rate": 1.989353770883587e-06, + "loss": 0.5906, + "step": 12821 + }, + { + "epoch": 3.4038231780167263, + "grad_norm": 0.4624228327745245, + "learning_rate": 1.9890120086147586e-06, + "loss": 0.5592, + "step": 12822 + }, + { + "epoch": 3.40408867649011, + "grad_norm": 0.4717813246024548, + "learning_rate": 1.988670256311447e-06, + "loss": 0.5621, + "step": 12823 + }, + { + "epoch": 3.404354174963494, + "grad_norm": 0.4823129694044606, + "learning_rate": 1.9883285139803175e-06, + "loss": 0.5399, + "step": 12824 + }, + { + "epoch": 3.4046196734368777, + "grad_norm": 0.4545405613069321, + "learning_rate": 1.987986781628035e-06, + "loss": 0.5336, + "step": 12825 + }, + { + "epoch": 3.4048851719102613, + "grad_norm": 0.46806772701864774, + "learning_rate": 1.9876450592612643e-06, + "loss": 0.5493, + "step": 12826 + }, + { + "epoch": 3.4051506703836454, + "grad_norm": 0.46297188021680963, + "learning_rate": 1.98730334688667e-06, + "loss": 0.547, + "step": 12827 + }, + { + "epoch": 3.405416168857029, + "grad_norm": 0.46214696435760505, + "learning_rate": 1.9869616445109146e-06, + "loss": 0.5257, + "step": 12828 + }, + { + "epoch": 3.4056816673304127, + "grad_norm": 0.45961305683228143, + "learning_rate": 1.986619952140664e-06, + "loss": 0.5374, + "step": 12829 + }, + { + "epoch": 3.4059471658037968, + "grad_norm": 0.4539132220365548, + "learning_rate": 1.9862782697825807e-06, + "loss": 0.5046, + "step": 12830 + }, + { + "epoch": 3.4062126642771804, + "grad_norm": 0.45665071879455377, + "learning_rate": 1.9859365974433286e-06, + "loss": 0.5807, + "step": 12831 + }, + { + "epoch": 3.406478162750564, + "grad_norm": 0.47370034791747617, + "learning_rate": 1.9855949351295724e-06, + "loss": 0.522, + "step": 12832 + }, + { + "epoch": 3.406743661223948, + "grad_norm": 0.4650789210759116, + "learning_rate": 1.985253282847974e-06, + "loss": 0.575, + "step": 12833 + }, + { + "epoch": 3.4070091596973318, + "grad_norm": 0.45900766862452075, + "learning_rate": 1.9849116406051967e-06, + "loss": 0.5447, + "step": 12834 + }, + { + "epoch": 3.4072746581707154, + "grad_norm": 0.4765245161450273, + "learning_rate": 1.984570008407904e-06, + "loss": 0.5527, + "step": 12835 + }, + { + "epoch": 3.4075401566440995, + "grad_norm": 0.4503584108780535, + "learning_rate": 1.9842283862627575e-06, + "loss": 0.5766, + "step": 12836 + }, + { + "epoch": 3.407805655117483, + "grad_norm": 0.4500123549928934, + "learning_rate": 1.9838867741764207e-06, + "loss": 0.5438, + "step": 12837 + }, + { + "epoch": 3.4080711535908668, + "grad_norm": 0.44972832557463593, + "learning_rate": 1.9835451721555564e-06, + "loss": 0.5684, + "step": 12838 + }, + { + "epoch": 3.408336652064251, + "grad_norm": 0.4705877893456991, + "learning_rate": 1.983203580206825e-06, + "loss": 0.5453, + "step": 12839 + }, + { + "epoch": 3.4086021505376345, + "grad_norm": 0.46942107608376754, + "learning_rate": 1.9828619983368884e-06, + "loss": 0.5519, + "step": 12840 + }, + { + "epoch": 3.408867649011018, + "grad_norm": 0.4598823261174286, + "learning_rate": 1.9825204265524093e-06, + "loss": 0.5294, + "step": 12841 + }, + { + "epoch": 3.4091331474844018, + "grad_norm": 0.4518402010744265, + "learning_rate": 1.982178864860049e-06, + "loss": 0.5091, + "step": 12842 + }, + { + "epoch": 3.409398645957786, + "grad_norm": 0.4418022717976794, + "learning_rate": 1.9818373132664693e-06, + "loss": 0.5701, + "step": 12843 + }, + { + "epoch": 3.4096641444311695, + "grad_norm": 0.45969587376638854, + "learning_rate": 1.9814957717783305e-06, + "loss": 0.5976, + "step": 12844 + }, + { + "epoch": 3.409929642904553, + "grad_norm": 0.4529674799169325, + "learning_rate": 1.981154240402293e-06, + "loss": 0.5543, + "step": 12845 + }, + { + "epoch": 3.4101951413779372, + "grad_norm": 0.4615682052425449, + "learning_rate": 1.9808127191450186e-06, + "loss": 0.5519, + "step": 12846 + }, + { + "epoch": 3.410460639851321, + "grad_norm": 0.47708133755211074, + "learning_rate": 1.9804712080131672e-06, + "loss": 0.566, + "step": 12847 + }, + { + "epoch": 3.4107261383247045, + "grad_norm": 0.46859110158502626, + "learning_rate": 1.9801297070133994e-06, + "loss": 0.5742, + "step": 12848 + }, + { + "epoch": 3.4109916367980886, + "grad_norm": 0.4632276221339485, + "learning_rate": 1.979788216152375e-06, + "loss": 0.5521, + "step": 12849 + }, + { + "epoch": 3.4112571352714722, + "grad_norm": 0.4580371246156873, + "learning_rate": 1.9794467354367546e-06, + "loss": 0.5754, + "step": 12850 + }, + { + "epoch": 3.411522633744856, + "grad_norm": 0.4762097544937925, + "learning_rate": 1.979105264873197e-06, + "loss": 0.5409, + "step": 12851 + }, + { + "epoch": 3.4117881322182395, + "grad_norm": 0.4572323394054003, + "learning_rate": 1.9787638044683618e-06, + "loss": 0.5906, + "step": 12852 + }, + { + "epoch": 3.4120536306916236, + "grad_norm": 0.4591241225194067, + "learning_rate": 1.978422354228909e-06, + "loss": 0.5308, + "step": 12853 + }, + { + "epoch": 3.4123191291650072, + "grad_norm": 0.4731521598331077, + "learning_rate": 1.978080914161497e-06, + "loss": 0.5366, + "step": 12854 + }, + { + "epoch": 3.412584627638391, + "grad_norm": 0.47882979229162537, + "learning_rate": 1.9777394842727863e-06, + "loss": 0.4872, + "step": 12855 + }, + { + "epoch": 3.412850126111775, + "grad_norm": 0.46590549077603866, + "learning_rate": 1.9773980645694332e-06, + "loss": 0.5266, + "step": 12856 + }, + { + "epoch": 3.4131156245851586, + "grad_norm": 0.4614104090952774, + "learning_rate": 1.9770566550580977e-06, + "loss": 0.5015, + "step": 12857 + }, + { + "epoch": 3.4133811230585422, + "grad_norm": 0.45889354545369804, + "learning_rate": 1.9767152557454373e-06, + "loss": 0.5583, + "step": 12858 + }, + { + "epoch": 3.4136466215319263, + "grad_norm": 0.46555821475775977, + "learning_rate": 1.976373866638111e-06, + "loss": 0.5592, + "step": 12859 + }, + { + "epoch": 3.41391212000531, + "grad_norm": 0.45068964719599014, + "learning_rate": 1.9760324877427763e-06, + "loss": 0.539, + "step": 12860 + }, + { + "epoch": 3.4141776184786936, + "grad_norm": 0.45570304991383503, + "learning_rate": 1.9756911190660915e-06, + "loss": 0.5334, + "step": 12861 + }, + { + "epoch": 3.4144431169520777, + "grad_norm": 0.4501857328043723, + "learning_rate": 1.975349760614713e-06, + "loss": 0.5464, + "step": 12862 + }, + { + "epoch": 3.4147086154254613, + "grad_norm": 0.46148786629963523, + "learning_rate": 1.975008412395299e-06, + "loss": 0.5751, + "step": 12863 + }, + { + "epoch": 3.414974113898845, + "grad_norm": 0.4585975356195316, + "learning_rate": 1.974667074414506e-06, + "loss": 0.5191, + "step": 12864 + }, + { + "epoch": 3.415239612372229, + "grad_norm": 0.4510225928457239, + "learning_rate": 1.9743257466789917e-06, + "loss": 0.525, + "step": 12865 + }, + { + "epoch": 3.4155051108456127, + "grad_norm": 0.4654253341380489, + "learning_rate": 1.973984429195412e-06, + "loss": 0.5664, + "step": 12866 + }, + { + "epoch": 3.4157706093189963, + "grad_norm": 0.4694476702514629, + "learning_rate": 1.973643121970425e-06, + "loss": 0.5712, + "step": 12867 + }, + { + "epoch": 3.4160361077923804, + "grad_norm": 0.4722267490423635, + "learning_rate": 1.973301825010685e-06, + "loss": 0.5667, + "step": 12868 + }, + { + "epoch": 3.416301606265764, + "grad_norm": 0.47336771971303976, + "learning_rate": 1.9729605383228485e-06, + "loss": 0.5635, + "step": 12869 + }, + { + "epoch": 3.4165671047391477, + "grad_norm": 0.4704264060272286, + "learning_rate": 1.972619261913572e-06, + "loss": 0.5388, + "step": 12870 + }, + { + "epoch": 3.4168326032125313, + "grad_norm": 0.46361806422929314, + "learning_rate": 1.9722779957895114e-06, + "loss": 0.5522, + "step": 12871 + }, + { + "epoch": 3.4170981016859154, + "grad_norm": 0.4410717204039472, + "learning_rate": 1.971936739957322e-06, + "loss": 0.5249, + "step": 12872 + }, + { + "epoch": 3.417363600159299, + "grad_norm": 0.45280642979272157, + "learning_rate": 1.9715954944236603e-06, + "loss": 0.555, + "step": 12873 + }, + { + "epoch": 3.4176290986326827, + "grad_norm": 0.4587876387956169, + "learning_rate": 1.9712542591951787e-06, + "loss": 0.548, + "step": 12874 + }, + { + "epoch": 3.417894597106067, + "grad_norm": 0.44137031716794967, + "learning_rate": 1.970913034278534e-06, + "loss": 0.5423, + "step": 12875 + }, + { + "epoch": 3.4181600955794504, + "grad_norm": 0.45230503039464187, + "learning_rate": 1.9705718196803807e-06, + "loss": 0.5187, + "step": 12876 + }, + { + "epoch": 3.418425594052834, + "grad_norm": 0.4706104204206226, + "learning_rate": 1.970230615407373e-06, + "loss": 0.5585, + "step": 12877 + }, + { + "epoch": 3.418691092526218, + "grad_norm": 0.44932892089176757, + "learning_rate": 1.969889421466166e-06, + "loss": 0.5303, + "step": 12878 + }, + { + "epoch": 3.418956590999602, + "grad_norm": 0.4603762722031276, + "learning_rate": 1.9695482378634124e-06, + "loss": 0.5377, + "step": 12879 + }, + { + "epoch": 3.4192220894729854, + "grad_norm": 0.47030706162554686, + "learning_rate": 1.969207064605767e-06, + "loss": 0.5482, + "step": 12880 + }, + { + "epoch": 3.419487587946369, + "grad_norm": 0.46159296769742875, + "learning_rate": 1.9688659016998836e-06, + "loss": 0.5831, + "step": 12881 + }, + { + "epoch": 3.419753086419753, + "grad_norm": 0.4910339424156637, + "learning_rate": 1.9685247491524157e-06, + "loss": 0.5573, + "step": 12882 + }, + { + "epoch": 3.420018584893137, + "grad_norm": 0.44733412723024585, + "learning_rate": 1.9681836069700163e-06, + "loss": 0.5749, + "step": 12883 + }, + { + "epoch": 3.4202840833665205, + "grad_norm": 0.4471894912424108, + "learning_rate": 1.9678424751593397e-06, + "loss": 0.5334, + "step": 12884 + }, + { + "epoch": 3.4205495818399045, + "grad_norm": 0.4644009215745443, + "learning_rate": 1.9675013537270367e-06, + "loss": 0.5209, + "step": 12885 + }, + { + "epoch": 3.420815080313288, + "grad_norm": 0.45148650407953794, + "learning_rate": 1.9671602426797613e-06, + "loss": 0.5392, + "step": 12886 + }, + { + "epoch": 3.421080578786672, + "grad_norm": 0.4612184389492654, + "learning_rate": 1.9668191420241656e-06, + "loss": 0.5413, + "step": 12887 + }, + { + "epoch": 3.421346077260056, + "grad_norm": 0.45381529045092095, + "learning_rate": 1.9664780517669018e-06, + "loss": 0.5318, + "step": 12888 + }, + { + "epoch": 3.4216115757334395, + "grad_norm": 0.46233706778645767, + "learning_rate": 1.9661369719146224e-06, + "loss": 0.6002, + "step": 12889 + }, + { + "epoch": 3.421877074206823, + "grad_norm": 0.44544983775776853, + "learning_rate": 1.96579590247398e-06, + "loss": 0.5177, + "step": 12890 + }, + { + "epoch": 3.4221425726802073, + "grad_norm": 0.4582796123374127, + "learning_rate": 1.965454843451625e-06, + "loss": 0.5341, + "step": 12891 + }, + { + "epoch": 3.422408071153591, + "grad_norm": 0.4605727241122683, + "learning_rate": 1.9651137948542093e-06, + "loss": 0.5346, + "step": 12892 + }, + { + "epoch": 3.4226735696269746, + "grad_norm": 0.4624770345395566, + "learning_rate": 1.9647727566883838e-06, + "loss": 0.5203, + "step": 12893 + }, + { + "epoch": 3.4229390681003586, + "grad_norm": 0.45980284378791414, + "learning_rate": 1.9644317289608006e-06, + "loss": 0.5474, + "step": 12894 + }, + { + "epoch": 3.4232045665737423, + "grad_norm": 0.46819796216641063, + "learning_rate": 1.96409071167811e-06, + "loss": 0.5601, + "step": 12895 + }, + { + "epoch": 3.423470065047126, + "grad_norm": 0.4523077971838003, + "learning_rate": 1.9637497048469625e-06, + "loss": 0.5344, + "step": 12896 + }, + { + "epoch": 3.42373556352051, + "grad_norm": 0.45830761716803503, + "learning_rate": 1.9634087084740085e-06, + "loss": 0.545, + "step": 12897 + }, + { + "epoch": 3.4240010619938936, + "grad_norm": 0.45830484494469326, + "learning_rate": 1.9630677225658984e-06, + "loss": 0.5344, + "step": 12898 + }, + { + "epoch": 3.4242665604672773, + "grad_norm": 0.46134814845585176, + "learning_rate": 1.9627267471292826e-06, + "loss": 0.5385, + "step": 12899 + }, + { + "epoch": 3.424532058940661, + "grad_norm": 0.45729474976246104, + "learning_rate": 1.9623857821708105e-06, + "loss": 0.5267, + "step": 12900 + }, + { + "epoch": 3.424797557414045, + "grad_norm": 0.46117831968806905, + "learning_rate": 1.962044827697133e-06, + "loss": 0.5662, + "step": 12901 + }, + { + "epoch": 3.4250630558874287, + "grad_norm": 0.4641718812991612, + "learning_rate": 1.961703883714898e-06, + "loss": 0.5484, + "step": 12902 + }, + { + "epoch": 3.4253285543608123, + "grad_norm": 0.45924887664451797, + "learning_rate": 1.9613629502307547e-06, + "loss": 0.5623, + "step": 12903 + }, + { + "epoch": 3.4255940528341964, + "grad_norm": 0.47499072665285175, + "learning_rate": 1.9610220272513527e-06, + "loss": 0.5635, + "step": 12904 + }, + { + "epoch": 3.42585955130758, + "grad_norm": 0.4596436615054621, + "learning_rate": 1.9606811147833408e-06, + "loss": 0.5324, + "step": 12905 + }, + { + "epoch": 3.4261250497809637, + "grad_norm": 0.4579141422195126, + "learning_rate": 1.9603402128333676e-06, + "loss": 0.5477, + "step": 12906 + }, + { + "epoch": 3.4263905482543473, + "grad_norm": 0.4663383945138551, + "learning_rate": 1.959999321408082e-06, + "loss": 0.5543, + "step": 12907 + }, + { + "epoch": 3.4266560467277314, + "grad_norm": 0.46575955643248895, + "learning_rate": 1.9596584405141317e-06, + "loss": 0.5628, + "step": 12908 + }, + { + "epoch": 3.426921545201115, + "grad_norm": 0.46297050723337135, + "learning_rate": 1.959317570158165e-06, + "loss": 0.5413, + "step": 12909 + }, + { + "epoch": 3.4271870436744987, + "grad_norm": 0.46136492395227163, + "learning_rate": 1.958976710346829e-06, + "loss": 0.5624, + "step": 12910 + }, + { + "epoch": 3.4274525421478828, + "grad_norm": 0.47160562951935325, + "learning_rate": 1.9586358610867715e-06, + "loss": 0.5293, + "step": 12911 + }, + { + "epoch": 3.4277180406212664, + "grad_norm": 0.4485740460042685, + "learning_rate": 1.9582950223846405e-06, + "loss": 0.5745, + "step": 12912 + }, + { + "epoch": 3.42798353909465, + "grad_norm": 0.4658432484885228, + "learning_rate": 1.9579541942470844e-06, + "loss": 0.5131, + "step": 12913 + }, + { + "epoch": 3.428249037568034, + "grad_norm": 0.45407041267032655, + "learning_rate": 1.9576133766807463e-06, + "loss": 0.529, + "step": 12914 + }, + { + "epoch": 3.4285145360414178, + "grad_norm": 0.4725532226115244, + "learning_rate": 1.957272569692276e-06, + "loss": 0.57, + "step": 12915 + }, + { + "epoch": 3.4287800345148014, + "grad_norm": 0.46011816212871426, + "learning_rate": 1.95693177328832e-06, + "loss": 0.5403, + "step": 12916 + }, + { + "epoch": 3.4290455329881855, + "grad_norm": 0.4468210798597445, + "learning_rate": 1.956590987475524e-06, + "loss": 0.557, + "step": 12917 + }, + { + "epoch": 3.429311031461569, + "grad_norm": 0.4625799592730683, + "learning_rate": 1.956250212260534e-06, + "loss": 0.5736, + "step": 12918 + }, + { + "epoch": 3.4295765299349528, + "grad_norm": 0.4549201024386158, + "learning_rate": 1.9559094476499973e-06, + "loss": 0.5381, + "step": 12919 + }, + { + "epoch": 3.429842028408337, + "grad_norm": 0.4553946652009198, + "learning_rate": 1.955568693650557e-06, + "loss": 0.5448, + "step": 12920 + }, + { + "epoch": 3.4301075268817205, + "grad_norm": 0.45028598328021646, + "learning_rate": 1.955227950268861e-06, + "loss": 0.5606, + "step": 12921 + }, + { + "epoch": 3.430373025355104, + "grad_norm": 0.4461221682248535, + "learning_rate": 1.954887217511553e-06, + "loss": 0.5507, + "step": 12922 + }, + { + "epoch": 3.430638523828488, + "grad_norm": 0.4574590538434611, + "learning_rate": 1.9545464953852796e-06, + "loss": 0.5421, + "step": 12923 + }, + { + "epoch": 3.430904022301872, + "grad_norm": 0.46589754237031117, + "learning_rate": 1.9542057838966854e-06, + "loss": 0.5535, + "step": 12924 + }, + { + "epoch": 3.4311695207752555, + "grad_norm": 0.44298247358947884, + "learning_rate": 1.953865083052414e-06, + "loss": 0.4863, + "step": 12925 + }, + { + "epoch": 3.431435019248639, + "grad_norm": 0.46628421302932604, + "learning_rate": 1.9535243928591108e-06, + "loss": 0.5141, + "step": 12926 + }, + { + "epoch": 3.4317005177220232, + "grad_norm": 0.45992650915481853, + "learning_rate": 1.9531837133234195e-06, + "loss": 0.5544, + "step": 12927 + }, + { + "epoch": 3.431966016195407, + "grad_norm": 0.4592860348279172, + "learning_rate": 1.9528430444519853e-06, + "loss": 0.5605, + "step": 12928 + }, + { + "epoch": 3.4322315146687905, + "grad_norm": 0.4607819633495851, + "learning_rate": 1.952502386251451e-06, + "loss": 0.5341, + "step": 12929 + }, + { + "epoch": 3.4324970131421746, + "grad_norm": 0.461864596051747, + "learning_rate": 1.952161738728462e-06, + "loss": 0.5465, + "step": 12930 + }, + { + "epoch": 3.4327625116155582, + "grad_norm": 0.45195735747783655, + "learning_rate": 1.9518211018896593e-06, + "loss": 0.5769, + "step": 12931 + }, + { + "epoch": 3.433028010088942, + "grad_norm": 0.479496426249268, + "learning_rate": 1.9514804757416873e-06, + "loss": 0.5449, + "step": 12932 + }, + { + "epoch": 3.433293508562326, + "grad_norm": 0.4517342348341856, + "learning_rate": 1.9511398602911888e-06, + "loss": 0.5433, + "step": 12933 + }, + { + "epoch": 3.4335590070357096, + "grad_norm": 0.46119555680016766, + "learning_rate": 1.950799255544807e-06, + "loss": 0.5689, + "step": 12934 + }, + { + "epoch": 3.4338245055090932, + "grad_norm": 0.4691819288154536, + "learning_rate": 1.9504586615091838e-06, + "loss": 0.5724, + "step": 12935 + }, + { + "epoch": 3.434090003982477, + "grad_norm": 0.4742284134038077, + "learning_rate": 1.950118078190964e-06, + "loss": 0.5507, + "step": 12936 + }, + { + "epoch": 3.434355502455861, + "grad_norm": 0.4628072947168689, + "learning_rate": 1.9497775055967868e-06, + "loss": 0.5636, + "step": 12937 + }, + { + "epoch": 3.4346210009292446, + "grad_norm": 0.44871096409521455, + "learning_rate": 1.9494369437332957e-06, + "loss": 0.6042, + "step": 12938 + }, + { + "epoch": 3.4348864994026282, + "grad_norm": 0.45240624036476185, + "learning_rate": 1.9490963926071318e-06, + "loss": 0.5242, + "step": 12939 + }, + { + "epoch": 3.4351519978760123, + "grad_norm": 0.45733864780703987, + "learning_rate": 1.948755852224937e-06, + "loss": 0.5379, + "step": 12940 + }, + { + "epoch": 3.435417496349396, + "grad_norm": 0.44943786161537186, + "learning_rate": 1.9484153225933535e-06, + "loss": 0.5392, + "step": 12941 + }, + { + "epoch": 3.4356829948227796, + "grad_norm": 0.4613805516585124, + "learning_rate": 1.9480748037190215e-06, + "loss": 0.5409, + "step": 12942 + }, + { + "epoch": 3.4359484932961637, + "grad_norm": 0.4488877160524104, + "learning_rate": 1.947734295608582e-06, + "loss": 0.5633, + "step": 12943 + }, + { + "epoch": 3.4362139917695473, + "grad_norm": 0.4856772261979697, + "learning_rate": 1.947393798268676e-06, + "loss": 0.5423, + "step": 12944 + }, + { + "epoch": 3.436479490242931, + "grad_norm": 0.4620520066414312, + "learning_rate": 1.9470533117059436e-06, + "loss": 0.5693, + "step": 12945 + }, + { + "epoch": 3.436744988716315, + "grad_norm": 0.4622929628487836, + "learning_rate": 1.9467128359270255e-06, + "loss": 0.5579, + "step": 12946 + }, + { + "epoch": 3.4370104871896987, + "grad_norm": 0.4635115627694077, + "learning_rate": 1.9463723709385617e-06, + "loss": 0.5498, + "step": 12947 + }, + { + "epoch": 3.4372759856630823, + "grad_norm": 0.46035499387710876, + "learning_rate": 1.9460319167471934e-06, + "loss": 0.5891, + "step": 12948 + }, + { + "epoch": 3.4375414841364664, + "grad_norm": 0.46650348115065166, + "learning_rate": 1.9456914733595584e-06, + "loss": 0.5179, + "step": 12949 + }, + { + "epoch": 3.43780698260985, + "grad_norm": 0.45488514103637406, + "learning_rate": 1.9453510407822967e-06, + "loss": 0.5342, + "step": 12950 + }, + { + "epoch": 3.4380724810832337, + "grad_norm": 0.46553115481654656, + "learning_rate": 1.9450106190220475e-06, + "loss": 0.5726, + "step": 12951 + }, + { + "epoch": 3.438337979556618, + "grad_norm": 0.4700238232533625, + "learning_rate": 1.94467020808545e-06, + "loss": 0.564, + "step": 12952 + }, + { + "epoch": 3.4386034780300014, + "grad_norm": 0.45134818803427124, + "learning_rate": 1.944329807979144e-06, + "loss": 0.5195, + "step": 12953 + }, + { + "epoch": 3.438868976503385, + "grad_norm": 0.46680281992986916, + "learning_rate": 1.9439894187097665e-06, + "loss": 0.558, + "step": 12954 + }, + { + "epoch": 3.4391344749767687, + "grad_norm": 0.469956051572363, + "learning_rate": 1.9436490402839563e-06, + "loss": 0.5668, + "step": 12955 + }, + { + "epoch": 3.439399973450153, + "grad_norm": 0.4542521471498369, + "learning_rate": 1.9433086727083527e-06, + "loss": 0.5394, + "step": 12956 + }, + { + "epoch": 3.4396654719235364, + "grad_norm": 0.45752943744675856, + "learning_rate": 1.9429683159895926e-06, + "loss": 0.5587, + "step": 12957 + }, + { + "epoch": 3.43993097039692, + "grad_norm": 0.4741053283748696, + "learning_rate": 1.9426279701343143e-06, + "loss": 0.5798, + "step": 12958 + }, + { + "epoch": 3.440196468870304, + "grad_norm": 0.455475030055455, + "learning_rate": 1.9422876351491563e-06, + "loss": 0.5558, + "step": 12959 + }, + { + "epoch": 3.440461967343688, + "grad_norm": 0.46591850052394224, + "learning_rate": 1.9419473110407535e-06, + "loss": 0.554, + "step": 12960 + }, + { + "epoch": 3.4407274658170715, + "grad_norm": 0.45612183076371154, + "learning_rate": 1.9416069978157444e-06, + "loss": 0.5206, + "step": 12961 + }, + { + "epoch": 3.440992964290455, + "grad_norm": 0.47201723229335385, + "learning_rate": 1.941266695480767e-06, + "loss": 0.5328, + "step": 12962 + }, + { + "epoch": 3.441258462763839, + "grad_norm": 0.44387207177367005, + "learning_rate": 1.9409264040424562e-06, + "loss": 0.5684, + "step": 12963 + }, + { + "epoch": 3.441523961237223, + "grad_norm": 0.44364764654564254, + "learning_rate": 1.9405861235074497e-06, + "loss": 0.5585, + "step": 12964 + }, + { + "epoch": 3.4417894597106065, + "grad_norm": 0.4776868517754954, + "learning_rate": 1.9402458538823846e-06, + "loss": 0.5169, + "step": 12965 + }, + { + "epoch": 3.4420549581839905, + "grad_norm": 0.44619152084240254, + "learning_rate": 1.9399055951738945e-06, + "loss": 0.5228, + "step": 12966 + }, + { + "epoch": 3.442320456657374, + "grad_norm": 0.4524314091409978, + "learning_rate": 1.939565347388617e-06, + "loss": 0.5239, + "step": 12967 + }, + { + "epoch": 3.442585955130758, + "grad_norm": 0.45549877485236334, + "learning_rate": 1.939225110533187e-06, + "loss": 0.5657, + "step": 12968 + }, + { + "epoch": 3.442851453604142, + "grad_norm": 0.46692350590442394, + "learning_rate": 1.938884884614241e-06, + "loss": 0.5609, + "step": 12969 + }, + { + "epoch": 3.4431169520775255, + "grad_norm": 0.4802941260261063, + "learning_rate": 1.938544669638413e-06, + "loss": 0.5422, + "step": 12970 + }, + { + "epoch": 3.443382450550909, + "grad_norm": 0.45933349881384117, + "learning_rate": 1.9382044656123395e-06, + "loss": 0.5698, + "step": 12971 + }, + { + "epoch": 3.4436479490242933, + "grad_norm": 0.4785034673748016, + "learning_rate": 1.937864272542654e-06, + "loss": 0.5752, + "step": 12972 + }, + { + "epoch": 3.443913447497677, + "grad_norm": 0.47396198014458224, + "learning_rate": 1.937524090435991e-06, + "loss": 0.5533, + "step": 12973 + }, + { + "epoch": 3.4441789459710606, + "grad_norm": 0.46184131634312225, + "learning_rate": 1.937183919298986e-06, + "loss": 0.5911, + "step": 12974 + }, + { + "epoch": 3.4444444444444446, + "grad_norm": 0.4636070941821404, + "learning_rate": 1.936843759138272e-06, + "loss": 0.5669, + "step": 12975 + }, + { + "epoch": 3.4447099429178283, + "grad_norm": 0.4625025838055727, + "learning_rate": 1.9365036099604853e-06, + "loss": 0.555, + "step": 12976 + }, + { + "epoch": 3.444975441391212, + "grad_norm": 0.4637125271824205, + "learning_rate": 1.9361634717722564e-06, + "loss": 0.5129, + "step": 12977 + }, + { + "epoch": 3.445240939864596, + "grad_norm": 0.45672742797316584, + "learning_rate": 1.9358233445802207e-06, + "loss": 0.5504, + "step": 12978 + }, + { + "epoch": 3.4455064383379796, + "grad_norm": 0.47064989311627237, + "learning_rate": 1.9354832283910104e-06, + "loss": 0.5462, + "step": 12979 + }, + { + "epoch": 3.4457719368113633, + "grad_norm": 0.4597680371772616, + "learning_rate": 1.9351431232112595e-06, + "loss": 0.5383, + "step": 12980 + }, + { + "epoch": 3.446037435284747, + "grad_norm": 0.4489850745812362, + "learning_rate": 1.9348030290476007e-06, + "loss": 0.5017, + "step": 12981 + }, + { + "epoch": 3.446302933758131, + "grad_norm": 0.4747827448856693, + "learning_rate": 1.934462945906668e-06, + "loss": 0.5641, + "step": 12982 + }, + { + "epoch": 3.4465684322315147, + "grad_norm": 0.45141052289917666, + "learning_rate": 1.9341228737950913e-06, + "loss": 0.5332, + "step": 12983 + }, + { + "epoch": 3.4468339307048983, + "grad_norm": 0.47190703453491, + "learning_rate": 1.933782812719504e-06, + "loss": 0.5269, + "step": 12984 + }, + { + "epoch": 3.4470994291782824, + "grad_norm": 0.4565627979082249, + "learning_rate": 1.9334427626865383e-06, + "loss": 0.5484, + "step": 12985 + }, + { + "epoch": 3.447364927651666, + "grad_norm": 0.46595123566203817, + "learning_rate": 1.933102723702826e-06, + "loss": 0.5344, + "step": 12986 + }, + { + "epoch": 3.4476304261250497, + "grad_norm": 0.4510514834444204, + "learning_rate": 1.9327626957749986e-06, + "loss": 0.5501, + "step": 12987 + }, + { + "epoch": 3.4478959245984337, + "grad_norm": 0.45489740136255896, + "learning_rate": 1.932422678909688e-06, + "loss": 0.5627, + "step": 12988 + }, + { + "epoch": 3.4481614230718174, + "grad_norm": 0.45673515331617426, + "learning_rate": 1.9320826731135243e-06, + "loss": 0.543, + "step": 12989 + }, + { + "epoch": 3.448426921545201, + "grad_norm": 0.4640846262798768, + "learning_rate": 1.931742678393139e-06, + "loss": 0.5384, + "step": 12990 + }, + { + "epoch": 3.4486924200185847, + "grad_norm": 0.4542670857082774, + "learning_rate": 1.9314026947551626e-06, + "loss": 0.5359, + "step": 12991 + }, + { + "epoch": 3.4489579184919688, + "grad_norm": 0.46515483064203755, + "learning_rate": 1.931062722206226e-06, + "loss": 0.566, + "step": 12992 + }, + { + "epoch": 3.4492234169653524, + "grad_norm": 0.4604447888085944, + "learning_rate": 1.9307227607529593e-06, + "loss": 0.5551, + "step": 12993 + }, + { + "epoch": 3.449488915438736, + "grad_norm": 0.4446424993801546, + "learning_rate": 1.9303828104019937e-06, + "loss": 0.5346, + "step": 12994 + }, + { + "epoch": 3.44975441391212, + "grad_norm": 0.4484489012863962, + "learning_rate": 1.9300428711599566e-06, + "loss": 0.5352, + "step": 12995 + }, + { + "epoch": 3.4500199123855038, + "grad_norm": 0.46231579352536706, + "learning_rate": 1.9297029430334795e-06, + "loss": 0.532, + "step": 12996 + }, + { + "epoch": 3.4502854108588874, + "grad_norm": 0.4559535463049683, + "learning_rate": 1.929363026029191e-06, + "loss": 0.5567, + "step": 12997 + }, + { + "epoch": 3.4505509093322715, + "grad_norm": 0.46000416486733225, + "learning_rate": 1.929023120153721e-06, + "loss": 0.5744, + "step": 12998 + }, + { + "epoch": 3.450816407805655, + "grad_norm": 0.47586689951210737, + "learning_rate": 1.928683225413698e-06, + "loss": 0.5478, + "step": 12999 + }, + { + "epoch": 3.4510819062790388, + "grad_norm": 0.46517710338815815, + "learning_rate": 1.928343341815751e-06, + "loss": 0.5794, + "step": 13000 + }, + { + "epoch": 3.451347404752423, + "grad_norm": 0.46423245458865164, + "learning_rate": 1.928003469366508e-06, + "loss": 0.5449, + "step": 13001 + }, + { + "epoch": 3.4516129032258065, + "grad_norm": 0.4565970700937282, + "learning_rate": 1.9276636080725982e-06, + "loss": 0.535, + "step": 13002 + }, + { + "epoch": 3.45187840169919, + "grad_norm": 0.4743534714770601, + "learning_rate": 1.9273237579406494e-06, + "loss": 0.5566, + "step": 13003 + }, + { + "epoch": 3.452143900172574, + "grad_norm": 0.4611510372062144, + "learning_rate": 1.9269839189772895e-06, + "loss": 0.5505, + "step": 13004 + }, + { + "epoch": 3.452409398645958, + "grad_norm": 0.4615038078690836, + "learning_rate": 1.9266440911891465e-06, + "loss": 0.5423, + "step": 13005 + }, + { + "epoch": 3.4526748971193415, + "grad_norm": 0.45557747074812305, + "learning_rate": 1.926304274582847e-06, + "loss": 0.5415, + "step": 13006 + }, + { + "epoch": 3.4529403955927256, + "grad_norm": 0.4528829310353177, + "learning_rate": 1.925964469165019e-06, + "loss": 0.5423, + "step": 13007 + }, + { + "epoch": 3.4532058940661092, + "grad_norm": 0.47067752584222516, + "learning_rate": 1.925624674942289e-06, + "loss": 0.5628, + "step": 13008 + }, + { + "epoch": 3.453471392539493, + "grad_norm": 0.4607514602768546, + "learning_rate": 1.9252848919212846e-06, + "loss": 0.5355, + "step": 13009 + }, + { + "epoch": 3.4537368910128765, + "grad_norm": 0.44985792204100933, + "learning_rate": 1.9249451201086317e-06, + "loss": 0.5654, + "step": 13010 + }, + { + "epoch": 3.4540023894862606, + "grad_norm": 0.45449182303826563, + "learning_rate": 1.924605359510958e-06, + "loss": 0.5616, + "step": 13011 + }, + { + "epoch": 3.4542678879596442, + "grad_norm": 0.47226212212236696, + "learning_rate": 1.9242656101348874e-06, + "loss": 0.586, + "step": 13012 + }, + { + "epoch": 3.454533386433028, + "grad_norm": 0.4691818821114095, + "learning_rate": 1.9239258719870473e-06, + "loss": 0.5587, + "step": 13013 + }, + { + "epoch": 3.454798884906412, + "grad_norm": 0.45479814401412116, + "learning_rate": 1.923586145074063e-06, + "loss": 0.5378, + "step": 13014 + }, + { + "epoch": 3.4550643833797956, + "grad_norm": 0.4621556722674822, + "learning_rate": 1.9232464294025603e-06, + "loss": 0.5463, + "step": 13015 + }, + { + "epoch": 3.4553298818531792, + "grad_norm": 0.46983334038367863, + "learning_rate": 1.9229067249791642e-06, + "loss": 0.5186, + "step": 13016 + }, + { + "epoch": 3.455595380326563, + "grad_norm": 0.46574689452112356, + "learning_rate": 1.9225670318105004e-06, + "loss": 0.4987, + "step": 13017 + }, + { + "epoch": 3.455860878799947, + "grad_norm": 0.4493221731157662, + "learning_rate": 1.922227349903193e-06, + "loss": 0.5164, + "step": 13018 + }, + { + "epoch": 3.4561263772733306, + "grad_norm": 0.45585204320790923, + "learning_rate": 1.921887679263867e-06, + "loss": 0.5781, + "step": 13019 + }, + { + "epoch": 3.4563918757467142, + "grad_norm": 0.46515726418237735, + "learning_rate": 1.9215480198991466e-06, + "loss": 0.5308, + "step": 13020 + }, + { + "epoch": 3.4566573742200983, + "grad_norm": 0.45750436723102567, + "learning_rate": 1.921208371815656e-06, + "loss": 0.5863, + "step": 13021 + }, + { + "epoch": 3.456922872693482, + "grad_norm": 0.46999797936050514, + "learning_rate": 1.9208687350200203e-06, + "loss": 0.525, + "step": 13022 + }, + { + "epoch": 3.4571883711668656, + "grad_norm": 0.4576964641477319, + "learning_rate": 1.920529109518861e-06, + "loss": 0.518, + "step": 13023 + }, + { + "epoch": 3.4574538696402497, + "grad_norm": 0.4600663580915891, + "learning_rate": 1.9201894953188033e-06, + "loss": 0.5569, + "step": 13024 + }, + { + "epoch": 3.4577193681136333, + "grad_norm": 0.4544749581659812, + "learning_rate": 1.9198498924264696e-06, + "loss": 0.5049, + "step": 13025 + }, + { + "epoch": 3.457984866587017, + "grad_norm": 0.469239365371383, + "learning_rate": 1.919510300848483e-06, + "loss": 0.5541, + "step": 13026 + }, + { + "epoch": 3.458250365060401, + "grad_norm": 0.45208879213401876, + "learning_rate": 1.9191707205914677e-06, + "loss": 0.5682, + "step": 13027 + }, + { + "epoch": 3.4585158635337847, + "grad_norm": 0.44926342839868555, + "learning_rate": 1.9188311516620466e-06, + "loss": 0.5358, + "step": 13028 + }, + { + "epoch": 3.4587813620071683, + "grad_norm": 0.46910612332612955, + "learning_rate": 1.9184915940668395e-06, + "loss": 0.5737, + "step": 13029 + }, + { + "epoch": 3.4590468604805524, + "grad_norm": 0.4606913784856473, + "learning_rate": 1.9181520478124705e-06, + "loss": 0.5358, + "step": 13030 + }, + { + "epoch": 3.459312358953936, + "grad_norm": 0.4548509454852839, + "learning_rate": 1.917812512905561e-06, + "loss": 0.5558, + "step": 13031 + }, + { + "epoch": 3.4595778574273197, + "grad_norm": 0.4642261328111714, + "learning_rate": 1.917472989352733e-06, + "loss": 0.5515, + "step": 13032 + }, + { + "epoch": 3.459843355900704, + "grad_norm": 0.45112194009618994, + "learning_rate": 1.9171334771606076e-06, + "loss": 0.5609, + "step": 13033 + }, + { + "epoch": 3.4601088543740874, + "grad_norm": 0.4477191705133069, + "learning_rate": 1.9167939763358072e-06, + "loss": 0.5354, + "step": 13034 + }, + { + "epoch": 3.460374352847471, + "grad_norm": 0.4524173428885542, + "learning_rate": 1.9164544868849517e-06, + "loss": 0.5604, + "step": 13035 + }, + { + "epoch": 3.4606398513208547, + "grad_norm": 0.4625634275331173, + "learning_rate": 1.9161150088146627e-06, + "loss": 0.5377, + "step": 13036 + }, + { + "epoch": 3.460905349794239, + "grad_norm": 0.4739278956968761, + "learning_rate": 1.9157755421315607e-06, + "loss": 0.5264, + "step": 13037 + }, + { + "epoch": 3.4611708482676224, + "grad_norm": 0.45844106438913584, + "learning_rate": 1.9154360868422656e-06, + "loss": 0.5721, + "step": 13038 + }, + { + "epoch": 3.461436346741006, + "grad_norm": 0.4700908487606907, + "learning_rate": 1.9150966429533982e-06, + "loss": 0.5785, + "step": 13039 + }, + { + "epoch": 3.46170184521439, + "grad_norm": 0.47715054490431075, + "learning_rate": 1.9147572104715795e-06, + "loss": 0.5617, + "step": 13040 + }, + { + "epoch": 3.461967343687774, + "grad_norm": 0.45784774837914427, + "learning_rate": 1.9144177894034273e-06, + "loss": 0.5082, + "step": 13041 + }, + { + "epoch": 3.4622328421611575, + "grad_norm": 0.4518804508076868, + "learning_rate": 1.9140783797555615e-06, + "loss": 0.5167, + "step": 13042 + }, + { + "epoch": 3.4624983406345415, + "grad_norm": 0.47300834412284143, + "learning_rate": 1.9137389815346017e-06, + "loss": 0.5326, + "step": 13043 + }, + { + "epoch": 3.462763839107925, + "grad_norm": 0.4616618705479794, + "learning_rate": 1.913399594747168e-06, + "loss": 0.5402, + "step": 13044 + }, + { + "epoch": 3.463029337581309, + "grad_norm": 0.4551845044573789, + "learning_rate": 1.9130602193998785e-06, + "loss": 0.5754, + "step": 13045 + }, + { + "epoch": 3.4632948360546925, + "grad_norm": 0.46970169612768575, + "learning_rate": 1.912720855499351e-06, + "loss": 0.5335, + "step": 13046 + }, + { + "epoch": 3.4635603345280765, + "grad_norm": 0.4655763088520913, + "learning_rate": 1.912381503052205e-06, + "loss": 0.543, + "step": 13047 + }, + { + "epoch": 3.46382583300146, + "grad_norm": 0.5140019203188393, + "learning_rate": 1.912042162065059e-06, + "loss": 0.5473, + "step": 13048 + }, + { + "epoch": 3.464091331474844, + "grad_norm": 0.46056213612832314, + "learning_rate": 1.9117028325445295e-06, + "loss": 0.5518, + "step": 13049 + }, + { + "epoch": 3.464356829948228, + "grad_norm": 0.4616426820011735, + "learning_rate": 1.9113635144972355e-06, + "loss": 0.5824, + "step": 13050 + }, + { + "epoch": 3.4646223284216116, + "grad_norm": 0.45991151898802507, + "learning_rate": 1.9110242079297947e-06, + "loss": 0.5245, + "step": 13051 + }, + { + "epoch": 3.464887826894995, + "grad_norm": 0.4620963646500981, + "learning_rate": 1.9106849128488237e-06, + "loss": 0.5492, + "step": 13052 + }, + { + "epoch": 3.4651533253683793, + "grad_norm": 0.4534765755682093, + "learning_rate": 1.9103456292609395e-06, + "loss": 0.4942, + "step": 13053 + }, + { + "epoch": 3.465418823841763, + "grad_norm": 0.48193355425528095, + "learning_rate": 1.910006357172759e-06, + "loss": 0.5458, + "step": 13054 + }, + { + "epoch": 3.4656843223151466, + "grad_norm": 0.4687853146649047, + "learning_rate": 1.9096670965908997e-06, + "loss": 0.5549, + "step": 13055 + }, + { + "epoch": 3.4659498207885306, + "grad_norm": 0.4586536230777629, + "learning_rate": 1.909327847521977e-06, + "loss": 0.5435, + "step": 13056 + }, + { + "epoch": 3.4662153192619143, + "grad_norm": 0.461099145124684, + "learning_rate": 1.9089886099726085e-06, + "loss": 0.5491, + "step": 13057 + }, + { + "epoch": 3.466480817735298, + "grad_norm": 0.4747190918901796, + "learning_rate": 1.9086493839494082e-06, + "loss": 0.5778, + "step": 13058 + }, + { + "epoch": 3.466746316208682, + "grad_norm": 0.4496562357210341, + "learning_rate": 1.9083101694589927e-06, + "loss": 0.5205, + "step": 13059 + }, + { + "epoch": 3.4670118146820657, + "grad_norm": 0.4628152852119069, + "learning_rate": 1.907970966507978e-06, + "loss": 0.549, + "step": 13060 + }, + { + "epoch": 3.4672773131554493, + "grad_norm": 0.4632163628687318, + "learning_rate": 1.9076317751029784e-06, + "loss": 0.5389, + "step": 13061 + }, + { + "epoch": 3.4675428116288334, + "grad_norm": 0.4619241150626122, + "learning_rate": 1.90729259525061e-06, + "loss": 0.5224, + "step": 13062 + }, + { + "epoch": 3.467808310102217, + "grad_norm": 0.44826178100420744, + "learning_rate": 1.9069534269574875e-06, + "loss": 0.5668, + "step": 13063 + }, + { + "epoch": 3.4680738085756007, + "grad_norm": 0.45579877008943154, + "learning_rate": 1.9066142702302248e-06, + "loss": 0.5458, + "step": 13064 + }, + { + "epoch": 3.4683393070489843, + "grad_norm": 0.4617797553771512, + "learning_rate": 1.9062751250754367e-06, + "loss": 0.5638, + "step": 13065 + }, + { + "epoch": 3.4686048055223684, + "grad_norm": 0.4699196368062382, + "learning_rate": 1.9059359914997372e-06, + "loss": 0.5572, + "step": 13066 + }, + { + "epoch": 3.468870303995752, + "grad_norm": 0.45506100296512775, + "learning_rate": 1.9055968695097404e-06, + "loss": 0.5786, + "step": 13067 + }, + { + "epoch": 3.4691358024691357, + "grad_norm": 0.46458393426558037, + "learning_rate": 1.9052577591120603e-06, + "loss": 0.5197, + "step": 13068 + }, + { + "epoch": 3.4694013009425198, + "grad_norm": 0.4633086425999223, + "learning_rate": 1.9049186603133108e-06, + "loss": 0.5708, + "step": 13069 + }, + { + "epoch": 3.4696667994159034, + "grad_norm": 0.4554800080836122, + "learning_rate": 1.9045795731201034e-06, + "loss": 0.5383, + "step": 13070 + }, + { + "epoch": 3.469932297889287, + "grad_norm": 0.45265343120360074, + "learning_rate": 1.9042404975390522e-06, + "loss": 0.5342, + "step": 13071 + }, + { + "epoch": 3.4701977963626707, + "grad_norm": 0.4601467132721148, + "learning_rate": 1.9039014335767694e-06, + "loss": 0.5106, + "step": 13072 + }, + { + "epoch": 3.4704632948360548, + "grad_norm": 0.4726864455980346, + "learning_rate": 1.9035623812398688e-06, + "loss": 0.5233, + "step": 13073 + }, + { + "epoch": 3.4707287933094384, + "grad_norm": 0.4746648009140776, + "learning_rate": 1.9032233405349632e-06, + "loss": 0.5532, + "step": 13074 + }, + { + "epoch": 3.470994291782822, + "grad_norm": 0.4548252819721672, + "learning_rate": 1.9028843114686625e-06, + "loss": 0.5431, + "step": 13075 + }, + { + "epoch": 3.471259790256206, + "grad_norm": 0.4611773211640143, + "learning_rate": 1.9025452940475795e-06, + "loss": 0.5221, + "step": 13076 + }, + { + "epoch": 3.4715252887295898, + "grad_norm": 0.4482303071975979, + "learning_rate": 1.9022062882783263e-06, + "loss": 0.5704, + "step": 13077 + }, + { + "epoch": 3.4717907872029734, + "grad_norm": 0.4576110816729715, + "learning_rate": 1.9018672941675142e-06, + "loss": 0.5407, + "step": 13078 + }, + { + "epoch": 3.4720562856763575, + "grad_norm": 0.48543926866284937, + "learning_rate": 1.9015283117217543e-06, + "loss": 0.5391, + "step": 13079 + }, + { + "epoch": 3.472321784149741, + "grad_norm": 0.4781050667958576, + "learning_rate": 1.901189340947658e-06, + "loss": 0.5377, + "step": 13080 + }, + { + "epoch": 3.4725872826231248, + "grad_norm": 0.4586390122398653, + "learning_rate": 1.9008503818518354e-06, + "loss": 0.5141, + "step": 13081 + }, + { + "epoch": 3.472852781096509, + "grad_norm": 0.4669321724121264, + "learning_rate": 1.900511434440897e-06, + "loss": 0.5449, + "step": 13082 + }, + { + "epoch": 3.4731182795698925, + "grad_norm": 0.47109310017189765, + "learning_rate": 1.9001724987214538e-06, + "loss": 0.5487, + "step": 13083 + }, + { + "epoch": 3.473383778043276, + "grad_norm": 0.4677290850715841, + "learning_rate": 1.8998335747001151e-06, + "loss": 0.5061, + "step": 13084 + }, + { + "epoch": 3.4736492765166602, + "grad_norm": 0.4688399397970539, + "learning_rate": 1.8994946623834915e-06, + "loss": 0.5545, + "step": 13085 + }, + { + "epoch": 3.473914774990044, + "grad_norm": 0.4750485003142622, + "learning_rate": 1.899155761778193e-06, + "loss": 0.5892, + "step": 13086 + }, + { + "epoch": 3.4741802734634275, + "grad_norm": 0.4636533595233714, + "learning_rate": 1.8988168728908277e-06, + "loss": 0.5482, + "step": 13087 + }, + { + "epoch": 3.4744457719368116, + "grad_norm": 0.4851767125423052, + "learning_rate": 1.898477995728005e-06, + "loss": 0.5363, + "step": 13088 + }, + { + "epoch": 3.4747112704101952, + "grad_norm": 0.47499587715252584, + "learning_rate": 1.8981391302963342e-06, + "loss": 0.5709, + "step": 13089 + }, + { + "epoch": 3.474976768883579, + "grad_norm": 0.45779698420299514, + "learning_rate": 1.897800276602424e-06, + "loss": 0.5754, + "step": 13090 + }, + { + "epoch": 3.4752422673569625, + "grad_norm": 0.4649477856952448, + "learning_rate": 1.8974614346528827e-06, + "loss": 0.5193, + "step": 13091 + }, + { + "epoch": 3.4755077658303466, + "grad_norm": 0.4672334249412886, + "learning_rate": 1.8971226044543195e-06, + "loss": 0.5794, + "step": 13092 + }, + { + "epoch": 3.4757732643037302, + "grad_norm": 0.4561299731054907, + "learning_rate": 1.8967837860133408e-06, + "loss": 0.531, + "step": 13093 + }, + { + "epoch": 3.476038762777114, + "grad_norm": 0.46007594291993337, + "learning_rate": 1.8964449793365552e-06, + "loss": 0.544, + "step": 13094 + }, + { + "epoch": 3.476304261250498, + "grad_norm": 0.44921333712032363, + "learning_rate": 1.8961061844305707e-06, + "loss": 0.5076, + "step": 13095 + }, + { + "epoch": 3.4765697597238816, + "grad_norm": 0.4424093869623847, + "learning_rate": 1.8957674013019939e-06, + "loss": 0.5549, + "step": 13096 + }, + { + "epoch": 3.4768352581972652, + "grad_norm": 0.4656226841477639, + "learning_rate": 1.8954286299574326e-06, + "loss": 0.5394, + "step": 13097 + }, + { + "epoch": 3.4771007566706493, + "grad_norm": 0.49083088494877003, + "learning_rate": 1.8950898704034928e-06, + "loss": 0.5352, + "step": 13098 + }, + { + "epoch": 3.477366255144033, + "grad_norm": 0.43954675686839756, + "learning_rate": 1.8947511226467816e-06, + "loss": 0.5623, + "step": 13099 + }, + { + "epoch": 3.4776317536174166, + "grad_norm": 0.4614653643503416, + "learning_rate": 1.8944123866939053e-06, + "loss": 0.5407, + "step": 13100 + }, + { + "epoch": 3.4778972520908003, + "grad_norm": 0.4609393066520792, + "learning_rate": 1.8940736625514705e-06, + "loss": 0.593, + "step": 13101 + }, + { + "epoch": 3.4781627505641843, + "grad_norm": 0.4606841016586335, + "learning_rate": 1.8937349502260827e-06, + "loss": 0.4745, + "step": 13102 + }, + { + "epoch": 3.478428249037568, + "grad_norm": 0.47236562273268456, + "learning_rate": 1.893396249724349e-06, + "loss": 0.5428, + "step": 13103 + }, + { + "epoch": 3.4786937475109516, + "grad_norm": 0.46115447466717713, + "learning_rate": 1.8930575610528723e-06, + "loss": 0.53, + "step": 13104 + }, + { + "epoch": 3.4789592459843357, + "grad_norm": 0.4510127108505656, + "learning_rate": 1.8927188842182592e-06, + "loss": 0.5666, + "step": 13105 + }, + { + "epoch": 3.4792247444577193, + "grad_norm": 0.46337362742117966, + "learning_rate": 1.8923802192271147e-06, + "loss": 0.5573, + "step": 13106 + }, + { + "epoch": 3.479490242931103, + "grad_norm": 0.4516553698833458, + "learning_rate": 1.8920415660860442e-06, + "loss": 0.564, + "step": 13107 + }, + { + "epoch": 3.479755741404487, + "grad_norm": 0.45462409865497344, + "learning_rate": 1.891702924801651e-06, + "loss": 0.5397, + "step": 13108 + }, + { + "epoch": 3.4800212398778707, + "grad_norm": 0.44521164333003743, + "learning_rate": 1.8913642953805412e-06, + "loss": 0.548, + "step": 13109 + }, + { + "epoch": 3.4802867383512543, + "grad_norm": 0.47200593064224217, + "learning_rate": 1.8910256778293169e-06, + "loss": 0.5094, + "step": 13110 + }, + { + "epoch": 3.4805522368246384, + "grad_norm": 0.4741676227031946, + "learning_rate": 1.8906870721545832e-06, + "loss": 0.5818, + "step": 13111 + }, + { + "epoch": 3.480817735298022, + "grad_norm": 0.45919879621874155, + "learning_rate": 1.8903484783629433e-06, + "loss": 0.5626, + "step": 13112 + }, + { + "epoch": 3.4810832337714057, + "grad_norm": 0.4667153948976328, + "learning_rate": 1.8900098964610009e-06, + "loss": 0.5697, + "step": 13113 + }, + { + "epoch": 3.48134873224479, + "grad_norm": 0.4818615202166082, + "learning_rate": 1.8896713264553588e-06, + "loss": 0.5132, + "step": 13114 + }, + { + "epoch": 3.4816142307181734, + "grad_norm": 0.4571252144887379, + "learning_rate": 1.8893327683526214e-06, + "loss": 0.4993, + "step": 13115 + }, + { + "epoch": 3.481879729191557, + "grad_norm": 0.4508327623192309, + "learning_rate": 1.888994222159389e-06, + "loss": 0.506, + "step": 13116 + }, + { + "epoch": 3.482145227664941, + "grad_norm": 0.4604709533209294, + "learning_rate": 1.8886556878822648e-06, + "loss": 0.548, + "step": 13117 + }, + { + "epoch": 3.482410726138325, + "grad_norm": 0.4602635516792217, + "learning_rate": 1.888317165527852e-06, + "loss": 0.5701, + "step": 13118 + }, + { + "epoch": 3.4826762246117084, + "grad_norm": 0.46044951665657796, + "learning_rate": 1.8879786551027524e-06, + "loss": 0.5653, + "step": 13119 + }, + { + "epoch": 3.482941723085092, + "grad_norm": 0.45350328146382823, + "learning_rate": 1.8876401566135683e-06, + "loss": 0.5705, + "step": 13120 + }, + { + "epoch": 3.483207221558476, + "grad_norm": 0.46948995429357676, + "learning_rate": 1.8873016700668996e-06, + "loss": 0.5526, + "step": 13121 + }, + { + "epoch": 3.48347272003186, + "grad_norm": 0.4654642207694232, + "learning_rate": 1.8869631954693478e-06, + "loss": 0.5666, + "step": 13122 + }, + { + "epoch": 3.4837382185052435, + "grad_norm": 0.4556160800819771, + "learning_rate": 1.8866247328275156e-06, + "loss": 0.5538, + "step": 13123 + }, + { + "epoch": 3.4840037169786275, + "grad_norm": 0.45978900889141083, + "learning_rate": 1.8862862821480023e-06, + "loss": 0.536, + "step": 13124 + }, + { + "epoch": 3.484269215452011, + "grad_norm": 0.45080616960655173, + "learning_rate": 1.8859478434374095e-06, + "loss": 0.5598, + "step": 13125 + }, + { + "epoch": 3.484534713925395, + "grad_norm": 0.4648588181649765, + "learning_rate": 1.8856094167023375e-06, + "loss": 0.5736, + "step": 13126 + }, + { + "epoch": 3.4848002123987785, + "grad_norm": 0.45755327070054214, + "learning_rate": 1.8852710019493858e-06, + "loss": 0.5467, + "step": 13127 + }, + { + "epoch": 3.4850657108721625, + "grad_norm": 0.45627472745113706, + "learning_rate": 1.8849325991851543e-06, + "loss": 0.5125, + "step": 13128 + }, + { + "epoch": 3.485331209345546, + "grad_norm": 0.4505009862628162, + "learning_rate": 1.8845942084162435e-06, + "loss": 0.5431, + "step": 13129 + }, + { + "epoch": 3.48559670781893, + "grad_norm": 0.46495264564422034, + "learning_rate": 1.8842558296492525e-06, + "loss": 0.5709, + "step": 13130 + }, + { + "epoch": 3.485862206292314, + "grad_norm": 0.4562322356932985, + "learning_rate": 1.88391746289078e-06, + "loss": 0.5348, + "step": 13131 + }, + { + "epoch": 3.4861277047656976, + "grad_norm": 0.4534614736578336, + "learning_rate": 1.8835791081474266e-06, + "loss": 0.5564, + "step": 13132 + }, + { + "epoch": 3.486393203239081, + "grad_norm": 0.46696695532369054, + "learning_rate": 1.8832407654257888e-06, + "loss": 0.5597, + "step": 13133 + }, + { + "epoch": 3.4866587017124653, + "grad_norm": 0.46089847027854514, + "learning_rate": 1.8829024347324662e-06, + "loss": 0.5627, + "step": 13134 + }, + { + "epoch": 3.486924200185849, + "grad_norm": 0.4833492203102998, + "learning_rate": 1.8825641160740572e-06, + "loss": 0.6206, + "step": 13135 + }, + { + "epoch": 3.4871896986592326, + "grad_norm": 0.4697515245265211, + "learning_rate": 1.8822258094571594e-06, + "loss": 0.5412, + "step": 13136 + }, + { + "epoch": 3.4874551971326166, + "grad_norm": 0.4426142784437776, + "learning_rate": 1.881887514888371e-06, + "loss": 0.5224, + "step": 13137 + }, + { + "epoch": 3.4877206956060003, + "grad_norm": 0.47129788669639094, + "learning_rate": 1.88154923237429e-06, + "loss": 0.5188, + "step": 13138 + }, + { + "epoch": 3.487986194079384, + "grad_norm": 0.4369495855962656, + "learning_rate": 1.8812109619215127e-06, + "loss": 0.5014, + "step": 13139 + }, + { + "epoch": 3.488251692552768, + "grad_norm": 0.47178333216586293, + "learning_rate": 1.8808727035366367e-06, + "loss": 0.5642, + "step": 13140 + }, + { + "epoch": 3.4885171910261517, + "grad_norm": 0.4800236894260246, + "learning_rate": 1.8805344572262588e-06, + "loss": 0.5151, + "step": 13141 + }, + { + "epoch": 3.4887826894995353, + "grad_norm": 0.46405485935384677, + "learning_rate": 1.8801962229969756e-06, + "loss": 0.5248, + "step": 13142 + }, + { + "epoch": 3.4890481879729194, + "grad_norm": 0.4642600858831924, + "learning_rate": 1.8798580008553844e-06, + "loss": 0.5311, + "step": 13143 + }, + { + "epoch": 3.489313686446303, + "grad_norm": 0.45091809800597543, + "learning_rate": 1.8795197908080798e-06, + "loss": 0.5763, + "step": 13144 + }, + { + "epoch": 3.4895791849196867, + "grad_norm": 0.47113371351316113, + "learning_rate": 1.8791815928616585e-06, + "loss": 0.536, + "step": 13145 + }, + { + "epoch": 3.4898446833930703, + "grad_norm": 0.46543894029201194, + "learning_rate": 1.8788434070227163e-06, + "loss": 0.5632, + "step": 13146 + }, + { + "epoch": 3.4901101818664544, + "grad_norm": 0.4474759154916583, + "learning_rate": 1.8785052332978485e-06, + "loss": 0.5287, + "step": 13147 + }, + { + "epoch": 3.490375680339838, + "grad_norm": 0.44304691706684995, + "learning_rate": 1.87816707169365e-06, + "loss": 0.5583, + "step": 13148 + }, + { + "epoch": 3.4906411788132217, + "grad_norm": 0.4587295735368923, + "learning_rate": 1.8778289222167175e-06, + "loss": 0.5431, + "step": 13149 + }, + { + "epoch": 3.4909066772866058, + "grad_norm": 0.45910701903488704, + "learning_rate": 1.8774907848736434e-06, + "loss": 0.5565, + "step": 13150 + }, + { + "epoch": 3.4911721757599894, + "grad_norm": 0.45228110901400725, + "learning_rate": 1.877152659671023e-06, + "loss": 0.5687, + "step": 13151 + }, + { + "epoch": 3.491437674233373, + "grad_norm": 0.4577606175365332, + "learning_rate": 1.8768145466154505e-06, + "loss": 0.5327, + "step": 13152 + }, + { + "epoch": 3.491703172706757, + "grad_norm": 0.48210337548405086, + "learning_rate": 1.8764764457135204e-06, + "loss": 0.5609, + "step": 13153 + }, + { + "epoch": 3.4919686711801408, + "grad_norm": 0.43958827660965805, + "learning_rate": 1.8761383569718265e-06, + "loss": 0.5586, + "step": 13154 + }, + { + "epoch": 3.4922341696535244, + "grad_norm": 0.4688682857359886, + "learning_rate": 1.875800280396962e-06, + "loss": 0.5621, + "step": 13155 + }, + { + "epoch": 3.492499668126908, + "grad_norm": 0.4592577645187981, + "learning_rate": 1.8754622159955205e-06, + "loss": 0.5482, + "step": 13156 + }, + { + "epoch": 3.492765166600292, + "grad_norm": 0.46793632483283865, + "learning_rate": 1.8751241637740947e-06, + "loss": 0.5267, + "step": 13157 + }, + { + "epoch": 3.4930306650736758, + "grad_norm": 0.4627307671019581, + "learning_rate": 1.8747861237392772e-06, + "loss": 0.5212, + "step": 13158 + }, + { + "epoch": 3.4932961635470594, + "grad_norm": 0.46754340511612763, + "learning_rate": 1.8744480958976619e-06, + "loss": 0.572, + "step": 13159 + }, + { + "epoch": 3.4935616620204435, + "grad_norm": 0.4473161739180905, + "learning_rate": 1.8741100802558399e-06, + "loss": 0.5654, + "step": 13160 + }, + { + "epoch": 3.493827160493827, + "grad_norm": 0.47247497464222454, + "learning_rate": 1.8737720768204047e-06, + "loss": 0.5145, + "step": 13161 + }, + { + "epoch": 3.4940926589672108, + "grad_norm": 0.4584942994941058, + "learning_rate": 1.873434085597946e-06, + "loss": 0.5269, + "step": 13162 + }, + { + "epoch": 3.494358157440595, + "grad_norm": 0.44534585119093867, + "learning_rate": 1.8730961065950567e-06, + "loss": 0.5443, + "step": 13163 + }, + { + "epoch": 3.4946236559139785, + "grad_norm": 0.4577392355231514, + "learning_rate": 1.8727581398183287e-06, + "loss": 0.5473, + "step": 13164 + }, + { + "epoch": 3.494889154387362, + "grad_norm": 0.46469570974731267, + "learning_rate": 1.8724201852743526e-06, + "loss": 0.545, + "step": 13165 + }, + { + "epoch": 3.4951546528607462, + "grad_norm": 0.4401786987559962, + "learning_rate": 1.8720822429697193e-06, + "loss": 0.5353, + "step": 13166 + }, + { + "epoch": 3.49542015133413, + "grad_norm": 0.4542033800032534, + "learning_rate": 1.8717443129110212e-06, + "loss": 0.5262, + "step": 13167 + }, + { + "epoch": 3.4956856498075135, + "grad_norm": 0.46175017544644553, + "learning_rate": 1.8714063951048462e-06, + "loss": 0.5101, + "step": 13168 + }, + { + "epoch": 3.4959511482808976, + "grad_norm": 0.4600274337983664, + "learning_rate": 1.8710684895577853e-06, + "loss": 0.5822, + "step": 13169 + }, + { + "epoch": 3.4962166467542812, + "grad_norm": 0.4845600631862799, + "learning_rate": 1.8707305962764283e-06, + "loss": 0.5585, + "step": 13170 + }, + { + "epoch": 3.496482145227665, + "grad_norm": 0.4637113917385523, + "learning_rate": 1.870392715267366e-06, + "loss": 0.5427, + "step": 13171 + }, + { + "epoch": 3.496747643701049, + "grad_norm": 0.4651673444293219, + "learning_rate": 1.8700548465371877e-06, + "loss": 0.5845, + "step": 13172 + }, + { + "epoch": 3.4970131421744326, + "grad_norm": 0.460838142013931, + "learning_rate": 1.8697169900924817e-06, + "loss": 0.5315, + "step": 13173 + }, + { + "epoch": 3.4972786406478162, + "grad_norm": 0.45938638451473546, + "learning_rate": 1.8693791459398376e-06, + "loss": 0.5612, + "step": 13174 + }, + { + "epoch": 3.4975441391212, + "grad_norm": 0.46938756741124216, + "learning_rate": 1.8690413140858438e-06, + "loss": 0.566, + "step": 13175 + }, + { + "epoch": 3.497809637594584, + "grad_norm": 0.44987329047846875, + "learning_rate": 1.8687034945370896e-06, + "loss": 0.5032, + "step": 13176 + }, + { + "epoch": 3.4980751360679676, + "grad_norm": 0.46084191858840934, + "learning_rate": 1.868365687300163e-06, + "loss": 0.5133, + "step": 13177 + }, + { + "epoch": 3.4983406345413512, + "grad_norm": 0.4457448430336164, + "learning_rate": 1.868027892381653e-06, + "loss": 0.5457, + "step": 13178 + }, + { + "epoch": 3.4986061330147353, + "grad_norm": 0.45071788882914327, + "learning_rate": 1.8676901097881452e-06, + "loss": 0.4888, + "step": 13179 + }, + { + "epoch": 3.498871631488119, + "grad_norm": 0.439683004064656, + "learning_rate": 1.8673523395262289e-06, + "loss": 0.5582, + "step": 13180 + }, + { + "epoch": 3.4991371299615026, + "grad_norm": 0.4723019366949332, + "learning_rate": 1.8670145816024904e-06, + "loss": 0.5814, + "step": 13181 + }, + { + "epoch": 3.4994026284348863, + "grad_norm": 0.444985374888047, + "learning_rate": 1.8666768360235178e-06, + "loss": 0.5524, + "step": 13182 + }, + { + "epoch": 3.4996681269082703, + "grad_norm": 0.4520293573667949, + "learning_rate": 1.8663391027958971e-06, + "loss": 0.5306, + "step": 13183 + }, + { + "epoch": 3.499933625381654, + "grad_norm": 0.4513887029299811, + "learning_rate": 1.8660013819262162e-06, + "loss": 0.5699, + "step": 13184 + }, + { + "epoch": 3.5001991238550376, + "grad_norm": 0.4734639150604951, + "learning_rate": 1.86566367342106e-06, + "loss": 0.537, + "step": 13185 + }, + { + "epoch": 3.5004646223284217, + "grad_norm": 0.4483989773127945, + "learning_rate": 1.8653259772870153e-06, + "loss": 0.5544, + "step": 13186 + }, + { + "epoch": 3.5007301208018053, + "grad_norm": 0.4646666182030601, + "learning_rate": 1.8649882935306682e-06, + "loss": 0.5561, + "step": 13187 + }, + { + "epoch": 3.500995619275189, + "grad_norm": 0.4661581327853897, + "learning_rate": 1.864650622158604e-06, + "loss": 0.5582, + "step": 13188 + }, + { + "epoch": 3.501261117748573, + "grad_norm": 0.45406504418374377, + "learning_rate": 1.864312963177408e-06, + "loss": 0.5521, + "step": 13189 + }, + { + "epoch": 3.5015266162219567, + "grad_norm": 0.46087495382865173, + "learning_rate": 1.8639753165936666e-06, + "loss": 0.5447, + "step": 13190 + }, + { + "epoch": 3.5017921146953404, + "grad_norm": 0.4535285068202552, + "learning_rate": 1.8636376824139631e-06, + "loss": 0.5587, + "step": 13191 + }, + { + "epoch": 3.5020576131687244, + "grad_norm": 0.45915532879592114, + "learning_rate": 1.863300060644883e-06, + "loss": 0.5646, + "step": 13192 + }, + { + "epoch": 3.502323111642108, + "grad_norm": 0.4645994524977131, + "learning_rate": 1.8629624512930106e-06, + "loss": 0.5365, + "step": 13193 + }, + { + "epoch": 3.5025886101154917, + "grad_norm": 0.4513661305357719, + "learning_rate": 1.86262485436493e-06, + "loss": 0.535, + "step": 13194 + }, + { + "epoch": 3.502854108588876, + "grad_norm": 0.43827157958341256, + "learning_rate": 1.8622872698672262e-06, + "loss": 0.5154, + "step": 13195 + }, + { + "epoch": 3.5031196070622594, + "grad_norm": 0.4506053522555669, + "learning_rate": 1.8619496978064813e-06, + "loss": 0.5467, + "step": 13196 + }, + { + "epoch": 3.503385105535643, + "grad_norm": 0.46505737547395676, + "learning_rate": 1.8616121381892793e-06, + "loss": 0.5495, + "step": 13197 + }, + { + "epoch": 3.503650604009027, + "grad_norm": 0.4510448468374083, + "learning_rate": 1.8612745910222038e-06, + "loss": 0.5717, + "step": 13198 + }, + { + "epoch": 3.503916102482411, + "grad_norm": 0.47066050125560743, + "learning_rate": 1.8609370563118375e-06, + "loss": 0.5596, + "step": 13199 + }, + { + "epoch": 3.5041816009557945, + "grad_norm": 0.466524672061839, + "learning_rate": 1.8605995340647637e-06, + "loss": 0.5591, + "step": 13200 + }, + { + "epoch": 3.5044470994291785, + "grad_norm": 0.44756731624600676, + "learning_rate": 1.8602620242875646e-06, + "loss": 0.5313, + "step": 13201 + }, + { + "epoch": 3.504712597902562, + "grad_norm": 0.47791408719069856, + "learning_rate": 1.8599245269868218e-06, + "loss": 0.5553, + "step": 13202 + }, + { + "epoch": 3.504978096375946, + "grad_norm": 0.46507393385105034, + "learning_rate": 1.8595870421691181e-06, + "loss": 0.5799, + "step": 13203 + }, + { + "epoch": 3.5052435948493295, + "grad_norm": 0.467800533648253, + "learning_rate": 1.8592495698410352e-06, + "loss": 0.5519, + "step": 13204 + }, + { + "epoch": 3.5055090933227135, + "grad_norm": 0.4532616912575588, + "learning_rate": 1.8589121100091543e-06, + "loss": 0.5192, + "step": 13205 + }, + { + "epoch": 3.505774591796097, + "grad_norm": 0.4532209423553535, + "learning_rate": 1.8585746626800568e-06, + "loss": 0.5155, + "step": 13206 + }, + { + "epoch": 3.506040090269481, + "grad_norm": 0.46289448648902665, + "learning_rate": 1.8582372278603251e-06, + "loss": 0.5157, + "step": 13207 + }, + { + "epoch": 3.5063055887428645, + "grad_norm": 0.4561396995106719, + "learning_rate": 1.8578998055565378e-06, + "loss": 0.5636, + "step": 13208 + }, + { + "epoch": 3.5065710872162486, + "grad_norm": 0.47240421547746986, + "learning_rate": 1.8575623957752758e-06, + "loss": 0.5351, + "step": 13209 + }, + { + "epoch": 3.506836585689632, + "grad_norm": 0.47129968397973104, + "learning_rate": 1.8572249985231206e-06, + "loss": 0.548, + "step": 13210 + }, + { + "epoch": 3.507102084163016, + "grad_norm": 0.46654157997260776, + "learning_rate": 1.8568876138066515e-06, + "loss": 0.544, + "step": 13211 + }, + { + "epoch": 3.5073675826364, + "grad_norm": 0.449067104004744, + "learning_rate": 1.8565502416324488e-06, + "loss": 0.5486, + "step": 13212 + }, + { + "epoch": 3.5076330811097836, + "grad_norm": 0.4837915043261843, + "learning_rate": 1.8562128820070927e-06, + "loss": 0.5792, + "step": 13213 + }, + { + "epoch": 3.507898579583167, + "grad_norm": 0.47048910569028435, + "learning_rate": 1.8558755349371607e-06, + "loss": 0.5497, + "step": 13214 + }, + { + "epoch": 3.5081640780565513, + "grad_norm": 0.4502862274042531, + "learning_rate": 1.8555382004292328e-06, + "loss": 0.5461, + "step": 13215 + }, + { + "epoch": 3.508429576529935, + "grad_norm": 0.48896827945236027, + "learning_rate": 1.855200878489888e-06, + "loss": 0.5508, + "step": 13216 + }, + { + "epoch": 3.5086950750033186, + "grad_norm": 0.47388248050262494, + "learning_rate": 1.8548635691257047e-06, + "loss": 0.5559, + "step": 13217 + }, + { + "epoch": 3.5089605734767026, + "grad_norm": 0.4491553182116303, + "learning_rate": 1.854526272343262e-06, + "loss": 0.55, + "step": 13218 + }, + { + "epoch": 3.5092260719500863, + "grad_norm": 0.4756736157273259, + "learning_rate": 1.8541889881491367e-06, + "loss": 0.5712, + "step": 13219 + }, + { + "epoch": 3.50949157042347, + "grad_norm": 0.5005277079223958, + "learning_rate": 1.8538517165499077e-06, + "loss": 0.5456, + "step": 13220 + }, + { + "epoch": 3.509757068896854, + "grad_norm": 0.4475895248385321, + "learning_rate": 1.853514457552152e-06, + "loss": 0.5684, + "step": 13221 + }, + { + "epoch": 3.5100225673702377, + "grad_norm": 0.44870713024964237, + "learning_rate": 1.853177211162447e-06, + "loss": 0.5213, + "step": 13222 + }, + { + "epoch": 3.5102880658436213, + "grad_norm": 0.45681337885406564, + "learning_rate": 1.8528399773873705e-06, + "loss": 0.5413, + "step": 13223 + }, + { + "epoch": 3.5105535643170054, + "grad_norm": 0.4712305929267227, + "learning_rate": 1.8525027562335e-06, + "loss": 0.5611, + "step": 13224 + }, + { + "epoch": 3.510819062790389, + "grad_norm": 0.45428853962533106, + "learning_rate": 1.8521655477074096e-06, + "loss": 0.5669, + "step": 13225 + }, + { + "epoch": 3.5110845612637727, + "grad_norm": 0.46873811593732023, + "learning_rate": 1.8518283518156777e-06, + "loss": 0.5712, + "step": 13226 + }, + { + "epoch": 3.5113500597371567, + "grad_norm": 0.46113602968033557, + "learning_rate": 1.8514911685648798e-06, + "loss": 0.5457, + "step": 13227 + }, + { + "epoch": 3.5116155582105404, + "grad_norm": 0.4729629403413077, + "learning_rate": 1.8511539979615917e-06, + "loss": 0.5614, + "step": 13228 + }, + { + "epoch": 3.511881056683924, + "grad_norm": 0.4514113331773916, + "learning_rate": 1.8508168400123892e-06, + "loss": 0.5333, + "step": 13229 + }, + { + "epoch": 3.512146555157308, + "grad_norm": 0.4529695813427385, + "learning_rate": 1.850479694723849e-06, + "loss": 0.5571, + "step": 13230 + }, + { + "epoch": 3.5124120536306918, + "grad_norm": 0.4783513167964535, + "learning_rate": 1.8501425621025442e-06, + "loss": 0.5584, + "step": 13231 + }, + { + "epoch": 3.5126775521040754, + "grad_norm": 0.4628705015491312, + "learning_rate": 1.8498054421550503e-06, + "loss": 0.5116, + "step": 13232 + }, + { + "epoch": 3.512943050577459, + "grad_norm": 0.4273953725805247, + "learning_rate": 1.8494683348879425e-06, + "loss": 0.5605, + "step": 13233 + }, + { + "epoch": 3.513208549050843, + "grad_norm": 0.4632686395473244, + "learning_rate": 1.8491312403077949e-06, + "loss": 0.5283, + "step": 13234 + }, + { + "epoch": 3.5134740475242268, + "grad_norm": 0.44693044747784266, + "learning_rate": 1.8487941584211816e-06, + "loss": 0.5589, + "step": 13235 + }, + { + "epoch": 3.5137395459976104, + "grad_norm": 0.47825926918762085, + "learning_rate": 1.8484570892346773e-06, + "loss": 0.5395, + "step": 13236 + }, + { + "epoch": 3.514005044470994, + "grad_norm": 0.46580654070141414, + "learning_rate": 1.8481200327548543e-06, + "loss": 0.5442, + "step": 13237 + }, + { + "epoch": 3.514270542944378, + "grad_norm": 0.47061693879046246, + "learning_rate": 1.847782988988287e-06, + "loss": 0.5436, + "step": 13238 + }, + { + "epoch": 3.5145360414177618, + "grad_norm": 0.4424925894558598, + "learning_rate": 1.8474459579415483e-06, + "loss": 0.5084, + "step": 13239 + }, + { + "epoch": 3.5148015398911454, + "grad_norm": 0.46315892406069886, + "learning_rate": 1.847108939621211e-06, + "loss": 0.5303, + "step": 13240 + }, + { + "epoch": 3.5150670383645295, + "grad_norm": 0.4674993342925463, + "learning_rate": 1.8467719340338488e-06, + "loss": 0.5426, + "step": 13241 + }, + { + "epoch": 3.515332536837913, + "grad_norm": 0.4602981208284301, + "learning_rate": 1.8464349411860328e-06, + "loss": 0.5852, + "step": 13242 + }, + { + "epoch": 3.5155980353112968, + "grad_norm": 0.46143714845301903, + "learning_rate": 1.846097961084335e-06, + "loss": 0.5694, + "step": 13243 + }, + { + "epoch": 3.515863533784681, + "grad_norm": 0.4715205101564161, + "learning_rate": 1.8457609937353282e-06, + "loss": 0.5562, + "step": 13244 + }, + { + "epoch": 3.5161290322580645, + "grad_norm": 0.45765554411315496, + "learning_rate": 1.845424039145584e-06, + "loss": 0.5587, + "step": 13245 + }, + { + "epoch": 3.516394530731448, + "grad_norm": 0.4709391265367314, + "learning_rate": 1.8450870973216737e-06, + "loss": 0.5535, + "step": 13246 + }, + { + "epoch": 3.5166600292048322, + "grad_norm": 0.47008690742043996, + "learning_rate": 1.844750168270169e-06, + "loss": 0.5601, + "step": 13247 + }, + { + "epoch": 3.516925527678216, + "grad_norm": 0.47340117550938954, + "learning_rate": 1.8444132519976398e-06, + "loss": 0.5372, + "step": 13248 + }, + { + "epoch": 3.5171910261515995, + "grad_norm": 0.4457831933419617, + "learning_rate": 1.8440763485106571e-06, + "loss": 0.5237, + "step": 13249 + }, + { + "epoch": 3.5174565246249836, + "grad_norm": 0.45639209819098053, + "learning_rate": 1.843739457815792e-06, + "loss": 0.5234, + "step": 13250 + }, + { + "epoch": 3.5177220230983672, + "grad_norm": 0.45561845301128556, + "learning_rate": 1.8434025799196143e-06, + "loss": 0.5538, + "step": 13251 + }, + { + "epoch": 3.517987521571751, + "grad_norm": 0.4709719895481183, + "learning_rate": 1.8430657148286935e-06, + "loss": 0.5438, + "step": 13252 + }, + { + "epoch": 3.518253020045135, + "grad_norm": 0.45973882588537845, + "learning_rate": 1.842728862549601e-06, + "loss": 0.5702, + "step": 13253 + }, + { + "epoch": 3.5185185185185186, + "grad_norm": 0.45834895771424367, + "learning_rate": 1.8423920230889034e-06, + "loss": 0.5649, + "step": 13254 + }, + { + "epoch": 3.5187840169919022, + "grad_norm": 0.46665031091715986, + "learning_rate": 1.842055196453172e-06, + "loss": 0.5558, + "step": 13255 + }, + { + "epoch": 3.5190495154652863, + "grad_norm": 0.45325246521907353, + "learning_rate": 1.841718382648975e-06, + "loss": 0.5408, + "step": 13256 + }, + { + "epoch": 3.51931501393867, + "grad_norm": 0.470805498621211, + "learning_rate": 1.8413815816828814e-06, + "loss": 0.574, + "step": 13257 + }, + { + "epoch": 3.5195805124120536, + "grad_norm": 0.45198319067684056, + "learning_rate": 1.8410447935614594e-06, + "loss": 0.5724, + "step": 13258 + }, + { + "epoch": 3.5198460108854372, + "grad_norm": 0.4902640714583506, + "learning_rate": 1.840708018291279e-06, + "loss": 0.5593, + "step": 13259 + }, + { + "epoch": 3.5201115093588213, + "grad_norm": 0.4641633326195899, + "learning_rate": 1.8403712558789047e-06, + "loss": 0.5484, + "step": 13260 + }, + { + "epoch": 3.520377007832205, + "grad_norm": 0.4629416724239757, + "learning_rate": 1.8400345063309061e-06, + "loss": 0.5422, + "step": 13261 + }, + { + "epoch": 3.5206425063055886, + "grad_norm": 0.48338434642896316, + "learning_rate": 1.839697769653851e-06, + "loss": 0.56, + "step": 13262 + }, + { + "epoch": 3.5209080047789723, + "grad_norm": 0.47800442909365365, + "learning_rate": 1.8393610458543053e-06, + "loss": 0.572, + "step": 13263 + }, + { + "epoch": 3.5211735032523563, + "grad_norm": 0.46909204054396975, + "learning_rate": 1.8390243349388378e-06, + "loss": 0.5188, + "step": 13264 + }, + { + "epoch": 3.52143900172574, + "grad_norm": 0.4599590445302613, + "learning_rate": 1.8386876369140133e-06, + "loss": 0.536, + "step": 13265 + }, + { + "epoch": 3.5217045001991236, + "grad_norm": 0.4583910969891272, + "learning_rate": 1.8383509517863995e-06, + "loss": 0.5417, + "step": 13266 + }, + { + "epoch": 3.5219699986725077, + "grad_norm": 0.45847752569190753, + "learning_rate": 1.8380142795625616e-06, + "loss": 0.5387, + "step": 13267 + }, + { + "epoch": 3.5222354971458913, + "grad_norm": 0.4483954538654492, + "learning_rate": 1.8376776202490666e-06, + "loss": 0.5008, + "step": 13268 + }, + { + "epoch": 3.522500995619275, + "grad_norm": 0.484081330855067, + "learning_rate": 1.8373409738524792e-06, + "loss": 0.5649, + "step": 13269 + }, + { + "epoch": 3.522766494092659, + "grad_norm": 0.46437315742621915, + "learning_rate": 1.8370043403793664e-06, + "loss": 0.5676, + "step": 13270 + }, + { + "epoch": 3.5230319925660427, + "grad_norm": 0.47277666110222477, + "learning_rate": 1.8366677198362915e-06, + "loss": 0.5514, + "step": 13271 + }, + { + "epoch": 3.5232974910394264, + "grad_norm": 0.44436856196423663, + "learning_rate": 1.83633111222982e-06, + "loss": 0.5391, + "step": 13272 + }, + { + "epoch": 3.5235629895128104, + "grad_norm": 0.4668212535172757, + "learning_rate": 1.8359945175665168e-06, + "loss": 0.5115, + "step": 13273 + }, + { + "epoch": 3.523828487986194, + "grad_norm": 0.4670259967674747, + "learning_rate": 1.835657935852946e-06, + "loss": 0.5693, + "step": 13274 + }, + { + "epoch": 3.5240939864595777, + "grad_norm": 0.4613363608669837, + "learning_rate": 1.8353213670956715e-06, + "loss": 0.5382, + "step": 13275 + }, + { + "epoch": 3.524359484932962, + "grad_norm": 0.4713032662397352, + "learning_rate": 1.8349848113012596e-06, + "loss": 0.5622, + "step": 13276 + }, + { + "epoch": 3.5246249834063454, + "grad_norm": 0.470170149949293, + "learning_rate": 1.8346482684762711e-06, + "loss": 0.5837, + "step": 13277 + }, + { + "epoch": 3.524890481879729, + "grad_norm": 0.47276090716932484, + "learning_rate": 1.8343117386272704e-06, + "loss": 0.5413, + "step": 13278 + }, + { + "epoch": 3.525155980353113, + "grad_norm": 0.46894148530938673, + "learning_rate": 1.8339752217608206e-06, + "loss": 0.5307, + "step": 13279 + }, + { + "epoch": 3.525421478826497, + "grad_norm": 0.47187297321402716, + "learning_rate": 1.8336387178834846e-06, + "loss": 0.5284, + "step": 13280 + }, + { + "epoch": 3.5256869772998805, + "grad_norm": 0.4543840406280816, + "learning_rate": 1.8333022270018252e-06, + "loss": 0.5367, + "step": 13281 + }, + { + "epoch": 3.5259524757732645, + "grad_norm": 0.47936599613911385, + "learning_rate": 1.8329657491224056e-06, + "loss": 0.5662, + "step": 13282 + }, + { + "epoch": 3.526217974246648, + "grad_norm": 0.4625249162753347, + "learning_rate": 1.8326292842517862e-06, + "loss": 0.5406, + "step": 13283 + }, + { + "epoch": 3.526483472720032, + "grad_norm": 0.46943946690036775, + "learning_rate": 1.8322928323965296e-06, + "loss": 0.5277, + "step": 13284 + }, + { + "epoch": 3.526748971193416, + "grad_norm": 0.46599423073980717, + "learning_rate": 1.8319563935631978e-06, + "loss": 0.5462, + "step": 13285 + }, + { + "epoch": 3.5270144696667995, + "grad_norm": 0.4634110607645398, + "learning_rate": 1.831619967758352e-06, + "loss": 0.562, + "step": 13286 + }, + { + "epoch": 3.527279968140183, + "grad_norm": 0.463352001772562, + "learning_rate": 1.8312835549885533e-06, + "loss": 0.5208, + "step": 13287 + }, + { + "epoch": 3.527545466613567, + "grad_norm": 0.46889574647607046, + "learning_rate": 1.8309471552603635e-06, + "loss": 0.5265, + "step": 13288 + }, + { + "epoch": 3.527810965086951, + "grad_norm": 0.4777500723757507, + "learning_rate": 1.8306107685803414e-06, + "loss": 0.526, + "step": 13289 + }, + { + "epoch": 3.5280764635603346, + "grad_norm": 0.46716339708865007, + "learning_rate": 1.8302743949550482e-06, + "loss": 0.5448, + "step": 13290 + }, + { + "epoch": 3.528341962033718, + "grad_norm": 0.4665450385486929, + "learning_rate": 1.8299380343910444e-06, + "loss": 0.5611, + "step": 13291 + }, + { + "epoch": 3.528607460507102, + "grad_norm": 0.47304557238745437, + "learning_rate": 1.829601686894889e-06, + "loss": 0.572, + "step": 13292 + }, + { + "epoch": 3.528872958980486, + "grad_norm": 0.4527224953404673, + "learning_rate": 1.829265352473143e-06, + "loss": 0.5777, + "step": 13293 + }, + { + "epoch": 3.5291384574538696, + "grad_norm": 0.47273454086721106, + "learning_rate": 1.8289290311323643e-06, + "loss": 0.5353, + "step": 13294 + }, + { + "epoch": 3.529403955927253, + "grad_norm": 0.4766050228456209, + "learning_rate": 1.8285927228791122e-06, + "loss": 0.5744, + "step": 13295 + }, + { + "epoch": 3.5296694544006373, + "grad_norm": 0.4677897275648548, + "learning_rate": 1.8282564277199464e-06, + "loss": 0.5861, + "step": 13296 + }, + { + "epoch": 3.529934952874021, + "grad_norm": 0.4598820405046237, + "learning_rate": 1.8279201456614248e-06, + "loss": 0.5292, + "step": 13297 + }, + { + "epoch": 3.5302004513474046, + "grad_norm": 0.4598023766553167, + "learning_rate": 1.8275838767101057e-06, + "loss": 0.569, + "step": 13298 + }, + { + "epoch": 3.5304659498207887, + "grad_norm": 0.4746521254565789, + "learning_rate": 1.8272476208725488e-06, + "loss": 0.5104, + "step": 13299 + }, + { + "epoch": 3.5307314482941723, + "grad_norm": 0.44422174385585755, + "learning_rate": 1.8269113781553084e-06, + "loss": 0.5079, + "step": 13300 + }, + { + "epoch": 3.530996946767556, + "grad_norm": 0.4478682705457898, + "learning_rate": 1.826575148564945e-06, + "loss": 0.5537, + "step": 13301 + }, + { + "epoch": 3.53126244524094, + "grad_norm": 0.4483312850326237, + "learning_rate": 1.8262389321080149e-06, + "loss": 0.5324, + "step": 13302 + }, + { + "epoch": 3.5315279437143237, + "grad_norm": 0.4769387268999272, + "learning_rate": 1.8259027287910753e-06, + "loss": 0.5995, + "step": 13303 + }, + { + "epoch": 3.5317934421877073, + "grad_norm": 0.4615593927356842, + "learning_rate": 1.8255665386206828e-06, + "loss": 0.5621, + "step": 13304 + }, + { + "epoch": 3.5320589406610914, + "grad_norm": 0.4528036825045676, + "learning_rate": 1.8252303616033956e-06, + "loss": 0.5389, + "step": 13305 + }, + { + "epoch": 3.532324439134475, + "grad_norm": 0.44333208725803025, + "learning_rate": 1.8248941977457669e-06, + "loss": 0.5225, + "step": 13306 + }, + { + "epoch": 3.5325899376078587, + "grad_norm": 0.4667592701124061, + "learning_rate": 1.8245580470543546e-06, + "loss": 0.5749, + "step": 13307 + }, + { + "epoch": 3.5328554360812428, + "grad_norm": 0.46159898879308614, + "learning_rate": 1.8242219095357138e-06, + "loss": 0.5513, + "step": 13308 + }, + { + "epoch": 3.5331209345546264, + "grad_norm": 0.45633665558466274, + "learning_rate": 1.823885785196401e-06, + "loss": 0.5144, + "step": 13309 + }, + { + "epoch": 3.53338643302801, + "grad_norm": 0.46867442588239344, + "learning_rate": 1.8235496740429704e-06, + "loss": 0.541, + "step": 13310 + }, + { + "epoch": 3.533651931501394, + "grad_norm": 0.4573686548094638, + "learning_rate": 1.823213576081978e-06, + "loss": 0.5883, + "step": 13311 + }, + { + "epoch": 3.5339174299747778, + "grad_norm": 0.47876951728198947, + "learning_rate": 1.822877491319977e-06, + "loss": 0.546, + "step": 13312 + }, + { + "epoch": 3.5341829284481614, + "grad_norm": 0.46581391849534404, + "learning_rate": 1.8225414197635233e-06, + "loss": 0.5482, + "step": 13313 + }, + { + "epoch": 3.534448426921545, + "grad_norm": 0.45135367091489537, + "learning_rate": 1.8222053614191703e-06, + "loss": 0.5238, + "step": 13314 + }, + { + "epoch": 3.534713925394929, + "grad_norm": 0.47063411666491073, + "learning_rate": 1.8218693162934725e-06, + "loss": 0.6163, + "step": 13315 + }, + { + "epoch": 3.5349794238683128, + "grad_norm": 0.45800671457918946, + "learning_rate": 1.8215332843929844e-06, + "loss": 0.5251, + "step": 13316 + }, + { + "epoch": 3.5352449223416964, + "grad_norm": 0.46450362297552295, + "learning_rate": 1.8211972657242574e-06, + "loss": 0.5337, + "step": 13317 + }, + { + "epoch": 3.53551042081508, + "grad_norm": 0.46501257474805646, + "learning_rate": 1.8208612602938458e-06, + "loss": 0.5403, + "step": 13318 + }, + { + "epoch": 3.535775919288464, + "grad_norm": 0.4728168066816079, + "learning_rate": 1.8205252681083022e-06, + "loss": 0.5622, + "step": 13319 + }, + { + "epoch": 3.5360414177618478, + "grad_norm": 0.46987943080612654, + "learning_rate": 1.8201892891741796e-06, + "loss": 0.5315, + "step": 13320 + }, + { + "epoch": 3.5363069162352314, + "grad_norm": 0.46578639058516846, + "learning_rate": 1.8198533234980299e-06, + "loss": 0.5766, + "step": 13321 + }, + { + "epoch": 3.5365724147086155, + "grad_norm": 0.4612838261516672, + "learning_rate": 1.8195173710864072e-06, + "loss": 0.5404, + "step": 13322 + }, + { + "epoch": 3.536837913181999, + "grad_norm": 0.464509874657138, + "learning_rate": 1.8191814319458607e-06, + "loss": 0.5232, + "step": 13323 + }, + { + "epoch": 3.537103411655383, + "grad_norm": 0.46551828232540565, + "learning_rate": 1.8188455060829433e-06, + "loss": 0.5487, + "step": 13324 + }, + { + "epoch": 3.537368910128767, + "grad_norm": 0.4704121344797628, + "learning_rate": 1.8185095935042063e-06, + "loss": 0.5288, + "step": 13325 + }, + { + "epoch": 3.5376344086021505, + "grad_norm": 0.458157532290727, + "learning_rate": 1.818173694216201e-06, + "loss": 0.5308, + "step": 13326 + }, + { + "epoch": 3.537899907075534, + "grad_norm": 0.463476029786354, + "learning_rate": 1.8178378082254778e-06, + "loss": 0.5279, + "step": 13327 + }, + { + "epoch": 3.5381654055489182, + "grad_norm": 0.46049158692446607, + "learning_rate": 1.8175019355385881e-06, + "loss": 0.51, + "step": 13328 + }, + { + "epoch": 3.538430904022302, + "grad_norm": 0.46050540381666566, + "learning_rate": 1.8171660761620814e-06, + "loss": 0.5399, + "step": 13329 + }, + { + "epoch": 3.5386964024956855, + "grad_norm": 0.46728068136518575, + "learning_rate": 1.8168302301025076e-06, + "loss": 0.5721, + "step": 13330 + }, + { + "epoch": 3.5389619009690696, + "grad_norm": 0.45899014975898283, + "learning_rate": 1.8164943973664173e-06, + "loss": 0.5815, + "step": 13331 + }, + { + "epoch": 3.5392273994424532, + "grad_norm": 0.46072481237734786, + "learning_rate": 1.8161585779603597e-06, + "loss": 0.5524, + "step": 13332 + }, + { + "epoch": 3.539492897915837, + "grad_norm": 0.4624546498758775, + "learning_rate": 1.8158227718908844e-06, + "loss": 0.5309, + "step": 13333 + }, + { + "epoch": 3.539758396389221, + "grad_norm": 0.466793258928801, + "learning_rate": 1.8154869791645407e-06, + "loss": 0.5571, + "step": 13334 + }, + { + "epoch": 3.5400238948626046, + "grad_norm": 0.47937328580533517, + "learning_rate": 1.8151511997878765e-06, + "loss": 0.5315, + "step": 13335 + }, + { + "epoch": 3.5402893933359882, + "grad_norm": 0.4645636547339201, + "learning_rate": 1.8148154337674404e-06, + "loss": 0.4943, + "step": 13336 + }, + { + "epoch": 3.5405548918093723, + "grad_norm": 0.47437267430919217, + "learning_rate": 1.8144796811097808e-06, + "loss": 0.5421, + "step": 13337 + }, + { + "epoch": 3.540820390282756, + "grad_norm": 0.4572086120215907, + "learning_rate": 1.8141439418214462e-06, + "loss": 0.5388, + "step": 13338 + }, + { + "epoch": 3.5410858887561396, + "grad_norm": 0.46296689253857026, + "learning_rate": 1.8138082159089842e-06, + "loss": 0.5641, + "step": 13339 + }, + { + "epoch": 3.5413513872295237, + "grad_norm": 0.48060611012956767, + "learning_rate": 1.813472503378942e-06, + "loss": 0.5231, + "step": 13340 + }, + { + "epoch": 3.5416168857029073, + "grad_norm": 0.45447153441145993, + "learning_rate": 1.8131368042378667e-06, + "loss": 0.5404, + "step": 13341 + }, + { + "epoch": 3.541882384176291, + "grad_norm": 0.4619167152753581, + "learning_rate": 1.812801118492305e-06, + "loss": 0.5541, + "step": 13342 + }, + { + "epoch": 3.5421478826496746, + "grad_norm": 0.46565886796446704, + "learning_rate": 1.8124654461488044e-06, + "loss": 0.5262, + "step": 13343 + }, + { + "epoch": 3.5424133811230587, + "grad_norm": 0.454052094043848, + "learning_rate": 1.812129787213911e-06, + "loss": 0.5651, + "step": 13344 + }, + { + "epoch": 3.5426788795964423, + "grad_norm": 0.46296804636010425, + "learning_rate": 1.811794141694172e-06, + "loss": 0.5417, + "step": 13345 + }, + { + "epoch": 3.542944378069826, + "grad_norm": 0.4624527419520514, + "learning_rate": 1.8114585095961307e-06, + "loss": 0.5738, + "step": 13346 + }, + { + "epoch": 3.5432098765432096, + "grad_norm": 0.45415210932607414, + "learning_rate": 1.8111228909263349e-06, + "loss": 0.5565, + "step": 13347 + }, + { + "epoch": 3.5434753750165937, + "grad_norm": 0.458967212229915, + "learning_rate": 1.8107872856913293e-06, + "loss": 0.5361, + "step": 13348 + }, + { + "epoch": 3.5437408734899773, + "grad_norm": 0.4754983463032878, + "learning_rate": 1.8104516938976591e-06, + "loss": 0.5695, + "step": 13349 + }, + { + "epoch": 3.544006371963361, + "grad_norm": 0.44949523539603625, + "learning_rate": 1.8101161155518688e-06, + "loss": 0.5524, + "step": 13350 + }, + { + "epoch": 3.544271870436745, + "grad_norm": 0.4619064471165001, + "learning_rate": 1.8097805506605048e-06, + "loss": 0.5784, + "step": 13351 + }, + { + "epoch": 3.5445373689101287, + "grad_norm": 0.465932710885596, + "learning_rate": 1.8094449992301088e-06, + "loss": 0.5349, + "step": 13352 + }, + { + "epoch": 3.5448028673835124, + "grad_norm": 0.4430064312820136, + "learning_rate": 1.8091094612672267e-06, + "loss": 0.5495, + "step": 13353 + }, + { + "epoch": 3.5450683658568964, + "grad_norm": 0.4574567695152756, + "learning_rate": 1.808773936778401e-06, + "loss": 0.5439, + "step": 13354 + }, + { + "epoch": 3.54533386433028, + "grad_norm": 0.44565611113583403, + "learning_rate": 1.8084384257701762e-06, + "loss": 0.5222, + "step": 13355 + }, + { + "epoch": 3.5455993628036637, + "grad_norm": 0.4747693722507459, + "learning_rate": 1.8081029282490954e-06, + "loss": 0.5768, + "step": 13356 + }, + { + "epoch": 3.545864861277048, + "grad_norm": 0.4605451950163074, + "learning_rate": 1.807767444221702e-06, + "loss": 0.5553, + "step": 13357 + }, + { + "epoch": 3.5461303597504314, + "grad_norm": 0.47668760621679146, + "learning_rate": 1.8074319736945375e-06, + "loss": 0.55, + "step": 13358 + }, + { + "epoch": 3.546395858223815, + "grad_norm": 0.45765602951094014, + "learning_rate": 1.8070965166741456e-06, + "loss": 0.5563, + "step": 13359 + }, + { + "epoch": 3.546661356697199, + "grad_norm": 0.4426848554524709, + "learning_rate": 1.806761073167068e-06, + "loss": 0.5485, + "step": 13360 + }, + { + "epoch": 3.546926855170583, + "grad_norm": 0.4614411963566644, + "learning_rate": 1.8064256431798468e-06, + "loss": 0.4962, + "step": 13361 + }, + { + "epoch": 3.5471923536439665, + "grad_norm": 0.46302872444446624, + "learning_rate": 1.806090226719025e-06, + "loss": 0.573, + "step": 13362 + }, + { + "epoch": 3.5474578521173505, + "grad_norm": 0.4654944901248699, + "learning_rate": 1.8057548237911415e-06, + "loss": 0.5523, + "step": 13363 + }, + { + "epoch": 3.547723350590734, + "grad_norm": 0.46221678040871367, + "learning_rate": 1.8054194344027386e-06, + "loss": 0.5587, + "step": 13364 + }, + { + "epoch": 3.547988849064118, + "grad_norm": 0.48212872324430195, + "learning_rate": 1.8050840585603575e-06, + "loss": 0.5835, + "step": 13365 + }, + { + "epoch": 3.548254347537502, + "grad_norm": 0.4677183548544849, + "learning_rate": 1.8047486962705382e-06, + "loss": 0.5494, + "step": 13366 + }, + { + "epoch": 3.5485198460108855, + "grad_norm": 0.4734338976072318, + "learning_rate": 1.8044133475398224e-06, + "loss": 0.5296, + "step": 13367 + }, + { + "epoch": 3.548785344484269, + "grad_norm": 0.47970326485162174, + "learning_rate": 1.8040780123747497e-06, + "loss": 0.5268, + "step": 13368 + }, + { + "epoch": 3.549050842957653, + "grad_norm": 0.45692631691054686, + "learning_rate": 1.8037426907818595e-06, + "loss": 0.5367, + "step": 13369 + }, + { + "epoch": 3.549316341431037, + "grad_norm": 0.45819299834256083, + "learning_rate": 1.803407382767691e-06, + "loss": 0.5437, + "step": 13370 + }, + { + "epoch": 3.5495818399044206, + "grad_norm": 0.4606304405507358, + "learning_rate": 1.8030720883387843e-06, + "loss": 0.5561, + "step": 13371 + }, + { + "epoch": 3.549847338377804, + "grad_norm": 0.4512011432969114, + "learning_rate": 1.8027368075016783e-06, + "loss": 0.5257, + "step": 13372 + }, + { + "epoch": 3.550112836851188, + "grad_norm": 0.4446618314979587, + "learning_rate": 1.8024015402629114e-06, + "loss": 0.5242, + "step": 13373 + }, + { + "epoch": 3.550378335324572, + "grad_norm": 0.4584723772905301, + "learning_rate": 1.8020662866290234e-06, + "loss": 0.5497, + "step": 13374 + }, + { + "epoch": 3.5506438337979556, + "grad_norm": 0.45836627776091277, + "learning_rate": 1.8017310466065508e-06, + "loss": 0.5582, + "step": 13375 + }, + { + "epoch": 3.550909332271339, + "grad_norm": 0.4586827143790453, + "learning_rate": 1.8013958202020327e-06, + "loss": 0.5259, + "step": 13376 + }, + { + "epoch": 3.5511748307447233, + "grad_norm": 0.4822886002524828, + "learning_rate": 1.8010606074220066e-06, + "loss": 0.5386, + "step": 13377 + }, + { + "epoch": 3.551440329218107, + "grad_norm": 0.4491399056760416, + "learning_rate": 1.8007254082730098e-06, + "loss": 0.568, + "step": 13378 + }, + { + "epoch": 3.5517058276914906, + "grad_norm": 0.4590620565459339, + "learning_rate": 1.8003902227615797e-06, + "loss": 0.4901, + "step": 13379 + }, + { + "epoch": 3.5519713261648747, + "grad_norm": 0.4658584849920062, + "learning_rate": 1.8000550508942543e-06, + "loss": 0.5742, + "step": 13380 + }, + { + "epoch": 3.5522368246382583, + "grad_norm": 0.4791880490889075, + "learning_rate": 1.799719892677568e-06, + "loss": 0.5765, + "step": 13381 + }, + { + "epoch": 3.552502323111642, + "grad_norm": 0.45790078740687573, + "learning_rate": 1.7993847481180584e-06, + "loss": 0.5211, + "step": 13382 + }, + { + "epoch": 3.552767821585026, + "grad_norm": 0.4603405802422378, + "learning_rate": 1.7990496172222618e-06, + "loss": 0.5659, + "step": 13383 + }, + { + "epoch": 3.5530333200584097, + "grad_norm": 0.47657050938825063, + "learning_rate": 1.7987144999967138e-06, + "loss": 0.588, + "step": 13384 + }, + { + "epoch": 3.5532988185317933, + "grad_norm": 0.4820428828671815, + "learning_rate": 1.7983793964479506e-06, + "loss": 0.5881, + "step": 13385 + }, + { + "epoch": 3.5535643170051774, + "grad_norm": 0.47247994579564584, + "learning_rate": 1.7980443065825064e-06, + "loss": 0.5497, + "step": 13386 + }, + { + "epoch": 3.553829815478561, + "grad_norm": 0.4628977112928438, + "learning_rate": 1.7977092304069172e-06, + "loss": 0.5633, + "step": 13387 + }, + { + "epoch": 3.5540953139519447, + "grad_norm": 0.4603827851613823, + "learning_rate": 1.797374167927717e-06, + "loss": 0.5617, + "step": 13388 + }, + { + "epoch": 3.5543608124253288, + "grad_norm": 0.4685640936464193, + "learning_rate": 1.7970391191514413e-06, + "loss": 0.5288, + "step": 13389 + }, + { + "epoch": 3.5546263108987124, + "grad_norm": 0.49156489185667773, + "learning_rate": 1.7967040840846238e-06, + "loss": 0.5546, + "step": 13390 + }, + { + "epoch": 3.554891809372096, + "grad_norm": 0.48724063683434726, + "learning_rate": 1.7963690627337991e-06, + "loss": 0.5267, + "step": 13391 + }, + { + "epoch": 3.55515730784548, + "grad_norm": 0.4621055018985786, + "learning_rate": 1.7960340551055e-06, + "loss": 0.5234, + "step": 13392 + }, + { + "epoch": 3.5554228063188638, + "grad_norm": 0.4572332916654144, + "learning_rate": 1.7956990612062603e-06, + "loss": 0.5884, + "step": 13393 + }, + { + "epoch": 3.5556883047922474, + "grad_norm": 0.45258758492541956, + "learning_rate": 1.7953640810426134e-06, + "loss": 0.5837, + "step": 13394 + }, + { + "epoch": 3.5559538032656315, + "grad_norm": 0.47264031210983315, + "learning_rate": 1.7950291146210921e-06, + "loss": 0.5471, + "step": 13395 + }, + { + "epoch": 3.556219301739015, + "grad_norm": 0.4571966160127003, + "learning_rate": 1.7946941619482295e-06, + "loss": 0.5404, + "step": 13396 + }, + { + "epoch": 3.5564848002123988, + "grad_norm": 0.4525961944234777, + "learning_rate": 1.7943592230305583e-06, + "loss": 0.5436, + "step": 13397 + }, + { + "epoch": 3.5567502986857824, + "grad_norm": 0.47068183889238013, + "learning_rate": 1.794024297874609e-06, + "loss": 0.5781, + "step": 13398 + }, + { + "epoch": 3.5570157971591665, + "grad_norm": 0.4685688698136096, + "learning_rate": 1.7936893864869143e-06, + "loss": 0.5266, + "step": 13399 + }, + { + "epoch": 3.55728129563255, + "grad_norm": 0.46791732086375687, + "learning_rate": 1.7933544888740062e-06, + "loss": 0.5597, + "step": 13400 + }, + { + "epoch": 3.5575467941059338, + "grad_norm": 0.46104317132614264, + "learning_rate": 1.7930196050424153e-06, + "loss": 0.5126, + "step": 13401 + }, + { + "epoch": 3.5578122925793174, + "grad_norm": 0.4510859060957634, + "learning_rate": 1.7926847349986738e-06, + "loss": 0.5766, + "step": 13402 + }, + { + "epoch": 3.5580777910527015, + "grad_norm": 0.4663932058041748, + "learning_rate": 1.7923498787493115e-06, + "loss": 0.6024, + "step": 13403 + }, + { + "epoch": 3.558343289526085, + "grad_norm": 0.46559606932309877, + "learning_rate": 1.7920150363008592e-06, + "loss": 0.5404, + "step": 13404 + }, + { + "epoch": 3.558608787999469, + "grad_norm": 0.46039578516232343, + "learning_rate": 1.791680207659847e-06, + "loss": 0.552, + "step": 13405 + }, + { + "epoch": 3.558874286472853, + "grad_norm": 0.4595817941004434, + "learning_rate": 1.7913453928328047e-06, + "loss": 0.5589, + "step": 13406 + }, + { + "epoch": 3.5591397849462365, + "grad_norm": 0.4494892316142684, + "learning_rate": 1.7910105918262627e-06, + "loss": 0.5478, + "step": 13407 + }, + { + "epoch": 3.55940528341962, + "grad_norm": 0.47063165969361465, + "learning_rate": 1.7906758046467499e-06, + "loss": 0.5527, + "step": 13408 + }, + { + "epoch": 3.5596707818930042, + "grad_norm": 0.4580673824266431, + "learning_rate": 1.7903410313007968e-06, + "loss": 0.4956, + "step": 13409 + }, + { + "epoch": 3.559936280366388, + "grad_norm": 0.43336700747446555, + "learning_rate": 1.79000627179493e-06, + "loss": 0.558, + "step": 13410 + }, + { + "epoch": 3.5602017788397715, + "grad_norm": 0.46408108982588064, + "learning_rate": 1.7896715261356795e-06, + "loss": 0.5732, + "step": 13411 + }, + { + "epoch": 3.5604672773131556, + "grad_norm": 0.4632723039607594, + "learning_rate": 1.7893367943295725e-06, + "loss": 0.5676, + "step": 13412 + }, + { + "epoch": 3.5607327757865392, + "grad_norm": 0.4647143154743714, + "learning_rate": 1.789002076383139e-06, + "loss": 0.5426, + "step": 13413 + }, + { + "epoch": 3.560998274259923, + "grad_norm": 0.4691977312057472, + "learning_rate": 1.7886673723029063e-06, + "loss": 0.5492, + "step": 13414 + }, + { + "epoch": 3.561263772733307, + "grad_norm": 0.47854810178648244, + "learning_rate": 1.7883326820954008e-06, + "loss": 0.5756, + "step": 13415 + }, + { + "epoch": 3.5615292712066906, + "grad_norm": 0.4653652352121288, + "learning_rate": 1.7879980057671503e-06, + "loss": 0.546, + "step": 13416 + }, + { + "epoch": 3.5617947696800742, + "grad_norm": 0.46469167744999185, + "learning_rate": 1.7876633433246823e-06, + "loss": 0.5435, + "step": 13417 + }, + { + "epoch": 3.5620602681534583, + "grad_norm": 0.4747169375924761, + "learning_rate": 1.787328694774523e-06, + "loss": 0.5287, + "step": 13418 + }, + { + "epoch": 3.562325766626842, + "grad_norm": 0.4712499618288174, + "learning_rate": 1.7869940601231989e-06, + "loss": 0.5247, + "step": 13419 + }, + { + "epoch": 3.5625912651002256, + "grad_norm": 0.4685053149937222, + "learning_rate": 1.7866594393772375e-06, + "loss": 0.5649, + "step": 13420 + }, + { + "epoch": 3.5628567635736097, + "grad_norm": 0.46585725039815645, + "learning_rate": 1.7863248325431622e-06, + "loss": 0.5578, + "step": 13421 + }, + { + "epoch": 3.5631222620469933, + "grad_norm": 0.47506420689991613, + "learning_rate": 1.7859902396275006e-06, + "loss": 0.5072, + "step": 13422 + }, + { + "epoch": 3.563387760520377, + "grad_norm": 0.45989845537698193, + "learning_rate": 1.7856556606367776e-06, + "loss": 0.5701, + "step": 13423 + }, + { + "epoch": 3.5636532589937606, + "grad_norm": 0.4545099109375478, + "learning_rate": 1.785321095577518e-06, + "loss": 0.5377, + "step": 13424 + }, + { + "epoch": 3.5639187574671447, + "grad_norm": 0.4671811360140433, + "learning_rate": 1.7849865444562466e-06, + "loss": 0.5417, + "step": 13425 + }, + { + "epoch": 3.5641842559405283, + "grad_norm": 0.4551352377572186, + "learning_rate": 1.7846520072794899e-06, + "loss": 0.5207, + "step": 13426 + }, + { + "epoch": 3.564449754413912, + "grad_norm": 0.4392007843676805, + "learning_rate": 1.7843174840537691e-06, + "loss": 0.5423, + "step": 13427 + }, + { + "epoch": 3.5647152528872956, + "grad_norm": 0.4632468758790232, + "learning_rate": 1.7839829747856096e-06, + "loss": 0.5118, + "step": 13428 + }, + { + "epoch": 3.5649807513606797, + "grad_norm": 0.452307858788729, + "learning_rate": 1.7836484794815354e-06, + "loss": 0.5903, + "step": 13429 + }, + { + "epoch": 3.5652462498340634, + "grad_norm": 0.47923715578627024, + "learning_rate": 1.7833139981480695e-06, + "loss": 0.5838, + "step": 13430 + }, + { + "epoch": 3.565511748307447, + "grad_norm": 0.4543391655277271, + "learning_rate": 1.7829795307917359e-06, + "loss": 0.5345, + "step": 13431 + }, + { + "epoch": 3.565777246780831, + "grad_norm": 0.46217600378670654, + "learning_rate": 1.782645077419057e-06, + "loss": 0.5208, + "step": 13432 + }, + { + "epoch": 3.5660427452542147, + "grad_norm": 0.4553002923613811, + "learning_rate": 1.782310638036555e-06, + "loss": 0.5692, + "step": 13433 + }, + { + "epoch": 3.5663082437275984, + "grad_norm": 0.46997920258942016, + "learning_rate": 1.7819762126507531e-06, + "loss": 0.5726, + "step": 13434 + }, + { + "epoch": 3.5665737422009824, + "grad_norm": 0.45042019117233795, + "learning_rate": 1.7816418012681729e-06, + "loss": 0.5507, + "step": 13435 + }, + { + "epoch": 3.566839240674366, + "grad_norm": 0.4646775304034689, + "learning_rate": 1.7813074038953365e-06, + "loss": 0.5105, + "step": 13436 + }, + { + "epoch": 3.5671047391477497, + "grad_norm": 0.46897989614462854, + "learning_rate": 1.780973020538766e-06, + "loss": 0.5315, + "step": 13437 + }, + { + "epoch": 3.567370237621134, + "grad_norm": 0.45409252558744917, + "learning_rate": 1.7806386512049812e-06, + "loss": 0.531, + "step": 13438 + }, + { + "epoch": 3.5676357360945175, + "grad_norm": 0.4485736420246746, + "learning_rate": 1.7803042959005044e-06, + "loss": 0.5159, + "step": 13439 + }, + { + "epoch": 3.567901234567901, + "grad_norm": 0.44362931415859147, + "learning_rate": 1.7799699546318554e-06, + "loss": 0.5182, + "step": 13440 + }, + { + "epoch": 3.568166733041285, + "grad_norm": 0.4738063188948455, + "learning_rate": 1.779635627405556e-06, + "loss": 0.5762, + "step": 13441 + }, + { + "epoch": 3.568432231514669, + "grad_norm": 0.49126397181579917, + "learning_rate": 1.7793013142281252e-06, + "loss": 0.5744, + "step": 13442 + }, + { + "epoch": 3.5686977299880525, + "grad_norm": 0.48246570501670905, + "learning_rate": 1.7789670151060844e-06, + "loss": 0.5727, + "step": 13443 + }, + { + "epoch": 3.5689632284614365, + "grad_norm": 0.4698829369249473, + "learning_rate": 1.778632730045951e-06, + "loss": 0.5378, + "step": 13444 + }, + { + "epoch": 3.56922872693482, + "grad_norm": 0.4462295579486581, + "learning_rate": 1.7782984590542457e-06, + "loss": 0.5196, + "step": 13445 + }, + { + "epoch": 3.569494225408204, + "grad_norm": 0.4586111211394395, + "learning_rate": 1.7779642021374878e-06, + "loss": 0.5242, + "step": 13446 + }, + { + "epoch": 3.569759723881588, + "grad_norm": 0.46778783325941115, + "learning_rate": 1.7776299593021954e-06, + "loss": 0.5402, + "step": 13447 + }, + { + "epoch": 3.5700252223549716, + "grad_norm": 0.46404866019170077, + "learning_rate": 1.7772957305548878e-06, + "loss": 0.5167, + "step": 13448 + }, + { + "epoch": 3.570290720828355, + "grad_norm": 0.4689455599244113, + "learning_rate": 1.776961515902083e-06, + "loss": 0.5509, + "step": 13449 + }, + { + "epoch": 3.5705562193017393, + "grad_norm": 0.46995535060411964, + "learning_rate": 1.7766273153502987e-06, + "loss": 0.5289, + "step": 13450 + }, + { + "epoch": 3.570821717775123, + "grad_norm": 0.46512019045487785, + "learning_rate": 1.7762931289060526e-06, + "loss": 0.5431, + "step": 13451 + }, + { + "epoch": 3.5710872162485066, + "grad_norm": 0.4533239612693231, + "learning_rate": 1.7759589565758628e-06, + "loss": 0.5519, + "step": 13452 + }, + { + "epoch": 3.57135271472189, + "grad_norm": 0.47385085007076516, + "learning_rate": 1.7756247983662462e-06, + "loss": 0.5228, + "step": 13453 + }, + { + "epoch": 3.5716182131952743, + "grad_norm": 0.4715054638091034, + "learning_rate": 1.775290654283719e-06, + "loss": 0.5533, + "step": 13454 + }, + { + "epoch": 3.571883711668658, + "grad_norm": 0.46916100755297774, + "learning_rate": 1.7749565243348e-06, + "loss": 0.545, + "step": 13455 + }, + { + "epoch": 3.5721492101420416, + "grad_norm": 0.49002467578488784, + "learning_rate": 1.7746224085260027e-06, + "loss": 0.5574, + "step": 13456 + }, + { + "epoch": 3.572414708615425, + "grad_norm": 0.4573229806166725, + "learning_rate": 1.7742883068638447e-06, + "loss": 0.5359, + "step": 13457 + }, + { + "epoch": 3.5726802070888093, + "grad_norm": 0.47149043313575806, + "learning_rate": 1.7739542193548409e-06, + "loss": 0.5477, + "step": 13458 + }, + { + "epoch": 3.572945705562193, + "grad_norm": 0.4713092628733232, + "learning_rate": 1.7736201460055081e-06, + "loss": 0.5601, + "step": 13459 + }, + { + "epoch": 3.5732112040355766, + "grad_norm": 0.4632081871651939, + "learning_rate": 1.7732860868223618e-06, + "loss": 0.5214, + "step": 13460 + }, + { + "epoch": 3.5734767025089607, + "grad_norm": 0.46795326791708436, + "learning_rate": 1.772952041811915e-06, + "loss": 0.5181, + "step": 13461 + }, + { + "epoch": 3.5737422009823443, + "grad_norm": 0.4642618365293363, + "learning_rate": 1.772618010980684e-06, + "loss": 0.5059, + "step": 13462 + }, + { + "epoch": 3.574007699455728, + "grad_norm": 0.45538823820263696, + "learning_rate": 1.772283994335182e-06, + "loss": 0.544, + "step": 13463 + }, + { + "epoch": 3.574273197929112, + "grad_norm": 0.4497995411658983, + "learning_rate": 1.7719499918819245e-06, + "loss": 0.5492, + "step": 13464 + }, + { + "epoch": 3.5745386964024957, + "grad_norm": 0.44930793691615756, + "learning_rate": 1.7716160036274244e-06, + "loss": 0.5199, + "step": 13465 + }, + { + "epoch": 3.5748041948758793, + "grad_norm": 0.4656445415602291, + "learning_rate": 1.7712820295781964e-06, + "loss": 0.5162, + "step": 13466 + }, + { + "epoch": 3.5750696933492634, + "grad_norm": 0.45028278674883476, + "learning_rate": 1.7709480697407522e-06, + "loss": 0.5173, + "step": 13467 + }, + { + "epoch": 3.575335191822647, + "grad_norm": 0.46591458074512626, + "learning_rate": 1.7706141241216058e-06, + "loss": 0.5449, + "step": 13468 + }, + { + "epoch": 3.5756006902960307, + "grad_norm": 0.44660273366065795, + "learning_rate": 1.77028019272727e-06, + "loss": 0.533, + "step": 13469 + }, + { + "epoch": 3.5758661887694148, + "grad_norm": 0.4772883622271824, + "learning_rate": 1.769946275564257e-06, + "loss": 0.5429, + "step": 13470 + }, + { + "epoch": 3.5761316872427984, + "grad_norm": 0.46090785423914027, + "learning_rate": 1.7696123726390791e-06, + "loss": 0.5291, + "step": 13471 + }, + { + "epoch": 3.576397185716182, + "grad_norm": 0.45780964091687737, + "learning_rate": 1.769278483958249e-06, + "loss": 0.5564, + "step": 13472 + }, + { + "epoch": 3.576662684189566, + "grad_norm": 0.48494254412986126, + "learning_rate": 1.7689446095282767e-06, + "loss": 0.5765, + "step": 13473 + }, + { + "epoch": 3.5769281826629498, + "grad_norm": 0.46426098328621784, + "learning_rate": 1.7686107493556745e-06, + "loss": 0.5181, + "step": 13474 + }, + { + "epoch": 3.5771936811363334, + "grad_norm": 0.4532630970030804, + "learning_rate": 1.7682769034469535e-06, + "loss": 0.561, + "step": 13475 + }, + { + "epoch": 3.5774591796097175, + "grad_norm": 0.4650211973398225, + "learning_rate": 1.7679430718086244e-06, + "loss": 0.5805, + "step": 13476 + }, + { + "epoch": 3.577724678083101, + "grad_norm": 0.470134843266266, + "learning_rate": 1.767609254447198e-06, + "loss": 0.5227, + "step": 13477 + }, + { + "epoch": 3.5779901765564848, + "grad_norm": 0.45509087622096434, + "learning_rate": 1.7672754513691842e-06, + "loss": 0.5653, + "step": 13478 + }, + { + "epoch": 3.578255675029869, + "grad_norm": 0.4501670867602088, + "learning_rate": 1.766941662581093e-06, + "loss": 0.5434, + "step": 13479 + }, + { + "epoch": 3.5785211735032525, + "grad_norm": 0.47383790659574726, + "learning_rate": 1.7666078880894344e-06, + "loss": 0.5778, + "step": 13480 + }, + { + "epoch": 3.578786671976636, + "grad_norm": 0.47332026242927877, + "learning_rate": 1.7662741279007175e-06, + "loss": 0.5381, + "step": 13481 + }, + { + "epoch": 3.5790521704500198, + "grad_norm": 0.4630851023373439, + "learning_rate": 1.7659403820214516e-06, + "loss": 0.5707, + "step": 13482 + }, + { + "epoch": 3.5793176689234034, + "grad_norm": 0.4569584486873443, + "learning_rate": 1.765606650458146e-06, + "loss": 0.5771, + "step": 13483 + }, + { + "epoch": 3.5795831673967875, + "grad_norm": 0.45945231131832176, + "learning_rate": 1.7652729332173082e-06, + "loss": 0.5387, + "step": 13484 + }, + { + "epoch": 3.579848665870171, + "grad_norm": 0.46590589802264126, + "learning_rate": 1.7649392303054475e-06, + "loss": 0.5704, + "step": 13485 + }, + { + "epoch": 3.580114164343555, + "grad_norm": 0.4705960375163898, + "learning_rate": 1.7646055417290712e-06, + "loss": 0.5646, + "step": 13486 + }, + { + "epoch": 3.580379662816939, + "grad_norm": 0.4708874203168277, + "learning_rate": 1.7642718674946875e-06, + "loss": 0.5523, + "step": 13487 + }, + { + "epoch": 3.5806451612903225, + "grad_norm": 0.45605608095120925, + "learning_rate": 1.7639382076088036e-06, + "loss": 0.4931, + "step": 13488 + }, + { + "epoch": 3.580910659763706, + "grad_norm": 0.4669148709602922, + "learning_rate": 1.763604562077928e-06, + "loss": 0.5468, + "step": 13489 + }, + { + "epoch": 3.5811761582370902, + "grad_norm": 0.46088999239476847, + "learning_rate": 1.7632709309085656e-06, + "loss": 0.5757, + "step": 13490 + }, + { + "epoch": 3.581441656710474, + "grad_norm": 0.4836082446183638, + "learning_rate": 1.7629373141072237e-06, + "loss": 0.5408, + "step": 13491 + }, + { + "epoch": 3.5817071551838575, + "grad_norm": 0.4306765177362808, + "learning_rate": 1.7626037116804087e-06, + "loss": 0.5186, + "step": 13492 + }, + { + "epoch": 3.5819726536572416, + "grad_norm": 0.4629337087409011, + "learning_rate": 1.7622701236346267e-06, + "loss": 0.555, + "step": 13493 + }, + { + "epoch": 3.5822381521306252, + "grad_norm": 0.4728193109940487, + "learning_rate": 1.7619365499763841e-06, + "loss": 0.5235, + "step": 13494 + }, + { + "epoch": 3.582503650604009, + "grad_norm": 0.4584089852387107, + "learning_rate": 1.761602990712186e-06, + "loss": 0.5403, + "step": 13495 + }, + { + "epoch": 3.582769149077393, + "grad_norm": 0.4696528660413869, + "learning_rate": 1.7612694458485368e-06, + "loss": 0.5293, + "step": 13496 + }, + { + "epoch": 3.5830346475507766, + "grad_norm": 0.45349439629781474, + "learning_rate": 1.7609359153919426e-06, + "loss": 0.5565, + "step": 13497 + }, + { + "epoch": 3.5833001460241602, + "grad_norm": 0.46409306959296853, + "learning_rate": 1.7606023993489071e-06, + "loss": 0.5087, + "step": 13498 + }, + { + "epoch": 3.5835656444975443, + "grad_norm": 0.45655254362140885, + "learning_rate": 1.7602688977259354e-06, + "loss": 0.5362, + "step": 13499 + }, + { + "epoch": 3.583831142970928, + "grad_norm": 0.4609754259998067, + "learning_rate": 1.7599354105295313e-06, + "loss": 0.5014, + "step": 13500 + }, + { + "epoch": 3.5840966414443116, + "grad_norm": 0.4585782267669953, + "learning_rate": 1.7596019377662e-06, + "loss": 0.5489, + "step": 13501 + }, + { + "epoch": 3.5843621399176957, + "grad_norm": 0.45110713391992563, + "learning_rate": 1.7592684794424425e-06, + "loss": 0.5418, + "step": 13502 + }, + { + "epoch": 3.5846276383910793, + "grad_norm": 0.47947962867186145, + "learning_rate": 1.7589350355647628e-06, + "loss": 0.5563, + "step": 13503 + }, + { + "epoch": 3.584893136864463, + "grad_norm": 0.4614398257400887, + "learning_rate": 1.7586016061396646e-06, + "loss": 0.5509, + "step": 13504 + }, + { + "epoch": 3.585158635337847, + "grad_norm": 0.4494460046415863, + "learning_rate": 1.7582681911736505e-06, + "loss": 0.5392, + "step": 13505 + }, + { + "epoch": 3.5854241338112307, + "grad_norm": 0.4501420768803699, + "learning_rate": 1.7579347906732238e-06, + "loss": 0.5527, + "step": 13506 + }, + { + "epoch": 3.5856896322846143, + "grad_norm": 0.4569535939121678, + "learning_rate": 1.7576014046448846e-06, + "loss": 0.5068, + "step": 13507 + }, + { + "epoch": 3.585955130757998, + "grad_norm": 0.44355688328757864, + "learning_rate": 1.7572680330951359e-06, + "loss": 0.5525, + "step": 13508 + }, + { + "epoch": 3.586220629231382, + "grad_norm": 0.4720060511626888, + "learning_rate": 1.7569346760304786e-06, + "loss": 0.5423, + "step": 13509 + }, + { + "epoch": 3.5864861277047657, + "grad_norm": 0.4536943308848015, + "learning_rate": 1.7566013334574146e-06, + "loss": 0.5576, + "step": 13510 + }, + { + "epoch": 3.5867516261781494, + "grad_norm": 0.4584524395324461, + "learning_rate": 1.7562680053824448e-06, + "loss": 0.5554, + "step": 13511 + }, + { + "epoch": 3.587017124651533, + "grad_norm": 0.45760156151223275, + "learning_rate": 1.7559346918120708e-06, + "loss": 0.5545, + "step": 13512 + }, + { + "epoch": 3.587282623124917, + "grad_norm": 0.45162586451809417, + "learning_rate": 1.755601392752791e-06, + "loss": 0.5416, + "step": 13513 + }, + { + "epoch": 3.5875481215983007, + "grad_norm": 0.47282761243564664, + "learning_rate": 1.7552681082111067e-06, + "loss": 0.559, + "step": 13514 + }, + { + "epoch": 3.5878136200716844, + "grad_norm": 0.46998835058176913, + "learning_rate": 1.754934838193518e-06, + "loss": 0.5829, + "step": 13515 + }, + { + "epoch": 3.5880791185450684, + "grad_norm": 0.47521146468018266, + "learning_rate": 1.754601582706524e-06, + "loss": 0.5337, + "step": 13516 + }, + { + "epoch": 3.588344617018452, + "grad_norm": 0.45311333421120553, + "learning_rate": 1.7542683417566242e-06, + "loss": 0.5303, + "step": 13517 + }, + { + "epoch": 3.5886101154918357, + "grad_norm": 0.45546536908517565, + "learning_rate": 1.7539351153503187e-06, + "loss": 0.5385, + "step": 13518 + }, + { + "epoch": 3.58887561396522, + "grad_norm": 0.4690156912406919, + "learning_rate": 1.7536019034941042e-06, + "loss": 0.5606, + "step": 13519 + }, + { + "epoch": 3.5891411124386035, + "grad_norm": 0.45947773561372895, + "learning_rate": 1.7532687061944798e-06, + "loss": 0.5275, + "step": 13520 + }, + { + "epoch": 3.589406610911987, + "grad_norm": 0.46085606263172146, + "learning_rate": 1.7529355234579443e-06, + "loss": 0.5482, + "step": 13521 + }, + { + "epoch": 3.589672109385371, + "grad_norm": 0.461940512163368, + "learning_rate": 1.752602355290995e-06, + "loss": 0.5789, + "step": 13522 + }, + { + "epoch": 3.589937607858755, + "grad_norm": 0.4730792736893672, + "learning_rate": 1.7522692017001296e-06, + "loss": 0.5216, + "step": 13523 + }, + { + "epoch": 3.5902031063321385, + "grad_norm": 0.4844521368790733, + "learning_rate": 1.7519360626918464e-06, + "loss": 0.5008, + "step": 13524 + }, + { + "epoch": 3.5904686048055225, + "grad_norm": 0.45045897881455704, + "learning_rate": 1.7516029382726408e-06, + "loss": 0.542, + "step": 13525 + }, + { + "epoch": 3.590734103278906, + "grad_norm": 0.4676984019742277, + "learning_rate": 1.7512698284490101e-06, + "loss": 0.5667, + "step": 13526 + }, + { + "epoch": 3.59099960175229, + "grad_norm": 0.4616785801193104, + "learning_rate": 1.7509367332274512e-06, + "loss": 0.5572, + "step": 13527 + }, + { + "epoch": 3.591265100225674, + "grad_norm": 0.47411424307148375, + "learning_rate": 1.7506036526144599e-06, + "loss": 0.5511, + "step": 13528 + }, + { + "epoch": 3.5915305986990576, + "grad_norm": 0.5009161702096905, + "learning_rate": 1.7502705866165326e-06, + "loss": 0.5535, + "step": 13529 + }, + { + "epoch": 3.591796097172441, + "grad_norm": 0.48698262732125325, + "learning_rate": 1.7499375352401646e-06, + "loss": 0.5026, + "step": 13530 + }, + { + "epoch": 3.5920615956458253, + "grad_norm": 0.4773655651478285, + "learning_rate": 1.7496044984918508e-06, + "loss": 0.5447, + "step": 13531 + }, + { + "epoch": 3.592327094119209, + "grad_norm": 0.44994684746788105, + "learning_rate": 1.7492714763780865e-06, + "loss": 0.5118, + "step": 13532 + }, + { + "epoch": 3.5925925925925926, + "grad_norm": 0.47142237175539886, + "learning_rate": 1.7489384689053663e-06, + "loss": 0.5763, + "step": 13533 + }, + { + "epoch": 3.5928580910659766, + "grad_norm": 0.49761640814276, + "learning_rate": 1.7486054760801853e-06, + "loss": 0.5431, + "step": 13534 + }, + { + "epoch": 3.5931235895393603, + "grad_norm": 0.523945669984737, + "learning_rate": 1.7482724979090382e-06, + "loss": 0.5603, + "step": 13535 + }, + { + "epoch": 3.593389088012744, + "grad_norm": 0.48456622887973827, + "learning_rate": 1.7479395343984164e-06, + "loss": 0.5684, + "step": 13536 + }, + { + "epoch": 3.5936545864861276, + "grad_norm": 0.4783161901373854, + "learning_rate": 1.7476065855548158e-06, + "loss": 0.5755, + "step": 13537 + }, + { + "epoch": 3.593920084959511, + "grad_norm": 0.46748155178915257, + "learning_rate": 1.7472736513847283e-06, + "loss": 0.4945, + "step": 13538 + }, + { + "epoch": 3.5941855834328953, + "grad_norm": 0.44811314369627037, + "learning_rate": 1.746940731894648e-06, + "loss": 0.5287, + "step": 13539 + }, + { + "epoch": 3.594451081906279, + "grad_norm": 0.4682869275336895, + "learning_rate": 1.746607827091067e-06, + "loss": 0.5801, + "step": 13540 + }, + { + "epoch": 3.5947165803796626, + "grad_norm": 0.45917664176170214, + "learning_rate": 1.7462749369804788e-06, + "loss": 0.5722, + "step": 13541 + }, + { + "epoch": 3.5949820788530467, + "grad_norm": 0.4728410025339026, + "learning_rate": 1.745942061569374e-06, + "loss": 0.5576, + "step": 13542 + }, + { + "epoch": 3.5952475773264303, + "grad_norm": 0.47159865805610757, + "learning_rate": 1.745609200864245e-06, + "loss": 0.569, + "step": 13543 + }, + { + "epoch": 3.595513075799814, + "grad_norm": 0.48419035298459817, + "learning_rate": 1.745276354871584e-06, + "loss": 0.5319, + "step": 13544 + }, + { + "epoch": 3.595778574273198, + "grad_norm": 0.46495236107469884, + "learning_rate": 1.744943523597882e-06, + "loss": 0.5301, + "step": 13545 + }, + { + "epoch": 3.5960440727465817, + "grad_norm": 0.46122650207334315, + "learning_rate": 1.74461070704963e-06, + "loss": 0.5544, + "step": 13546 + }, + { + "epoch": 3.5963095712199653, + "grad_norm": 0.4499103909485904, + "learning_rate": 1.7442779052333196e-06, + "loss": 0.5775, + "step": 13547 + }, + { + "epoch": 3.5965750696933494, + "grad_norm": 0.467187799874779, + "learning_rate": 1.7439451181554396e-06, + "loss": 0.5074, + "step": 13548 + }, + { + "epoch": 3.596840568166733, + "grad_norm": 0.47675999681071635, + "learning_rate": 1.7436123458224801e-06, + "loss": 0.559, + "step": 13549 + }, + { + "epoch": 3.5971060666401167, + "grad_norm": 0.4640558966115894, + "learning_rate": 1.7432795882409326e-06, + "loss": 0.5238, + "step": 13550 + }, + { + "epoch": 3.5973715651135008, + "grad_norm": 0.46998406412200033, + "learning_rate": 1.7429468454172858e-06, + "loss": 0.5296, + "step": 13551 + }, + { + "epoch": 3.5976370635868844, + "grad_norm": 0.45108524878875705, + "learning_rate": 1.742614117358029e-06, + "loss": 0.5474, + "step": 13552 + }, + { + "epoch": 3.597902562060268, + "grad_norm": 0.4698085632406593, + "learning_rate": 1.7422814040696523e-06, + "loss": 0.5632, + "step": 13553 + }, + { + "epoch": 3.598168060533652, + "grad_norm": 0.47612071347764656, + "learning_rate": 1.7419487055586425e-06, + "loss": 0.5588, + "step": 13554 + }, + { + "epoch": 3.5984335590070358, + "grad_norm": 0.47702669160319555, + "learning_rate": 1.741616021831489e-06, + "loss": 0.5638, + "step": 13555 + }, + { + "epoch": 3.5986990574804194, + "grad_norm": 0.4699379446796062, + "learning_rate": 1.74128335289468e-06, + "loss": 0.562, + "step": 13556 + }, + { + "epoch": 3.5989645559538035, + "grad_norm": 0.4602961472409376, + "learning_rate": 1.7409506987547032e-06, + "loss": 0.5674, + "step": 13557 + }, + { + "epoch": 3.599230054427187, + "grad_norm": 0.46559632964917874, + "learning_rate": 1.7406180594180466e-06, + "loss": 0.543, + "step": 13558 + }, + { + "epoch": 3.5994955529005708, + "grad_norm": 0.46674183003026837, + "learning_rate": 1.7402854348911968e-06, + "loss": 0.5176, + "step": 13559 + }, + { + "epoch": 3.599761051373955, + "grad_norm": 0.4522972304929055, + "learning_rate": 1.7399528251806411e-06, + "loss": 0.5379, + "step": 13560 + }, + { + "epoch": 3.6000265498473385, + "grad_norm": 0.47912756529151496, + "learning_rate": 1.739620230292866e-06, + "loss": 0.5332, + "step": 13561 + }, + { + "epoch": 3.600292048320722, + "grad_norm": 0.4716449912045407, + "learning_rate": 1.7392876502343587e-06, + "loss": 0.5616, + "step": 13562 + }, + { + "epoch": 3.600557546794106, + "grad_norm": 0.4594146892071589, + "learning_rate": 1.7389550850116044e-06, + "loss": 0.5463, + "step": 13563 + }, + { + "epoch": 3.60082304526749, + "grad_norm": 0.470518691819596, + "learning_rate": 1.7386225346310903e-06, + "loss": 0.5214, + "step": 13564 + }, + { + "epoch": 3.6010885437408735, + "grad_norm": 0.4709061899337295, + "learning_rate": 1.7382899990992996e-06, + "loss": 0.5699, + "step": 13565 + }, + { + "epoch": 3.601354042214257, + "grad_norm": 0.46598305534954343, + "learning_rate": 1.7379574784227192e-06, + "loss": 0.5676, + "step": 13566 + }, + { + "epoch": 3.601619540687641, + "grad_norm": 0.46249379887871217, + "learning_rate": 1.7376249726078343e-06, + "loss": 0.547, + "step": 13567 + }, + { + "epoch": 3.601885039161025, + "grad_norm": 0.48189805557502413, + "learning_rate": 1.7372924816611283e-06, + "loss": 0.5471, + "step": 13568 + }, + { + "epoch": 3.6021505376344085, + "grad_norm": 0.4681194709696662, + "learning_rate": 1.7369600055890861e-06, + "loss": 0.5513, + "step": 13569 + }, + { + "epoch": 3.602416036107792, + "grad_norm": 0.4683496453924496, + "learning_rate": 1.7366275443981936e-06, + "loss": 0.5272, + "step": 13570 + }, + { + "epoch": 3.6026815345811762, + "grad_norm": 0.4682665521946284, + "learning_rate": 1.7362950980949323e-06, + "loss": 0.5342, + "step": 13571 + }, + { + "epoch": 3.60294703305456, + "grad_norm": 0.45808760624460554, + "learning_rate": 1.735962666685786e-06, + "loss": 0.5522, + "step": 13572 + }, + { + "epoch": 3.6032125315279435, + "grad_norm": 0.4647097304585912, + "learning_rate": 1.7356302501772385e-06, + "loss": 0.5391, + "step": 13573 + }, + { + "epoch": 3.6034780300013276, + "grad_norm": 0.47034793355342025, + "learning_rate": 1.7352978485757727e-06, + "loss": 0.5546, + "step": 13574 + }, + { + "epoch": 3.6037435284747112, + "grad_norm": 0.4448627507602738, + "learning_rate": 1.7349654618878708e-06, + "loss": 0.5413, + "step": 13575 + }, + { + "epoch": 3.604009026948095, + "grad_norm": 0.46162949965669164, + "learning_rate": 1.7346330901200164e-06, + "loss": 0.5372, + "step": 13576 + }, + { + "epoch": 3.604274525421479, + "grad_norm": 0.4518532704969727, + "learning_rate": 1.7343007332786904e-06, + "loss": 0.5031, + "step": 13577 + }, + { + "epoch": 3.6045400238948626, + "grad_norm": 0.4700331515662211, + "learning_rate": 1.7339683913703745e-06, + "loss": 0.5317, + "step": 13578 + }, + { + "epoch": 3.6048055223682463, + "grad_norm": 0.4678001322655851, + "learning_rate": 1.7336360644015508e-06, + "loss": 0.5826, + "step": 13579 + }, + { + "epoch": 3.6050710208416303, + "grad_norm": 0.4575028624396153, + "learning_rate": 1.7333037523787e-06, + "loss": 0.5311, + "step": 13580 + }, + { + "epoch": 3.605336519315014, + "grad_norm": 0.46315715560371845, + "learning_rate": 1.7329714553083044e-06, + "loss": 0.5426, + "step": 13581 + }, + { + "epoch": 3.6056020177883976, + "grad_norm": 0.47113471415086106, + "learning_rate": 1.7326391731968423e-06, + "loss": 0.5136, + "step": 13582 + }, + { + "epoch": 3.6058675162617817, + "grad_norm": 0.4567439978450365, + "learning_rate": 1.7323069060507953e-06, + "loss": 0.5151, + "step": 13583 + }, + { + "epoch": 3.6061330147351653, + "grad_norm": 0.4491878621633084, + "learning_rate": 1.731974653876643e-06, + "loss": 0.5308, + "step": 13584 + }, + { + "epoch": 3.606398513208549, + "grad_norm": 0.459542964572663, + "learning_rate": 1.7316424166808654e-06, + "loss": 0.5644, + "step": 13585 + }, + { + "epoch": 3.606664011681933, + "grad_norm": 0.48029558159350694, + "learning_rate": 1.7313101944699421e-06, + "loss": 0.5318, + "step": 13586 + }, + { + "epoch": 3.6069295101553167, + "grad_norm": 0.458615514035395, + "learning_rate": 1.7309779872503523e-06, + "loss": 0.5713, + "step": 13587 + }, + { + "epoch": 3.6071950086287004, + "grad_norm": 0.4664399596060654, + "learning_rate": 1.7306457950285747e-06, + "loss": 0.5036, + "step": 13588 + }, + { + "epoch": 3.6074605071020844, + "grad_norm": 0.4692681958041445, + "learning_rate": 1.7303136178110873e-06, + "loss": 0.5017, + "step": 13589 + }, + { + "epoch": 3.607726005575468, + "grad_norm": 0.46142858668685666, + "learning_rate": 1.7299814556043694e-06, + "loss": 0.5442, + "step": 13590 + }, + { + "epoch": 3.6079915040488517, + "grad_norm": 0.45118892028036384, + "learning_rate": 1.729649308414898e-06, + "loss": 0.5671, + "step": 13591 + }, + { + "epoch": 3.6082570025222354, + "grad_norm": 0.4598542880949478, + "learning_rate": 1.729317176249151e-06, + "loss": 0.525, + "step": 13592 + }, + { + "epoch": 3.608522500995619, + "grad_norm": 0.46819627471808545, + "learning_rate": 1.7289850591136077e-06, + "loss": 0.5709, + "step": 13593 + }, + { + "epoch": 3.608787999469003, + "grad_norm": 0.45636397953153746, + "learning_rate": 1.7286529570147415e-06, + "loss": 0.5717, + "step": 13594 + }, + { + "epoch": 3.6090534979423867, + "grad_norm": 0.47415080865320663, + "learning_rate": 1.7283208699590321e-06, + "loss": 0.5535, + "step": 13595 + }, + { + "epoch": 3.6093189964157704, + "grad_norm": 0.44461879784687897, + "learning_rate": 1.7279887979529548e-06, + "loss": 0.5514, + "step": 13596 + }, + { + "epoch": 3.6095844948891544, + "grad_norm": 0.46604672820334747, + "learning_rate": 1.7276567410029863e-06, + "loss": 0.5693, + "step": 13597 + }, + { + "epoch": 3.609849993362538, + "grad_norm": 0.46406717274669723, + "learning_rate": 1.7273246991156023e-06, + "loss": 0.5396, + "step": 13598 + }, + { + "epoch": 3.6101154918359217, + "grad_norm": 0.4663992982029978, + "learning_rate": 1.7269926722972796e-06, + "loss": 0.5132, + "step": 13599 + }, + { + "epoch": 3.610380990309306, + "grad_norm": 0.4774319859445451, + "learning_rate": 1.7266606605544914e-06, + "loss": 0.5384, + "step": 13600 + }, + { + "epoch": 3.6106464887826895, + "grad_norm": 0.46548168841747417, + "learning_rate": 1.7263286638937138e-06, + "loss": 0.5458, + "step": 13601 + }, + { + "epoch": 3.610911987256073, + "grad_norm": 0.4484765598962136, + "learning_rate": 1.7259966823214214e-06, + "loss": 0.5101, + "step": 13602 + }, + { + "epoch": 3.611177485729457, + "grad_norm": 0.47103830221202686, + "learning_rate": 1.7256647158440887e-06, + "loss": 0.5355, + "step": 13603 + }, + { + "epoch": 3.611442984202841, + "grad_norm": 0.458278323284456, + "learning_rate": 1.7253327644681908e-06, + "loss": 0.5425, + "step": 13604 + }, + { + "epoch": 3.6117084826762245, + "grad_norm": 0.4612397541487162, + "learning_rate": 1.7250008282002e-06, + "loss": 0.5212, + "step": 13605 + }, + { + "epoch": 3.6119739811496085, + "grad_norm": 0.4688759358010234, + "learning_rate": 1.7246689070465906e-06, + "loss": 0.5613, + "step": 13606 + }, + { + "epoch": 3.612239479622992, + "grad_norm": 0.4574925144570196, + "learning_rate": 1.724337001013836e-06, + "loss": 0.5323, + "step": 13607 + }, + { + "epoch": 3.612504978096376, + "grad_norm": 0.45692543219069714, + "learning_rate": 1.7240051101084088e-06, + "loss": 0.5521, + "step": 13608 + }, + { + "epoch": 3.61277047656976, + "grad_norm": 0.4570651800360103, + "learning_rate": 1.7236732343367818e-06, + "loss": 0.5974, + "step": 13609 + }, + { + "epoch": 3.6130359750431436, + "grad_norm": 0.4714511045563157, + "learning_rate": 1.7233413737054289e-06, + "loss": 0.5411, + "step": 13610 + }, + { + "epoch": 3.613301473516527, + "grad_norm": 0.46681311007707244, + "learning_rate": 1.7230095282208197e-06, + "loss": 0.5138, + "step": 13611 + }, + { + "epoch": 3.6135669719899113, + "grad_norm": 0.4374633771891128, + "learning_rate": 1.7226776978894269e-06, + "loss": 0.5124, + "step": 13612 + }, + { + "epoch": 3.613832470463295, + "grad_norm": 0.46014361574268586, + "learning_rate": 1.7223458827177226e-06, + "loss": 0.5388, + "step": 13613 + }, + { + "epoch": 3.6140979689366786, + "grad_norm": 0.4526472867290781, + "learning_rate": 1.722014082712177e-06, + "loss": 0.5508, + "step": 13614 + }, + { + "epoch": 3.6143634674100626, + "grad_norm": 0.45206992035680404, + "learning_rate": 1.7216822978792613e-06, + "loss": 0.5165, + "step": 13615 + }, + { + "epoch": 3.6146289658834463, + "grad_norm": 0.4801129442343729, + "learning_rate": 1.7213505282254482e-06, + "loss": 0.5697, + "step": 13616 + }, + { + "epoch": 3.61489446435683, + "grad_norm": 0.48423488502181944, + "learning_rate": 1.7210187737572055e-06, + "loss": 0.549, + "step": 13617 + }, + { + "epoch": 3.6151599628302136, + "grad_norm": 0.4598635727984862, + "learning_rate": 1.7206870344810034e-06, + "loss": 0.5412, + "step": 13618 + }, + { + "epoch": 3.6154254613035977, + "grad_norm": 0.45683239145922466, + "learning_rate": 1.7203553104033124e-06, + "loss": 0.5426, + "step": 13619 + }, + { + "epoch": 3.6156909597769813, + "grad_norm": 0.4714143392031203, + "learning_rate": 1.7200236015306018e-06, + "loss": 0.564, + "step": 13620 + }, + { + "epoch": 3.615956458250365, + "grad_norm": 0.45942074409938555, + "learning_rate": 1.7196919078693408e-06, + "loss": 0.548, + "step": 13621 + }, + { + "epoch": 3.6162219567237486, + "grad_norm": 0.4672479852630923, + "learning_rate": 1.7193602294259983e-06, + "loss": 0.5596, + "step": 13622 + }, + { + "epoch": 3.6164874551971327, + "grad_norm": 0.4596295563065969, + "learning_rate": 1.7190285662070424e-06, + "loss": 0.5783, + "step": 13623 + }, + { + "epoch": 3.6167529536705163, + "grad_norm": 0.47054608348118204, + "learning_rate": 1.7186969182189417e-06, + "loss": 0.505, + "step": 13624 + }, + { + "epoch": 3.6170184521439, + "grad_norm": 0.4824321631269175, + "learning_rate": 1.7183652854681637e-06, + "loss": 0.568, + "step": 13625 + }, + { + "epoch": 3.617283950617284, + "grad_norm": 0.4606630924047428, + "learning_rate": 1.718033667961177e-06, + "loss": 0.5517, + "step": 13626 + }, + { + "epoch": 3.6175494490906677, + "grad_norm": 0.4719866749369406, + "learning_rate": 1.7177020657044488e-06, + "loss": 0.5631, + "step": 13627 + }, + { + "epoch": 3.6178149475640513, + "grad_norm": 0.46929153589945016, + "learning_rate": 1.7173704787044448e-06, + "loss": 0.5485, + "step": 13628 + }, + { + "epoch": 3.6180804460374354, + "grad_norm": 0.4541068694132094, + "learning_rate": 1.717038906967633e-06, + "loss": 0.5394, + "step": 13629 + }, + { + "epoch": 3.618345944510819, + "grad_norm": 0.45401441817806126, + "learning_rate": 1.7167073505004792e-06, + "loss": 0.5579, + "step": 13630 + }, + { + "epoch": 3.6186114429842027, + "grad_norm": 0.47340886187189046, + "learning_rate": 1.7163758093094502e-06, + "loss": 0.5495, + "step": 13631 + }, + { + "epoch": 3.6188769414575868, + "grad_norm": 0.4757218638647428, + "learning_rate": 1.716044283401011e-06, + "loss": 0.5393, + "step": 13632 + }, + { + "epoch": 3.6191424399309704, + "grad_norm": 0.4562527725970025, + "learning_rate": 1.7157127727816286e-06, + "loss": 0.5283, + "step": 13633 + }, + { + "epoch": 3.619407938404354, + "grad_norm": 0.458041298347764, + "learning_rate": 1.7153812774577672e-06, + "loss": 0.5403, + "step": 13634 + }, + { + "epoch": 3.619673436877738, + "grad_norm": 0.47282952261417815, + "learning_rate": 1.7150497974358914e-06, + "loss": 0.5216, + "step": 13635 + }, + { + "epoch": 3.6199389353511218, + "grad_norm": 0.46716491856811143, + "learning_rate": 1.714718332722467e-06, + "loss": 0.5533, + "step": 13636 + }, + { + "epoch": 3.6202044338245054, + "grad_norm": 0.467141076609319, + "learning_rate": 1.7143868833239575e-06, + "loss": 0.5371, + "step": 13637 + }, + { + "epoch": 3.6204699322978895, + "grad_norm": 0.4654792678552746, + "learning_rate": 1.714055449246827e-06, + "loss": 0.5118, + "step": 13638 + }, + { + "epoch": 3.620735430771273, + "grad_norm": 0.45384211363342897, + "learning_rate": 1.7137240304975411e-06, + "loss": 0.5409, + "step": 13639 + }, + { + "epoch": 3.6210009292446568, + "grad_norm": 0.4745162033476175, + "learning_rate": 1.71339262708256e-06, + "loss": 0.5743, + "step": 13640 + }, + { + "epoch": 3.621266427718041, + "grad_norm": 0.45193187242856186, + "learning_rate": 1.713061239008349e-06, + "loss": 0.5471, + "step": 13641 + }, + { + "epoch": 3.6215319261914245, + "grad_norm": 0.4705267753487941, + "learning_rate": 1.7127298662813706e-06, + "loss": 0.5616, + "step": 13642 + }, + { + "epoch": 3.621797424664808, + "grad_norm": 0.46118296825061666, + "learning_rate": 1.7123985089080875e-06, + "loss": 0.5462, + "step": 13643 + }, + { + "epoch": 3.6220629231381922, + "grad_norm": 0.4468382816640049, + "learning_rate": 1.712067166894962e-06, + "loss": 0.5182, + "step": 13644 + }, + { + "epoch": 3.622328421611576, + "grad_norm": 0.4644414446232178, + "learning_rate": 1.7117358402484568e-06, + "loss": 0.5514, + "step": 13645 + }, + { + "epoch": 3.6225939200849595, + "grad_norm": 0.4728811353813963, + "learning_rate": 1.7114045289750319e-06, + "loss": 0.5995, + "step": 13646 + }, + { + "epoch": 3.622859418558343, + "grad_norm": 0.4547843982903181, + "learning_rate": 1.7110732330811492e-06, + "loss": 0.5478, + "step": 13647 + }, + { + "epoch": 3.6231249170317272, + "grad_norm": 0.4695241271381432, + "learning_rate": 1.7107419525732701e-06, + "loss": 0.5576, + "step": 13648 + }, + { + "epoch": 3.623390415505111, + "grad_norm": 0.458804988875267, + "learning_rate": 1.7104106874578555e-06, + "loss": 0.5976, + "step": 13649 + }, + { + "epoch": 3.6236559139784945, + "grad_norm": 0.46468763330087354, + "learning_rate": 1.7100794377413658e-06, + "loss": 0.5451, + "step": 13650 + }, + { + "epoch": 3.623921412451878, + "grad_norm": 0.4791723989770146, + "learning_rate": 1.7097482034302616e-06, + "loss": 0.5474, + "step": 13651 + }, + { + "epoch": 3.6241869109252622, + "grad_norm": 0.4614585352340324, + "learning_rate": 1.7094169845310017e-06, + "loss": 0.5245, + "step": 13652 + }, + { + "epoch": 3.624452409398646, + "grad_norm": 0.45746898951882653, + "learning_rate": 1.7090857810500462e-06, + "loss": 0.5282, + "step": 13653 + }, + { + "epoch": 3.6247179078720295, + "grad_norm": 0.4741656899525762, + "learning_rate": 1.7087545929938545e-06, + "loss": 0.5738, + "step": 13654 + }, + { + "epoch": 3.6249834063454136, + "grad_norm": 0.46060643980717986, + "learning_rate": 1.7084234203688855e-06, + "loss": 0.5407, + "step": 13655 + }, + { + "epoch": 3.6252489048187972, + "grad_norm": 0.4647203963659346, + "learning_rate": 1.7080922631815994e-06, + "loss": 0.4934, + "step": 13656 + }, + { + "epoch": 3.625514403292181, + "grad_norm": 0.45171062586580485, + "learning_rate": 1.7077611214384514e-06, + "loss": 0.5155, + "step": 13657 + }, + { + "epoch": 3.625779901765565, + "grad_norm": 0.45525417035870663, + "learning_rate": 1.7074299951459017e-06, + "loss": 0.5234, + "step": 13658 + }, + { + "epoch": 3.6260454002389486, + "grad_norm": 0.4639623621962392, + "learning_rate": 1.7070988843104074e-06, + "loss": 0.5318, + "step": 13659 + }, + { + "epoch": 3.6263108987123323, + "grad_norm": 0.46311921372434306, + "learning_rate": 1.7067677889384265e-06, + "loss": 0.5005, + "step": 13660 + }, + { + "epoch": 3.6265763971857163, + "grad_norm": 0.45319119470606894, + "learning_rate": 1.7064367090364152e-06, + "loss": 0.5307, + "step": 13661 + }, + { + "epoch": 3.6268418956591, + "grad_norm": 0.46147456132470915, + "learning_rate": 1.7061056446108325e-06, + "loss": 0.5396, + "step": 13662 + }, + { + "epoch": 3.6271073941324836, + "grad_norm": 0.46380341042187784, + "learning_rate": 1.7057745956681327e-06, + "loss": 0.55, + "step": 13663 + }, + { + "epoch": 3.6273728926058677, + "grad_norm": 0.46266186746131904, + "learning_rate": 1.7054435622147727e-06, + "loss": 0.5642, + "step": 13664 + }, + { + "epoch": 3.6276383910792513, + "grad_norm": 0.45589186004984195, + "learning_rate": 1.7051125442572087e-06, + "loss": 0.5301, + "step": 13665 + }, + { + "epoch": 3.627903889552635, + "grad_norm": 0.453659468490755, + "learning_rate": 1.7047815418018965e-06, + "loss": 0.543, + "step": 13666 + }, + { + "epoch": 3.628169388026019, + "grad_norm": 0.4709650239203018, + "learning_rate": 1.7044505548552908e-06, + "loss": 0.5346, + "step": 13667 + }, + { + "epoch": 3.6284348864994027, + "grad_norm": 0.45100151231476326, + "learning_rate": 1.704119583423848e-06, + "loss": 0.5309, + "step": 13668 + }, + { + "epoch": 3.6287003849727864, + "grad_norm": 0.45782276589120025, + "learning_rate": 1.7037886275140213e-06, + "loss": 0.5082, + "step": 13669 + }, + { + "epoch": 3.6289658834461704, + "grad_norm": 0.4617019052182944, + "learning_rate": 1.7034576871322661e-06, + "loss": 0.5723, + "step": 13670 + }, + { + "epoch": 3.629231381919554, + "grad_norm": 0.4705898177021215, + "learning_rate": 1.703126762285036e-06, + "loss": 0.5408, + "step": 13671 + }, + { + "epoch": 3.6294968803929377, + "grad_norm": 0.46026739951165246, + "learning_rate": 1.7027958529787853e-06, + "loss": 0.5753, + "step": 13672 + }, + { + "epoch": 3.6297623788663214, + "grad_norm": 0.4774369464407052, + "learning_rate": 1.702464959219967e-06, + "loss": 0.5792, + "step": 13673 + }, + { + "epoch": 3.6300278773397054, + "grad_norm": 0.4770219619640301, + "learning_rate": 1.7021340810150361e-06, + "loss": 0.5241, + "step": 13674 + }, + { + "epoch": 3.630293375813089, + "grad_norm": 0.4650085159622131, + "learning_rate": 1.701803218370443e-06, + "loss": 0.512, + "step": 13675 + }, + { + "epoch": 3.6305588742864727, + "grad_norm": 0.4618618539422062, + "learning_rate": 1.7014723712926414e-06, + "loss": 0.5709, + "step": 13676 + }, + { + "epoch": 3.6308243727598564, + "grad_norm": 0.4608173417979016, + "learning_rate": 1.7011415397880838e-06, + "loss": 0.549, + "step": 13677 + }, + { + "epoch": 3.6310898712332405, + "grad_norm": 0.48138934099222763, + "learning_rate": 1.7008107238632222e-06, + "loss": 0.5653, + "step": 13678 + }, + { + "epoch": 3.631355369706624, + "grad_norm": 0.47405279024724833, + "learning_rate": 1.7004799235245084e-06, + "loss": 0.5298, + "step": 13679 + }, + { + "epoch": 3.6316208681800077, + "grad_norm": 0.475434812157762, + "learning_rate": 1.7001491387783936e-06, + "loss": 0.5243, + "step": 13680 + }, + { + "epoch": 3.631886366653392, + "grad_norm": 0.4555221980389128, + "learning_rate": 1.6998183696313286e-06, + "loss": 0.5524, + "step": 13681 + }, + { + "epoch": 3.6321518651267755, + "grad_norm": 0.46018521990203676, + "learning_rate": 1.6994876160897647e-06, + "loss": 0.543, + "step": 13682 + }, + { + "epoch": 3.632417363600159, + "grad_norm": 0.48111701125486367, + "learning_rate": 1.6991568781601522e-06, + "loss": 0.5634, + "step": 13683 + }, + { + "epoch": 3.632682862073543, + "grad_norm": 0.4683753571383182, + "learning_rate": 1.6988261558489416e-06, + "loss": 0.5445, + "step": 13684 + }, + { + "epoch": 3.632948360546927, + "grad_norm": 0.47775647821815037, + "learning_rate": 1.6984954491625833e-06, + "loss": 0.5367, + "step": 13685 + }, + { + "epoch": 3.6332138590203105, + "grad_norm": 0.4790413181753658, + "learning_rate": 1.698164758107525e-06, + "loss": 0.5289, + "step": 13686 + }, + { + "epoch": 3.6334793574936946, + "grad_norm": 0.46344828824385936, + "learning_rate": 1.6978340826902173e-06, + "loss": 0.5064, + "step": 13687 + }, + { + "epoch": 3.633744855967078, + "grad_norm": 0.46296307530320996, + "learning_rate": 1.6975034229171092e-06, + "loss": 0.5263, + "step": 13688 + }, + { + "epoch": 3.634010354440462, + "grad_norm": 0.4703805306281193, + "learning_rate": 1.697172778794649e-06, + "loss": 0.5636, + "step": 13689 + }, + { + "epoch": 3.634275852913846, + "grad_norm": 0.48484719367374407, + "learning_rate": 1.6968421503292856e-06, + "loss": 0.5531, + "step": 13690 + }, + { + "epoch": 3.6345413513872296, + "grad_norm": 0.47379127579867886, + "learning_rate": 1.6965115375274678e-06, + "loss": 0.5528, + "step": 13691 + }, + { + "epoch": 3.634806849860613, + "grad_norm": 0.4544197893375784, + "learning_rate": 1.6961809403956409e-06, + "loss": 0.5349, + "step": 13692 + }, + { + "epoch": 3.6350723483339973, + "grad_norm": 0.4713125982530026, + "learning_rate": 1.6958503589402539e-06, + "loss": 0.5843, + "step": 13693 + }, + { + "epoch": 3.635337846807381, + "grad_norm": 0.4667524768297605, + "learning_rate": 1.6955197931677538e-06, + "loss": 0.5291, + "step": 13694 + }, + { + "epoch": 3.6356033452807646, + "grad_norm": 0.4512196666018313, + "learning_rate": 1.6951892430845874e-06, + "loss": 0.5313, + "step": 13695 + }, + { + "epoch": 3.6358688437541486, + "grad_norm": 0.4693406989880126, + "learning_rate": 1.6948587086972013e-06, + "loss": 0.5597, + "step": 13696 + }, + { + "epoch": 3.6361343422275323, + "grad_norm": 0.46678191467790486, + "learning_rate": 1.6945281900120425e-06, + "loss": 0.549, + "step": 13697 + }, + { + "epoch": 3.636399840700916, + "grad_norm": 0.4645748361508512, + "learning_rate": 1.6941976870355553e-06, + "loss": 0.5307, + "step": 13698 + }, + { + "epoch": 3.6366653391743, + "grad_norm": 0.45548918114258136, + "learning_rate": 1.6938671997741863e-06, + "loss": 0.5291, + "step": 13699 + }, + { + "epoch": 3.6369308376476837, + "grad_norm": 0.4608563291416018, + "learning_rate": 1.6935367282343806e-06, + "loss": 0.5471, + "step": 13700 + }, + { + "epoch": 3.6371963361210673, + "grad_norm": 0.46696330066728287, + "learning_rate": 1.693206272422583e-06, + "loss": 0.5347, + "step": 13701 + }, + { + "epoch": 3.637461834594451, + "grad_norm": 0.46854927397595214, + "learning_rate": 1.69287583234524e-06, + "loss": 0.5366, + "step": 13702 + }, + { + "epoch": 3.637727333067835, + "grad_norm": 0.4610060401679918, + "learning_rate": 1.6925454080087931e-06, + "loss": 0.5241, + "step": 13703 + }, + { + "epoch": 3.6379928315412187, + "grad_norm": 0.4816351182302213, + "learning_rate": 1.6922149994196878e-06, + "loss": 0.5504, + "step": 13704 + }, + { + "epoch": 3.6382583300146023, + "grad_norm": 0.4629181425049328, + "learning_rate": 1.6918846065843677e-06, + "loss": 0.5331, + "step": 13705 + }, + { + "epoch": 3.638523828487986, + "grad_norm": 0.45681701236609173, + "learning_rate": 1.691554229509276e-06, + "loss": 0.5288, + "step": 13706 + }, + { + "epoch": 3.63878932696137, + "grad_norm": 0.4687411694448322, + "learning_rate": 1.6912238682008567e-06, + "loss": 0.5009, + "step": 13707 + }, + { + "epoch": 3.6390548254347537, + "grad_norm": 0.460669153949547, + "learning_rate": 1.6908935226655532e-06, + "loss": 0.5276, + "step": 13708 + }, + { + "epoch": 3.6393203239081373, + "grad_norm": 0.45111500154666356, + "learning_rate": 1.690563192909806e-06, + "loss": 0.5322, + "step": 13709 + }, + { + "epoch": 3.6395858223815214, + "grad_norm": 0.46686859367134764, + "learning_rate": 1.6902328789400585e-06, + "loss": 0.5512, + "step": 13710 + }, + { + "epoch": 3.639851320854905, + "grad_norm": 0.46572273536612585, + "learning_rate": 1.6899025807627525e-06, + "loss": 0.554, + "step": 13711 + }, + { + "epoch": 3.6401168193282887, + "grad_norm": 0.45905155799806036, + "learning_rate": 1.6895722983843295e-06, + "loss": 0.5275, + "step": 13712 + }, + { + "epoch": 3.6403823178016728, + "grad_norm": 0.4593243862512204, + "learning_rate": 1.6892420318112312e-06, + "loss": 0.5599, + "step": 13713 + }, + { + "epoch": 3.6406478162750564, + "grad_norm": 0.4532097012078511, + "learning_rate": 1.6889117810498986e-06, + "loss": 0.5468, + "step": 13714 + }, + { + "epoch": 3.64091331474844, + "grad_norm": 0.46951339482759724, + "learning_rate": 1.6885815461067717e-06, + "loss": 0.5498, + "step": 13715 + }, + { + "epoch": 3.641178813221824, + "grad_norm": 0.46255136715925005, + "learning_rate": 1.6882513269882916e-06, + "loss": 0.502, + "step": 13716 + }, + { + "epoch": 3.6414443116952078, + "grad_norm": 0.45581375133165497, + "learning_rate": 1.687921123700898e-06, + "loss": 0.5559, + "step": 13717 + }, + { + "epoch": 3.6417098101685914, + "grad_norm": 0.4563335031048507, + "learning_rate": 1.6875909362510306e-06, + "loss": 0.546, + "step": 13718 + }, + { + "epoch": 3.6419753086419755, + "grad_norm": 0.4606115766645989, + "learning_rate": 1.6872607646451292e-06, + "loss": 0.5359, + "step": 13719 + }, + { + "epoch": 3.642240807115359, + "grad_norm": 0.4678123251846704, + "learning_rate": 1.6869306088896336e-06, + "loss": 0.5001, + "step": 13720 + }, + { + "epoch": 3.6425063055887428, + "grad_norm": 0.46661629944344796, + "learning_rate": 1.6866004689909812e-06, + "loss": 0.5617, + "step": 13721 + }, + { + "epoch": 3.642771804062127, + "grad_norm": 0.46385917180605735, + "learning_rate": 1.6862703449556112e-06, + "loss": 0.5664, + "step": 13722 + }, + { + "epoch": 3.6430373025355105, + "grad_norm": 0.473229430186421, + "learning_rate": 1.6859402367899616e-06, + "loss": 0.5335, + "step": 13723 + }, + { + "epoch": 3.643302801008894, + "grad_norm": 0.467403493787694, + "learning_rate": 1.6856101445004709e-06, + "loss": 0.562, + "step": 13724 + }, + { + "epoch": 3.6435682994822782, + "grad_norm": 0.5369963572586336, + "learning_rate": 1.6852800680935763e-06, + "loss": 0.5232, + "step": 13725 + }, + { + "epoch": 3.643833797955662, + "grad_norm": 0.4624955681115691, + "learning_rate": 1.684950007575715e-06, + "loss": 0.5451, + "step": 13726 + }, + { + "epoch": 3.6440992964290455, + "grad_norm": 0.4731734794609201, + "learning_rate": 1.684619962953324e-06, + "loss": 0.6037, + "step": 13727 + }, + { + "epoch": 3.644364794902429, + "grad_norm": 0.4777244040321825, + "learning_rate": 1.6842899342328403e-06, + "loss": 0.5751, + "step": 13728 + }, + { + "epoch": 3.6446302933758132, + "grad_norm": 0.47354660154344547, + "learning_rate": 1.6839599214206997e-06, + "loss": 0.5518, + "step": 13729 + }, + { + "epoch": 3.644895791849197, + "grad_norm": 0.4646577938387865, + "learning_rate": 1.6836299245233393e-06, + "loss": 0.5504, + "step": 13730 + }, + { + "epoch": 3.6451612903225805, + "grad_norm": 0.45674061939232097, + "learning_rate": 1.6832999435471942e-06, + "loss": 0.5501, + "step": 13731 + }, + { + "epoch": 3.645426788795964, + "grad_norm": 0.46075901989042545, + "learning_rate": 1.6829699784986995e-06, + "loss": 0.5415, + "step": 13732 + }, + { + "epoch": 3.6456922872693482, + "grad_norm": 0.47252930868368015, + "learning_rate": 1.6826400293842904e-06, + "loss": 0.5573, + "step": 13733 + }, + { + "epoch": 3.645957785742732, + "grad_norm": 0.4642419027528526, + "learning_rate": 1.682310096210402e-06, + "loss": 0.5426, + "step": 13734 + }, + { + "epoch": 3.6462232842161155, + "grad_norm": 0.4422289085002464, + "learning_rate": 1.6819801789834695e-06, + "loss": 0.5207, + "step": 13735 + }, + { + "epoch": 3.6464887826894996, + "grad_norm": 0.45187185258765883, + "learning_rate": 1.6816502777099255e-06, + "loss": 0.5632, + "step": 13736 + }, + { + "epoch": 3.6467542811628832, + "grad_norm": 0.4543059246818115, + "learning_rate": 1.6813203923962062e-06, + "loss": 0.5148, + "step": 13737 + }, + { + "epoch": 3.647019779636267, + "grad_norm": 0.45540447985108073, + "learning_rate": 1.6809905230487424e-06, + "loss": 0.5285, + "step": 13738 + }, + { + "epoch": 3.647285278109651, + "grad_norm": 0.44950988158098654, + "learning_rate": 1.680660669673969e-06, + "loss": 0.5581, + "step": 13739 + }, + { + "epoch": 3.6475507765830346, + "grad_norm": 0.44480987406676287, + "learning_rate": 1.6803308322783186e-06, + "loss": 0.4945, + "step": 13740 + }, + { + "epoch": 3.6478162750564183, + "grad_norm": 0.4437292871157782, + "learning_rate": 1.6800010108682235e-06, + "loss": 0.5191, + "step": 13741 + }, + { + "epoch": 3.6480817735298023, + "grad_norm": 0.46783100596878574, + "learning_rate": 1.679671205450117e-06, + "loss": 0.5154, + "step": 13742 + }, + { + "epoch": 3.648347272003186, + "grad_norm": 0.4662660159818644, + "learning_rate": 1.6793414160304306e-06, + "loss": 0.5106, + "step": 13743 + }, + { + "epoch": 3.6486127704765696, + "grad_norm": 0.45063723228620567, + "learning_rate": 1.6790116426155952e-06, + "loss": 0.5191, + "step": 13744 + }, + { + "epoch": 3.6488782689499537, + "grad_norm": 0.47530364731471897, + "learning_rate": 1.6786818852120434e-06, + "loss": 0.5571, + "step": 13745 + }, + { + "epoch": 3.6491437674233373, + "grad_norm": 0.45783278452546355, + "learning_rate": 1.6783521438262055e-06, + "loss": 0.5163, + "step": 13746 + }, + { + "epoch": 3.649409265896721, + "grad_norm": 0.4658339520731718, + "learning_rate": 1.6780224184645125e-06, + "loss": 0.5324, + "step": 13747 + }, + { + "epoch": 3.649674764370105, + "grad_norm": 0.46906537898002826, + "learning_rate": 1.677692709133396e-06, + "loss": 0.5195, + "step": 13748 + }, + { + "epoch": 3.6499402628434887, + "grad_norm": 0.4796470923598252, + "learning_rate": 1.6773630158392839e-06, + "loss": 0.5642, + "step": 13749 + }, + { + "epoch": 3.6502057613168724, + "grad_norm": 0.4624525968299761, + "learning_rate": 1.6770333385886072e-06, + "loss": 0.5394, + "step": 13750 + }, + { + "epoch": 3.6504712597902564, + "grad_norm": 0.46916695987056584, + "learning_rate": 1.6767036773877955e-06, + "loss": 0.533, + "step": 13751 + }, + { + "epoch": 3.65073675826364, + "grad_norm": 0.4601153019971286, + "learning_rate": 1.6763740322432774e-06, + "loss": 0.5526, + "step": 13752 + }, + { + "epoch": 3.6510022567370237, + "grad_norm": 0.4666390969867862, + "learning_rate": 1.6760444031614825e-06, + "loss": 0.5158, + "step": 13753 + }, + { + "epoch": 3.651267755210408, + "grad_norm": 0.48048271925305885, + "learning_rate": 1.6757147901488402e-06, + "loss": 0.5817, + "step": 13754 + }, + { + "epoch": 3.6515332536837914, + "grad_norm": 0.4568907473787726, + "learning_rate": 1.6753851932117767e-06, + "loss": 0.5532, + "step": 13755 + }, + { + "epoch": 3.651798752157175, + "grad_norm": 0.4656320036619044, + "learning_rate": 1.6750556123567213e-06, + "loss": 0.5276, + "step": 13756 + }, + { + "epoch": 3.6520642506305587, + "grad_norm": 0.48038360636326377, + "learning_rate": 1.6747260475901005e-06, + "loss": 0.5521, + "step": 13757 + }, + { + "epoch": 3.652329749103943, + "grad_norm": 0.4591990968948632, + "learning_rate": 1.6743964989183432e-06, + "loss": 0.5403, + "step": 13758 + }, + { + "epoch": 3.6525952475773265, + "grad_norm": 0.4497268732977616, + "learning_rate": 1.6740669663478752e-06, + "loss": 0.5411, + "step": 13759 + }, + { + "epoch": 3.65286074605071, + "grad_norm": 0.4695250918736584, + "learning_rate": 1.6737374498851242e-06, + "loss": 0.5799, + "step": 13760 + }, + { + "epoch": 3.6531262445240937, + "grad_norm": 0.47880899544657424, + "learning_rate": 1.6734079495365152e-06, + "loss": 0.509, + "step": 13761 + }, + { + "epoch": 3.653391742997478, + "grad_norm": 0.45150262748154263, + "learning_rate": 1.6730784653084753e-06, + "loss": 0.5422, + "step": 13762 + }, + { + "epoch": 3.6536572414708615, + "grad_norm": 0.46926536260451784, + "learning_rate": 1.6727489972074302e-06, + "loss": 0.5458, + "step": 13763 + }, + { + "epoch": 3.653922739944245, + "grad_norm": 0.46755137129013313, + "learning_rate": 1.6724195452398045e-06, + "loss": 0.5474, + "step": 13764 + }, + { + "epoch": 3.654188238417629, + "grad_norm": 0.475678818659334, + "learning_rate": 1.6720901094120242e-06, + "loss": 0.5475, + "step": 13765 + }, + { + "epoch": 3.654453736891013, + "grad_norm": 0.46702358092713714, + "learning_rate": 1.671760689730515e-06, + "loss": 0.5482, + "step": 13766 + }, + { + "epoch": 3.6547192353643965, + "grad_norm": 0.45767050178879953, + "learning_rate": 1.671431286201699e-06, + "loss": 0.564, + "step": 13767 + }, + { + "epoch": 3.6549847338377806, + "grad_norm": 0.46927537560672444, + "learning_rate": 1.6711018988320018e-06, + "loss": 0.5275, + "step": 13768 + }, + { + "epoch": 3.655250232311164, + "grad_norm": 0.47007928774544794, + "learning_rate": 1.670772527627847e-06, + "loss": 0.5544, + "step": 13769 + }, + { + "epoch": 3.655515730784548, + "grad_norm": 0.4621877853463103, + "learning_rate": 1.6704431725956582e-06, + "loss": 0.5267, + "step": 13770 + }, + { + "epoch": 3.655781229257932, + "grad_norm": 0.4719922100923226, + "learning_rate": 1.6701138337418588e-06, + "loss": 0.5462, + "step": 13771 + }, + { + "epoch": 3.6560467277313156, + "grad_norm": 0.4856879016586914, + "learning_rate": 1.6697845110728716e-06, + "loss": 0.5934, + "step": 13772 + }, + { + "epoch": 3.656312226204699, + "grad_norm": 0.46600035176760796, + "learning_rate": 1.669455204595119e-06, + "loss": 0.5879, + "step": 13773 + }, + { + "epoch": 3.6565777246780833, + "grad_norm": 0.473317566841543, + "learning_rate": 1.6691259143150234e-06, + "loss": 0.5468, + "step": 13774 + }, + { + "epoch": 3.656843223151467, + "grad_norm": 0.4700812442788767, + "learning_rate": 1.6687966402390072e-06, + "loss": 0.5505, + "step": 13775 + }, + { + "epoch": 3.6571087216248506, + "grad_norm": 0.4617394420070805, + "learning_rate": 1.668467382373491e-06, + "loss": 0.5372, + "step": 13776 + }, + { + "epoch": 3.6573742200982347, + "grad_norm": 0.46302034078613996, + "learning_rate": 1.6681381407248976e-06, + "loss": 0.5342, + "step": 13777 + }, + { + "epoch": 3.6576397185716183, + "grad_norm": 0.47138132180944026, + "learning_rate": 1.6678089152996466e-06, + "loss": 0.5757, + "step": 13778 + }, + { + "epoch": 3.657905217045002, + "grad_norm": 0.4743354747005439, + "learning_rate": 1.6674797061041592e-06, + "loss": 0.5375, + "step": 13779 + }, + { + "epoch": 3.658170715518386, + "grad_norm": 0.4672946190054541, + "learning_rate": 1.6671505131448562e-06, + "loss": 0.538, + "step": 13780 + }, + { + "epoch": 3.6584362139917697, + "grad_norm": 0.4672790473691408, + "learning_rate": 1.6668213364281571e-06, + "loss": 0.5306, + "step": 13781 + }, + { + "epoch": 3.6587017124651533, + "grad_norm": 0.47824961447080494, + "learning_rate": 1.666492175960482e-06, + "loss": 0.4846, + "step": 13782 + }, + { + "epoch": 3.658967210938537, + "grad_norm": 0.46018164315105453, + "learning_rate": 1.6661630317482514e-06, + "loss": 0.5169, + "step": 13783 + }, + { + "epoch": 3.659232709411921, + "grad_norm": 0.4672544641386213, + "learning_rate": 1.6658339037978822e-06, + "loss": 0.5224, + "step": 13784 + }, + { + "epoch": 3.6594982078853047, + "grad_norm": 0.4617404304902396, + "learning_rate": 1.665504792115794e-06, + "loss": 0.5311, + "step": 13785 + }, + { + "epoch": 3.6597637063586883, + "grad_norm": 0.46531970146958807, + "learning_rate": 1.665175696708406e-06, + "loss": 0.5393, + "step": 13786 + }, + { + "epoch": 3.660029204832072, + "grad_norm": 0.4728687915400536, + "learning_rate": 1.6648466175821354e-06, + "loss": 0.5558, + "step": 13787 + }, + { + "epoch": 3.660294703305456, + "grad_norm": 0.47411357878230637, + "learning_rate": 1.6645175547434008e-06, + "loss": 0.5703, + "step": 13788 + }, + { + "epoch": 3.6605602017788397, + "grad_norm": 0.4691443142671677, + "learning_rate": 1.66418850819862e-06, + "loss": 0.5641, + "step": 13789 + }, + { + "epoch": 3.6608257002522233, + "grad_norm": 0.46475373500301737, + "learning_rate": 1.6638594779542094e-06, + "loss": 0.5819, + "step": 13790 + }, + { + "epoch": 3.6610911987256074, + "grad_norm": 0.44387761698007133, + "learning_rate": 1.6635304640165858e-06, + "loss": 0.5553, + "step": 13791 + }, + { + "epoch": 3.661356697198991, + "grad_norm": 0.4810550724624136, + "learning_rate": 1.6632014663921663e-06, + "loss": 0.5656, + "step": 13792 + }, + { + "epoch": 3.6616221956723747, + "grad_norm": 0.4689449555235533, + "learning_rate": 1.6628724850873672e-06, + "loss": 0.5521, + "step": 13793 + }, + { + "epoch": 3.6618876941457588, + "grad_norm": 0.45270153457013607, + "learning_rate": 1.6625435201086043e-06, + "loss": 0.527, + "step": 13794 + }, + { + "epoch": 3.6621531926191424, + "grad_norm": 0.4640409986992459, + "learning_rate": 1.6622145714622945e-06, + "loss": 0.5276, + "step": 13795 + }, + { + "epoch": 3.662418691092526, + "grad_norm": 0.46032140187441206, + "learning_rate": 1.6618856391548501e-06, + "loss": 0.5382, + "step": 13796 + }, + { + "epoch": 3.66268418956591, + "grad_norm": 0.4593336761700628, + "learning_rate": 1.6615567231926883e-06, + "loss": 0.5715, + "step": 13797 + }, + { + "epoch": 3.6629496880392938, + "grad_norm": 0.4650367964551919, + "learning_rate": 1.6612278235822226e-06, + "loss": 0.5569, + "step": 13798 + }, + { + "epoch": 3.6632151865126774, + "grad_norm": 0.4759064884383785, + "learning_rate": 1.6608989403298687e-06, + "loss": 0.5341, + "step": 13799 + }, + { + "epoch": 3.6634806849860615, + "grad_norm": 0.45127955764380284, + "learning_rate": 1.6605700734420404e-06, + "loss": 0.5545, + "step": 13800 + }, + { + "epoch": 3.663746183459445, + "grad_norm": 0.47173306366153084, + "learning_rate": 1.6602412229251506e-06, + "loss": 0.5589, + "step": 13801 + }, + { + "epoch": 3.664011681932829, + "grad_norm": 0.45609750597281096, + "learning_rate": 1.6599123887856125e-06, + "loss": 0.5559, + "step": 13802 + }, + { + "epoch": 3.664277180406213, + "grad_norm": 0.44657638749684, + "learning_rate": 1.6595835710298399e-06, + "loss": 0.5483, + "step": 13803 + }, + { + "epoch": 3.6645426788795965, + "grad_norm": 0.4693042614205532, + "learning_rate": 1.6592547696642454e-06, + "loss": 0.5263, + "step": 13804 + }, + { + "epoch": 3.66480817735298, + "grad_norm": 0.4673434022835937, + "learning_rate": 1.6589259846952416e-06, + "loss": 0.5281, + "step": 13805 + }, + { + "epoch": 3.6650736758263642, + "grad_norm": 0.44618661743463467, + "learning_rate": 1.6585972161292407e-06, + "loss": 0.5651, + "step": 13806 + }, + { + "epoch": 3.665339174299748, + "grad_norm": 0.47838916677146576, + "learning_rate": 1.6582684639726538e-06, + "loss": 0.5778, + "step": 13807 + }, + { + "epoch": 3.6656046727731315, + "grad_norm": 0.44572004513080526, + "learning_rate": 1.6579397282318924e-06, + "loss": 0.5453, + "step": 13808 + }, + { + "epoch": 3.6658701712465156, + "grad_norm": 0.46079325263267695, + "learning_rate": 1.6576110089133685e-06, + "loss": 0.5757, + "step": 13809 + }, + { + "epoch": 3.6661356697198992, + "grad_norm": 0.47767259239488713, + "learning_rate": 1.6572823060234922e-06, + "loss": 0.5318, + "step": 13810 + }, + { + "epoch": 3.666401168193283, + "grad_norm": 0.44752203724949646, + "learning_rate": 1.6569536195686742e-06, + "loss": 0.5669, + "step": 13811 + }, + { + "epoch": 3.6666666666666665, + "grad_norm": 0.4481408048638044, + "learning_rate": 1.6566249495553257e-06, + "loss": 0.5674, + "step": 13812 + }, + { + "epoch": 3.6669321651400506, + "grad_norm": 0.4583911188615994, + "learning_rate": 1.656296295989855e-06, + "loss": 0.5085, + "step": 13813 + }, + { + "epoch": 3.6671976636134342, + "grad_norm": 0.46124650894906316, + "learning_rate": 1.6559676588786719e-06, + "loss": 0.564, + "step": 13814 + }, + { + "epoch": 3.667463162086818, + "grad_norm": 0.4636484665243648, + "learning_rate": 1.6556390382281863e-06, + "loss": 0.5378, + "step": 13815 + }, + { + "epoch": 3.6677286605602015, + "grad_norm": 0.46431658553635075, + "learning_rate": 1.6553104340448064e-06, + "loss": 0.5128, + "step": 13816 + }, + { + "epoch": 3.6679941590335856, + "grad_norm": 0.4595474444114003, + "learning_rate": 1.6549818463349414e-06, + "loss": 0.5736, + "step": 13817 + }, + { + "epoch": 3.6682596575069693, + "grad_norm": 0.47041724140920504, + "learning_rate": 1.654653275105e-06, + "loss": 0.5793, + "step": 13818 + }, + { + "epoch": 3.668525155980353, + "grad_norm": 0.46345124598520104, + "learning_rate": 1.6543247203613888e-06, + "loss": 0.5654, + "step": 13819 + }, + { + "epoch": 3.668790654453737, + "grad_norm": 0.4588294040718161, + "learning_rate": 1.653996182110516e-06, + "loss": 0.5401, + "step": 13820 + }, + { + "epoch": 3.6690561529271206, + "grad_norm": 0.45203161847891504, + "learning_rate": 1.6536676603587895e-06, + "loss": 0.5767, + "step": 13821 + }, + { + "epoch": 3.6693216514005043, + "grad_norm": 0.47595138219758726, + "learning_rate": 1.6533391551126155e-06, + "loss": 0.5463, + "step": 13822 + }, + { + "epoch": 3.6695871498738883, + "grad_norm": 0.46340827814896357, + "learning_rate": 1.6530106663784014e-06, + "loss": 0.5431, + "step": 13823 + }, + { + "epoch": 3.669852648347272, + "grad_norm": 0.47845663224381507, + "learning_rate": 1.6526821941625527e-06, + "loss": 0.5534, + "step": 13824 + }, + { + "epoch": 3.6701181468206556, + "grad_norm": 0.4640173704946256, + "learning_rate": 1.652353738471476e-06, + "loss": 0.5181, + "step": 13825 + }, + { + "epoch": 3.6703836452940397, + "grad_norm": 0.4463265522923325, + "learning_rate": 1.6520252993115764e-06, + "loss": 0.527, + "step": 13826 + }, + { + "epoch": 3.6706491437674234, + "grad_norm": 0.4769353909832081, + "learning_rate": 1.65169687668926e-06, + "loss": 0.5346, + "step": 13827 + }, + { + "epoch": 3.670914642240807, + "grad_norm": 0.47438968445297885, + "learning_rate": 1.6513684706109311e-06, + "loss": 0.5638, + "step": 13828 + }, + { + "epoch": 3.671180140714191, + "grad_norm": 0.4815962358437599, + "learning_rate": 1.6510400810829963e-06, + "loss": 0.5452, + "step": 13829 + }, + { + "epoch": 3.6714456391875747, + "grad_norm": 0.45755692206997206, + "learning_rate": 1.6507117081118573e-06, + "loss": 0.5519, + "step": 13830 + }, + { + "epoch": 3.6717111376609584, + "grad_norm": 0.4712517847688879, + "learning_rate": 1.6503833517039195e-06, + "loss": 0.5438, + "step": 13831 + }, + { + "epoch": 3.6719766361343424, + "grad_norm": 0.47004399864078256, + "learning_rate": 1.6500550118655864e-06, + "loss": 0.5115, + "step": 13832 + }, + { + "epoch": 3.672242134607726, + "grad_norm": 0.46609737019115044, + "learning_rate": 1.6497266886032618e-06, + "loss": 0.5258, + "step": 13833 + }, + { + "epoch": 3.6725076330811097, + "grad_norm": 0.47863831508876836, + "learning_rate": 1.6493983819233483e-06, + "loss": 0.5779, + "step": 13834 + }, + { + "epoch": 3.672773131554494, + "grad_norm": 0.45920612412700157, + "learning_rate": 1.64907009183225e-06, + "loss": 0.5617, + "step": 13835 + }, + { + "epoch": 3.6730386300278774, + "grad_norm": 0.4675138351061314, + "learning_rate": 1.6487418183363674e-06, + "loss": 0.5699, + "step": 13836 + }, + { + "epoch": 3.673304128501261, + "grad_norm": 0.4458090676072325, + "learning_rate": 1.6484135614421037e-06, + "loss": 0.551, + "step": 13837 + }, + { + "epoch": 3.6735696269746447, + "grad_norm": 0.46268100456788447, + "learning_rate": 1.6480853211558606e-06, + "loss": 0.5222, + "step": 13838 + }, + { + "epoch": 3.673835125448029, + "grad_norm": 0.45726517388185506, + "learning_rate": 1.6477570974840396e-06, + "loss": 0.5345, + "step": 13839 + }, + { + "epoch": 3.6741006239214125, + "grad_norm": 0.46255457686740953, + "learning_rate": 1.6474288904330416e-06, + "loss": 0.5962, + "step": 13840 + }, + { + "epoch": 3.674366122394796, + "grad_norm": 0.46181795465539005, + "learning_rate": 1.6471007000092692e-06, + "loss": 0.5185, + "step": 13841 + }, + { + "epoch": 3.6746316208681797, + "grad_norm": 0.4549028654235298, + "learning_rate": 1.6467725262191197e-06, + "loss": 0.5227, + "step": 13842 + }, + { + "epoch": 3.674897119341564, + "grad_norm": 0.4816546919531103, + "learning_rate": 1.6464443690689952e-06, + "loss": 0.5642, + "step": 13843 + }, + { + "epoch": 3.6751626178149475, + "grad_norm": 0.4546686745562164, + "learning_rate": 1.6461162285652954e-06, + "loss": 0.5461, + "step": 13844 + }, + { + "epoch": 3.675428116288331, + "grad_norm": 0.45597902204896146, + "learning_rate": 1.6457881047144202e-06, + "loss": 0.5377, + "step": 13845 + }, + { + "epoch": 3.675693614761715, + "grad_norm": 0.4751503934380114, + "learning_rate": 1.6454599975227693e-06, + "loss": 0.5724, + "step": 13846 + }, + { + "epoch": 3.675959113235099, + "grad_norm": 0.45704125522101935, + "learning_rate": 1.6451319069967397e-06, + "loss": 0.54, + "step": 13847 + }, + { + "epoch": 3.6762246117084825, + "grad_norm": 0.4761212291809819, + "learning_rate": 1.6448038331427307e-06, + "loss": 0.5376, + "step": 13848 + }, + { + "epoch": 3.6764901101818666, + "grad_norm": 0.47287030099643695, + "learning_rate": 1.6444757759671412e-06, + "loss": 0.5593, + "step": 13849 + }, + { + "epoch": 3.67675560865525, + "grad_norm": 0.4606940443573965, + "learning_rate": 1.6441477354763685e-06, + "loss": 0.5558, + "step": 13850 + }, + { + "epoch": 3.677021107128634, + "grad_norm": 0.458971606590005, + "learning_rate": 1.6438197116768107e-06, + "loss": 0.5474, + "step": 13851 + }, + { + "epoch": 3.677286605602018, + "grad_norm": 0.48013030574761045, + "learning_rate": 1.6434917045748652e-06, + "loss": 0.5691, + "step": 13852 + }, + { + "epoch": 3.6775521040754016, + "grad_norm": 0.4891441748302624, + "learning_rate": 1.643163714176928e-06, + "loss": 0.5661, + "step": 13853 + }, + { + "epoch": 3.677817602548785, + "grad_norm": 0.4632434781297104, + "learning_rate": 1.6428357404893964e-06, + "loss": 0.5204, + "step": 13854 + }, + { + "epoch": 3.6780831010221693, + "grad_norm": 0.44504795363461613, + "learning_rate": 1.6425077835186666e-06, + "loss": 0.5123, + "step": 13855 + }, + { + "epoch": 3.678348599495553, + "grad_norm": 0.4612803336897518, + "learning_rate": 1.6421798432711344e-06, + "loss": 0.572, + "step": 13856 + }, + { + "epoch": 3.6786140979689366, + "grad_norm": 0.4955079775505199, + "learning_rate": 1.6418519197531953e-06, + "loss": 0.5494, + "step": 13857 + }, + { + "epoch": 3.6788795964423207, + "grad_norm": 0.468319232696857, + "learning_rate": 1.6415240129712462e-06, + "loss": 0.5343, + "step": 13858 + }, + { + "epoch": 3.6791450949157043, + "grad_norm": 0.4570726011425613, + "learning_rate": 1.6411961229316797e-06, + "loss": 0.5401, + "step": 13859 + }, + { + "epoch": 3.679410593389088, + "grad_norm": 0.48574608820282317, + "learning_rate": 1.6408682496408918e-06, + "loss": 0.5688, + "step": 13860 + }, + { + "epoch": 3.679676091862472, + "grad_norm": 0.45720660188781775, + "learning_rate": 1.640540393105276e-06, + "loss": 0.5418, + "step": 13861 + }, + { + "epoch": 3.6799415903358557, + "grad_norm": 0.4530380208607555, + "learning_rate": 1.640212553331227e-06, + "loss": 0.5411, + "step": 13862 + }, + { + "epoch": 3.6802070888092393, + "grad_norm": 0.46057470830283, + "learning_rate": 1.6398847303251385e-06, + "loss": 0.5231, + "step": 13863 + }, + { + "epoch": 3.6804725872826234, + "grad_norm": 0.45520904342865004, + "learning_rate": 1.6395569240934042e-06, + "loss": 0.5154, + "step": 13864 + }, + { + "epoch": 3.680738085756007, + "grad_norm": 0.4727417815262151, + "learning_rate": 1.639229134642416e-06, + "loss": 0.5682, + "step": 13865 + }, + { + "epoch": 3.6810035842293907, + "grad_norm": 0.4504506264367193, + "learning_rate": 1.6389013619785673e-06, + "loss": 0.5716, + "step": 13866 + }, + { + "epoch": 3.6812690827027743, + "grad_norm": 0.46328484284373306, + "learning_rate": 1.6385736061082502e-06, + "loss": 0.514, + "step": 13867 + }, + { + "epoch": 3.6815345811761584, + "grad_norm": 0.47518937869733086, + "learning_rate": 1.638245867037857e-06, + "loss": 0.5499, + "step": 13868 + }, + { + "epoch": 3.681800079649542, + "grad_norm": 0.43831306084815297, + "learning_rate": 1.6379181447737796e-06, + "loss": 0.556, + "step": 13869 + }, + { + "epoch": 3.6820655781229257, + "grad_norm": 0.46597164407659364, + "learning_rate": 1.6375904393224095e-06, + "loss": 0.5626, + "step": 13870 + }, + { + "epoch": 3.6823310765963093, + "grad_norm": 0.4724242796507134, + "learning_rate": 1.6372627506901368e-06, + "loss": 0.5574, + "step": 13871 + }, + { + "epoch": 3.6825965750696934, + "grad_norm": 0.44406006482576116, + "learning_rate": 1.6369350788833526e-06, + "loss": 0.5441, + "step": 13872 + }, + { + "epoch": 3.682862073543077, + "grad_norm": 0.4746675405411556, + "learning_rate": 1.6366074239084478e-06, + "loss": 0.5405, + "step": 13873 + }, + { + "epoch": 3.6831275720164607, + "grad_norm": 0.45067043633564874, + "learning_rate": 1.6362797857718121e-06, + "loss": 0.5311, + "step": 13874 + }, + { + "epoch": 3.6833930704898448, + "grad_norm": 0.4715372294439943, + "learning_rate": 1.6359521644798363e-06, + "loss": 0.5624, + "step": 13875 + }, + { + "epoch": 3.6836585689632284, + "grad_norm": 0.4641625166813518, + "learning_rate": 1.6356245600389076e-06, + "loss": 0.5546, + "step": 13876 + }, + { + "epoch": 3.683924067436612, + "grad_norm": 0.4841146630617197, + "learning_rate": 1.6352969724554169e-06, + "loss": 0.5753, + "step": 13877 + }, + { + "epoch": 3.684189565909996, + "grad_norm": 0.47600627256485334, + "learning_rate": 1.634969401735752e-06, + "loss": 0.5089, + "step": 13878 + }, + { + "epoch": 3.6844550643833798, + "grad_norm": 0.4388927895339281, + "learning_rate": 1.6346418478863017e-06, + "loss": 0.5042, + "step": 13879 + }, + { + "epoch": 3.6847205628567634, + "grad_norm": 0.45746641196419136, + "learning_rate": 1.634314310913454e-06, + "loss": 0.5546, + "step": 13880 + }, + { + "epoch": 3.6849860613301475, + "grad_norm": 0.4771462424705586, + "learning_rate": 1.6339867908235975e-06, + "loss": 0.5552, + "step": 13881 + }, + { + "epoch": 3.685251559803531, + "grad_norm": 0.4712223924104759, + "learning_rate": 1.6336592876231187e-06, + "loss": 0.5445, + "step": 13882 + }, + { + "epoch": 3.685517058276915, + "grad_norm": 0.44977422498067143, + "learning_rate": 1.6333318013184047e-06, + "loss": 0.5497, + "step": 13883 + }, + { + "epoch": 3.685782556750299, + "grad_norm": 0.46828108350791414, + "learning_rate": 1.6330043319158425e-06, + "loss": 0.5554, + "step": 13884 + }, + { + "epoch": 3.6860480552236825, + "grad_norm": 0.46552111808760377, + "learning_rate": 1.6326768794218187e-06, + "loss": 0.5647, + "step": 13885 + }, + { + "epoch": 3.686313553697066, + "grad_norm": 0.4737033665491398, + "learning_rate": 1.6323494438427193e-06, + "loss": 0.5423, + "step": 13886 + }, + { + "epoch": 3.6865790521704502, + "grad_norm": 0.4724461744141565, + "learning_rate": 1.6320220251849312e-06, + "loss": 0.5748, + "step": 13887 + }, + { + "epoch": 3.686844550643834, + "grad_norm": 0.43996545078195703, + "learning_rate": 1.6316946234548375e-06, + "loss": 0.5245, + "step": 13888 + }, + { + "epoch": 3.6871100491172175, + "grad_norm": 0.4638149328901893, + "learning_rate": 1.6313672386588245e-06, + "loss": 0.5296, + "step": 13889 + }, + { + "epoch": 3.6873755475906016, + "grad_norm": 0.4601278630077606, + "learning_rate": 1.6310398708032773e-06, + "loss": 0.5623, + "step": 13890 + }, + { + "epoch": 3.6876410460639852, + "grad_norm": 0.4769117955376731, + "learning_rate": 1.6307125198945806e-06, + "loss": 0.5181, + "step": 13891 + }, + { + "epoch": 3.687906544537369, + "grad_norm": 0.46463664186935766, + "learning_rate": 1.630385185939118e-06, + "loss": 0.5392, + "step": 13892 + }, + { + "epoch": 3.688172043010753, + "grad_norm": 0.466850451238197, + "learning_rate": 1.6300578689432742e-06, + "loss": 0.5566, + "step": 13893 + }, + { + "epoch": 3.6884375414841366, + "grad_norm": 0.48184318201135246, + "learning_rate": 1.629730568913431e-06, + "loss": 0.5546, + "step": 13894 + }, + { + "epoch": 3.6887030399575202, + "grad_norm": 0.4577660396965125, + "learning_rate": 1.6294032858559727e-06, + "loss": 0.5497, + "step": 13895 + }, + { + "epoch": 3.688968538430904, + "grad_norm": 0.45783928710687316, + "learning_rate": 1.6290760197772818e-06, + "loss": 0.5568, + "step": 13896 + }, + { + "epoch": 3.6892340369042875, + "grad_norm": 0.4628803253871579, + "learning_rate": 1.6287487706837407e-06, + "loss": 0.5456, + "step": 13897 + }, + { + "epoch": 3.6894995353776716, + "grad_norm": 0.4635932855947962, + "learning_rate": 1.6284215385817326e-06, + "loss": 0.537, + "step": 13898 + }, + { + "epoch": 3.6897650338510553, + "grad_norm": 0.47520761872809547, + "learning_rate": 1.6280943234776378e-06, + "loss": 0.5488, + "step": 13899 + }, + { + "epoch": 3.690030532324439, + "grad_norm": 0.4754631243992757, + "learning_rate": 1.6277671253778387e-06, + "loss": 0.5216, + "step": 13900 + }, + { + "epoch": 3.690296030797823, + "grad_norm": 0.46259647229771034, + "learning_rate": 1.627439944288716e-06, + "loss": 0.5298, + "step": 13901 + }, + { + "epoch": 3.6905615292712066, + "grad_norm": 0.4585881187975157, + "learning_rate": 1.627112780216651e-06, + "loss": 0.5369, + "step": 13902 + }, + { + "epoch": 3.6908270277445903, + "grad_norm": 0.45425879226152227, + "learning_rate": 1.6267856331680237e-06, + "loss": 0.5279, + "step": 13903 + }, + { + "epoch": 3.6910925262179743, + "grad_norm": 0.46521762478199624, + "learning_rate": 1.6264585031492158e-06, + "loss": 0.5847, + "step": 13904 + }, + { + "epoch": 3.691358024691358, + "grad_norm": 0.4660984073751585, + "learning_rate": 1.6261313901666047e-06, + "loss": 0.5332, + "step": 13905 + }, + { + "epoch": 3.6916235231647416, + "grad_norm": 0.48665427669182987, + "learning_rate": 1.6258042942265712e-06, + "loss": 0.5345, + "step": 13906 + }, + { + "epoch": 3.6918890216381257, + "grad_norm": 0.45131391718563035, + "learning_rate": 1.6254772153354943e-06, + "loss": 0.5276, + "step": 13907 + }, + { + "epoch": 3.6921545201115094, + "grad_norm": 0.4649475060263303, + "learning_rate": 1.6251501534997529e-06, + "loss": 0.5486, + "step": 13908 + }, + { + "epoch": 3.692420018584893, + "grad_norm": 0.46623062789875286, + "learning_rate": 1.6248231087257254e-06, + "loss": 0.5944, + "step": 13909 + }, + { + "epoch": 3.692685517058277, + "grad_norm": 0.4682689806776586, + "learning_rate": 1.624496081019791e-06, + "loss": 0.5826, + "step": 13910 + }, + { + "epoch": 3.6929510155316607, + "grad_norm": 0.45466535491127313, + "learning_rate": 1.624169070388326e-06, + "loss": 0.5555, + "step": 13911 + }, + { + "epoch": 3.6932165140050444, + "grad_norm": 0.4609226118457142, + "learning_rate": 1.6238420768377084e-06, + "loss": 0.5804, + "step": 13912 + }, + { + "epoch": 3.6934820124784284, + "grad_norm": 0.4618726155874749, + "learning_rate": 1.6235151003743154e-06, + "loss": 0.5144, + "step": 13913 + }, + { + "epoch": 3.693747510951812, + "grad_norm": 0.4747378174786819, + "learning_rate": 1.623188141004524e-06, + "loss": 0.5367, + "step": 13914 + }, + { + "epoch": 3.6940130094251957, + "grad_norm": 0.4656448617720923, + "learning_rate": 1.6228611987347107e-06, + "loss": 0.577, + "step": 13915 + }, + { + "epoch": 3.69427850789858, + "grad_norm": 0.4821937949863833, + "learning_rate": 1.622534273571252e-06, + "loss": 0.5548, + "step": 13916 + }, + { + "epoch": 3.6945440063719635, + "grad_norm": 0.47272017190138055, + "learning_rate": 1.6222073655205232e-06, + "loss": 0.5346, + "step": 13917 + }, + { + "epoch": 3.694809504845347, + "grad_norm": 0.48947600039813927, + "learning_rate": 1.6218804745888996e-06, + "loss": 0.5496, + "step": 13918 + }, + { + "epoch": 3.695075003318731, + "grad_norm": 0.4628365735585339, + "learning_rate": 1.621553600782757e-06, + "loss": 0.5553, + "step": 13919 + }, + { + "epoch": 3.695340501792115, + "grad_norm": 0.4671841927704189, + "learning_rate": 1.62122674410847e-06, + "loss": 0.5132, + "step": 13920 + }, + { + "epoch": 3.6956060002654985, + "grad_norm": 0.46874744754035863, + "learning_rate": 1.620899904572414e-06, + "loss": 0.5518, + "step": 13921 + }, + { + "epoch": 3.695871498738882, + "grad_norm": 0.4611338121045755, + "learning_rate": 1.6205730821809611e-06, + "loss": 0.546, + "step": 13922 + }, + { + "epoch": 3.696136997212266, + "grad_norm": 0.46299008145604237, + "learning_rate": 1.6202462769404866e-06, + "loss": 0.5435, + "step": 13923 + }, + { + "epoch": 3.69640249568565, + "grad_norm": 0.4674528864076, + "learning_rate": 1.619919488857364e-06, + "loss": 0.5446, + "step": 13924 + }, + { + "epoch": 3.6966679941590335, + "grad_norm": 0.4573710094174378, + "learning_rate": 1.6195927179379656e-06, + "loss": 0.5627, + "step": 13925 + }, + { + "epoch": 3.696933492632417, + "grad_norm": 0.45716538728410083, + "learning_rate": 1.6192659641886648e-06, + "loss": 0.5618, + "step": 13926 + }, + { + "epoch": 3.697198991105801, + "grad_norm": 0.4749133491026972, + "learning_rate": 1.618939227615835e-06, + "loss": 0.5494, + "step": 13927 + }, + { + "epoch": 3.697464489579185, + "grad_norm": 0.4580850014047163, + "learning_rate": 1.6186125082258466e-06, + "loss": 0.5794, + "step": 13928 + }, + { + "epoch": 3.6977299880525685, + "grad_norm": 0.49375317023769816, + "learning_rate": 1.6182858060250726e-06, + "loss": 0.5403, + "step": 13929 + }, + { + "epoch": 3.6979954865259526, + "grad_norm": 0.45791496821629646, + "learning_rate": 1.6179591210198842e-06, + "loss": 0.5713, + "step": 13930 + }, + { + "epoch": 3.698260984999336, + "grad_norm": 0.46493461749910653, + "learning_rate": 1.617632453216652e-06, + "loss": 0.5328, + "step": 13931 + }, + { + "epoch": 3.69852648347272, + "grad_norm": 0.46741895580898485, + "learning_rate": 1.617305802621748e-06, + "loss": 0.5322, + "step": 13932 + }, + { + "epoch": 3.698791981946104, + "grad_norm": 0.48606468392420066, + "learning_rate": 1.616979169241543e-06, + "loss": 0.5633, + "step": 13933 + }, + { + "epoch": 3.6990574804194876, + "grad_norm": 0.46635827365048776, + "learning_rate": 1.6166525530824045e-06, + "loss": 0.4823, + "step": 13934 + }, + { + "epoch": 3.699322978892871, + "grad_norm": 0.483742766973483, + "learning_rate": 1.6163259541507045e-06, + "loss": 0.5252, + "step": 13935 + }, + { + "epoch": 3.6995884773662553, + "grad_norm": 0.48002758390466216, + "learning_rate": 1.6159993724528123e-06, + "loss": 0.548, + "step": 13936 + }, + { + "epoch": 3.699853975839639, + "grad_norm": 0.4803884467536995, + "learning_rate": 1.6156728079950962e-06, + "loss": 0.5465, + "step": 13937 + }, + { + "epoch": 3.7001194743130226, + "grad_norm": 0.4569002973412173, + "learning_rate": 1.615346260783926e-06, + "loss": 0.5542, + "step": 13938 + }, + { + "epoch": 3.7003849727864067, + "grad_norm": 0.46924788082767865, + "learning_rate": 1.6150197308256703e-06, + "loss": 0.557, + "step": 13939 + }, + { + "epoch": 3.7006504712597903, + "grad_norm": 0.47314279083053973, + "learning_rate": 1.6146932181266962e-06, + "loss": 0.5436, + "step": 13940 + }, + { + "epoch": 3.700915969733174, + "grad_norm": 0.5313183201914511, + "learning_rate": 1.6143667226933718e-06, + "loss": 0.5339, + "step": 13941 + }, + { + "epoch": 3.701181468206558, + "grad_norm": 0.47921918196469343, + "learning_rate": 1.6140402445320646e-06, + "loss": 0.512, + "step": 13942 + }, + { + "epoch": 3.7014469666799417, + "grad_norm": 0.46459005172820533, + "learning_rate": 1.6137137836491417e-06, + "loss": 0.54, + "step": 13943 + }, + { + "epoch": 3.7017124651533253, + "grad_norm": 0.47263746969667975, + "learning_rate": 1.613387340050971e-06, + "loss": 0.5305, + "step": 13944 + }, + { + "epoch": 3.7019779636267094, + "grad_norm": 0.47009001472941886, + "learning_rate": 1.613060913743917e-06, + "loss": 0.5388, + "step": 13945 + }, + { + "epoch": 3.702243462100093, + "grad_norm": 0.47743816368702907, + "learning_rate": 1.6127345047343467e-06, + "loss": 0.5085, + "step": 13946 + }, + { + "epoch": 3.7025089605734767, + "grad_norm": 0.47882869565384395, + "learning_rate": 1.6124081130286262e-06, + "loss": 0.5483, + "step": 13947 + }, + { + "epoch": 3.7027744590468608, + "grad_norm": 0.4846219203354417, + "learning_rate": 1.6120817386331205e-06, + "loss": 0.5357, + "step": 13948 + }, + { + "epoch": 3.7030399575202444, + "grad_norm": 0.4611814051502773, + "learning_rate": 1.6117553815541952e-06, + "loss": 0.5603, + "step": 13949 + }, + { + "epoch": 3.703305455993628, + "grad_norm": 0.4741688231407391, + "learning_rate": 1.6114290417982156e-06, + "loss": 0.5532, + "step": 13950 + }, + { + "epoch": 3.7035709544670117, + "grad_norm": 0.4624028702997076, + "learning_rate": 1.6111027193715446e-06, + "loss": 0.5571, + "step": 13951 + }, + { + "epoch": 3.7038364529403953, + "grad_norm": 0.4792280960525569, + "learning_rate": 1.6107764142805465e-06, + "loss": 0.5495, + "step": 13952 + }, + { + "epoch": 3.7041019514137794, + "grad_norm": 0.46827108651071253, + "learning_rate": 1.6104501265315856e-06, + "loss": 0.526, + "step": 13953 + }, + { + "epoch": 3.704367449887163, + "grad_norm": 0.4513240760024143, + "learning_rate": 1.6101238561310254e-06, + "loss": 0.5258, + "step": 13954 + }, + { + "epoch": 3.7046329483605467, + "grad_norm": 0.4817332644741952, + "learning_rate": 1.609797603085228e-06, + "loss": 0.5643, + "step": 13955 + }, + { + "epoch": 3.7048984468339308, + "grad_norm": 0.4706657004875287, + "learning_rate": 1.6094713674005587e-06, + "loss": 0.5345, + "step": 13956 + }, + { + "epoch": 3.7051639453073144, + "grad_norm": 0.47702912757364696, + "learning_rate": 1.609145149083377e-06, + "loss": 0.5401, + "step": 13957 + }, + { + "epoch": 3.705429443780698, + "grad_norm": 0.45719426570333266, + "learning_rate": 1.6088189481400462e-06, + "loss": 0.5641, + "step": 13958 + }, + { + "epoch": 3.705694942254082, + "grad_norm": 0.4717978058671165, + "learning_rate": 1.608492764576928e-06, + "loss": 0.5848, + "step": 13959 + }, + { + "epoch": 3.7059604407274658, + "grad_norm": 0.4558463698394017, + "learning_rate": 1.6081665984003837e-06, + "loss": 0.5389, + "step": 13960 + }, + { + "epoch": 3.7062259392008494, + "grad_norm": 0.45039616958114526, + "learning_rate": 1.6078404496167738e-06, + "loss": 0.5638, + "step": 13961 + }, + { + "epoch": 3.7064914376742335, + "grad_norm": 0.4726891741244302, + "learning_rate": 1.6075143182324605e-06, + "loss": 0.5647, + "step": 13962 + }, + { + "epoch": 3.706756936147617, + "grad_norm": 0.45377895141054436, + "learning_rate": 1.6071882042538027e-06, + "loss": 0.5183, + "step": 13963 + }, + { + "epoch": 3.707022434621001, + "grad_norm": 0.46712960183505264, + "learning_rate": 1.6068621076871605e-06, + "loss": 0.5696, + "step": 13964 + }, + { + "epoch": 3.707287933094385, + "grad_norm": 0.4827404860993805, + "learning_rate": 1.6065360285388942e-06, + "loss": 0.5841, + "step": 13965 + }, + { + "epoch": 3.7075534315677685, + "grad_norm": 0.4908811403818706, + "learning_rate": 1.606209966815363e-06, + "loss": 0.5721, + "step": 13966 + }, + { + "epoch": 3.707818930041152, + "grad_norm": 0.4788514392622545, + "learning_rate": 1.6058839225229267e-06, + "loss": 0.515, + "step": 13967 + }, + { + "epoch": 3.7080844285145362, + "grad_norm": 0.445737067469096, + "learning_rate": 1.6055578956679422e-06, + "loss": 0.5304, + "step": 13968 + }, + { + "epoch": 3.70834992698792, + "grad_norm": 0.45283892177050783, + "learning_rate": 1.6052318862567687e-06, + "loss": 0.5325, + "step": 13969 + }, + { + "epoch": 3.7086154254613035, + "grad_norm": 0.46370054133355104, + "learning_rate": 1.604905894295764e-06, + "loss": 0.5729, + "step": 13970 + }, + { + "epoch": 3.7088809239346876, + "grad_norm": 0.4555017536285113, + "learning_rate": 1.6045799197912859e-06, + "loss": 0.5112, + "step": 13971 + }, + { + "epoch": 3.7091464224080712, + "grad_norm": 0.45157386613356704, + "learning_rate": 1.6042539627496922e-06, + "loss": 0.5247, + "step": 13972 + }, + { + "epoch": 3.709411920881455, + "grad_norm": 0.4798367542277795, + "learning_rate": 1.6039280231773396e-06, + "loss": 0.5661, + "step": 13973 + }, + { + "epoch": 3.709677419354839, + "grad_norm": 0.4689371554252817, + "learning_rate": 1.6036021010805838e-06, + "loss": 0.5376, + "step": 13974 + }, + { + "epoch": 3.7099429178282226, + "grad_norm": 0.46109600438568477, + "learning_rate": 1.603276196465782e-06, + "loss": 0.5063, + "step": 13975 + }, + { + "epoch": 3.7102084163016062, + "grad_norm": 0.4686636423379695, + "learning_rate": 1.60295030933929e-06, + "loss": 0.5404, + "step": 13976 + }, + { + "epoch": 3.71047391477499, + "grad_norm": 0.45903305570379066, + "learning_rate": 1.602624439707463e-06, + "loss": 0.5235, + "step": 13977 + }, + { + "epoch": 3.710739413248374, + "grad_norm": 0.4503795922045036, + "learning_rate": 1.602298587576657e-06, + "loss": 0.5551, + "step": 13978 + }, + { + "epoch": 3.7110049117217576, + "grad_norm": 0.4683078318820563, + "learning_rate": 1.6019727529532275e-06, + "loss": 0.5351, + "step": 13979 + }, + { + "epoch": 3.7112704101951413, + "grad_norm": 0.47236374610325926, + "learning_rate": 1.601646935843526e-06, + "loss": 0.529, + "step": 13980 + }, + { + "epoch": 3.711535908668525, + "grad_norm": 0.45045476106818827, + "learning_rate": 1.60132113625391e-06, + "loss": 0.5361, + "step": 13981 + }, + { + "epoch": 3.711801407141909, + "grad_norm": 0.4818288768367668, + "learning_rate": 1.6009953541907316e-06, + "loss": 0.5337, + "step": 13982 + }, + { + "epoch": 3.7120669056152926, + "grad_norm": 0.4592896302201238, + "learning_rate": 1.6006695896603453e-06, + "loss": 0.5509, + "step": 13983 + }, + { + "epoch": 3.7123324040886763, + "grad_norm": 0.4578212135807717, + "learning_rate": 1.6003438426691037e-06, + "loss": 0.5238, + "step": 13984 + }, + { + "epoch": 3.7125979025620603, + "grad_norm": 0.4521174056743562, + "learning_rate": 1.600018113223361e-06, + "loss": 0.5155, + "step": 13985 + }, + { + "epoch": 3.712863401035444, + "grad_norm": 0.4596232729534214, + "learning_rate": 1.5996924013294678e-06, + "loss": 0.5349, + "step": 13986 + }, + { + "epoch": 3.7131288995088276, + "grad_norm": 0.4810597563683559, + "learning_rate": 1.5993667069937772e-06, + "loss": 0.5497, + "step": 13987 + }, + { + "epoch": 3.7133943979822117, + "grad_norm": 0.45672500074153605, + "learning_rate": 1.5990410302226405e-06, + "loss": 0.5394, + "step": 13988 + }, + { + "epoch": 3.7136598964555954, + "grad_norm": 0.48144456966661453, + "learning_rate": 1.5987153710224102e-06, + "loss": 0.5259, + "step": 13989 + }, + { + "epoch": 3.713925394928979, + "grad_norm": 0.4708938282933909, + "learning_rate": 1.5983897293994363e-06, + "loss": 0.5657, + "step": 13990 + }, + { + "epoch": 3.714190893402363, + "grad_norm": 0.4630481578459902, + "learning_rate": 1.5980641053600709e-06, + "loss": 0.5235, + "step": 13991 + }, + { + "epoch": 3.7144563918757467, + "grad_norm": 0.4662343235015243, + "learning_rate": 1.5977384989106634e-06, + "loss": 0.5381, + "step": 13992 + }, + { + "epoch": 3.7147218903491304, + "grad_norm": 0.4815318169614546, + "learning_rate": 1.5974129100575641e-06, + "loss": 0.5529, + "step": 13993 + }, + { + "epoch": 3.7149873888225144, + "grad_norm": 0.47144939258861923, + "learning_rate": 1.5970873388071232e-06, + "loss": 0.5261, + "step": 13994 + }, + { + "epoch": 3.715252887295898, + "grad_norm": 0.4754920297639651, + "learning_rate": 1.5967617851656897e-06, + "loss": 0.5585, + "step": 13995 + }, + { + "epoch": 3.7155183857692817, + "grad_norm": 0.4708984872778865, + "learning_rate": 1.596436249139614e-06, + "loss": 0.5071, + "step": 13996 + }, + { + "epoch": 3.715783884242666, + "grad_norm": 0.4775123442750793, + "learning_rate": 1.5961107307352428e-06, + "loss": 0.4857, + "step": 13997 + }, + { + "epoch": 3.7160493827160495, + "grad_norm": 0.47555086418049525, + "learning_rate": 1.5957852299589255e-06, + "loss": 0.5529, + "step": 13998 + }, + { + "epoch": 3.716314881189433, + "grad_norm": 0.4664510654874107, + "learning_rate": 1.5954597468170098e-06, + "loss": 0.5266, + "step": 13999 + }, + { + "epoch": 3.716580379662817, + "grad_norm": 0.4794408476235134, + "learning_rate": 1.595134281315844e-06, + "loss": 0.5527, + "step": 14000 + }, + { + "epoch": 3.716845878136201, + "grad_norm": 0.47434262143827643, + "learning_rate": 1.5948088334617745e-06, + "loss": 0.5611, + "step": 14001 + }, + { + "epoch": 3.7171113766095845, + "grad_norm": 0.4697606501569821, + "learning_rate": 1.5944834032611506e-06, + "loss": 0.5039, + "step": 14002 + }, + { + "epoch": 3.7173768750829685, + "grad_norm": 0.438691579646571, + "learning_rate": 1.5941579907203166e-06, + "loss": 0.5582, + "step": 14003 + }, + { + "epoch": 3.717642373556352, + "grad_norm": 0.47460392170986415, + "learning_rate": 1.5938325958456197e-06, + "loss": 0.5634, + "step": 14004 + }, + { + "epoch": 3.717907872029736, + "grad_norm": 0.45733436648954034, + "learning_rate": 1.5935072186434056e-06, + "loss": 0.5567, + "step": 14005 + }, + { + "epoch": 3.7181733705031195, + "grad_norm": 0.4653273471991448, + "learning_rate": 1.5931818591200203e-06, + "loss": 0.5398, + "step": 14006 + }, + { + "epoch": 3.718438868976503, + "grad_norm": 0.4624898457928785, + "learning_rate": 1.592856517281809e-06, + "loss": 0.5296, + "step": 14007 + }, + { + "epoch": 3.718704367449887, + "grad_norm": 0.47841349143065914, + "learning_rate": 1.5925311931351172e-06, + "loss": 0.542, + "step": 14008 + }, + { + "epoch": 3.718969865923271, + "grad_norm": 0.46830228555091596, + "learning_rate": 1.592205886686289e-06, + "loss": 0.5441, + "step": 14009 + }, + { + "epoch": 3.7192353643966545, + "grad_norm": 0.46408189201490874, + "learning_rate": 1.591880597941668e-06, + "loss": 0.523, + "step": 14010 + }, + { + "epoch": 3.7195008628700386, + "grad_norm": 0.4568198058342598, + "learning_rate": 1.5915553269075989e-06, + "loss": 0.5391, + "step": 14011 + }, + { + "epoch": 3.719766361343422, + "grad_norm": 0.47465780068653185, + "learning_rate": 1.5912300735904252e-06, + "loss": 0.5498, + "step": 14012 + }, + { + "epoch": 3.720031859816806, + "grad_norm": 0.464038828101832, + "learning_rate": 1.5909048379964904e-06, + "loss": 0.5524, + "step": 14013 + }, + { + "epoch": 3.72029735829019, + "grad_norm": 0.4713679855204704, + "learning_rate": 1.5905796201321378e-06, + "loss": 0.547, + "step": 14014 + }, + { + "epoch": 3.7205628567635736, + "grad_norm": 0.46960477873354917, + "learning_rate": 1.5902544200037084e-06, + "loss": 0.5594, + "step": 14015 + }, + { + "epoch": 3.720828355236957, + "grad_norm": 0.4630010464039946, + "learning_rate": 1.5899292376175451e-06, + "loss": 0.547, + "step": 14016 + }, + { + "epoch": 3.7210938537103413, + "grad_norm": 0.46599825242366705, + "learning_rate": 1.58960407297999e-06, + "loss": 0.5158, + "step": 14017 + }, + { + "epoch": 3.721359352183725, + "grad_norm": 0.464129095162835, + "learning_rate": 1.5892789260973846e-06, + "loss": 0.5323, + "step": 14018 + }, + { + "epoch": 3.7216248506571086, + "grad_norm": 0.45242721019960425, + "learning_rate": 1.5889537969760704e-06, + "loss": 0.5267, + "step": 14019 + }, + { + "epoch": 3.7218903491304927, + "grad_norm": 0.46224502955299124, + "learning_rate": 1.5886286856223876e-06, + "loss": 0.5948, + "step": 14020 + }, + { + "epoch": 3.7221558476038763, + "grad_norm": 0.47642409515405704, + "learning_rate": 1.5883035920426765e-06, + "loss": 0.537, + "step": 14021 + }, + { + "epoch": 3.72242134607726, + "grad_norm": 0.48979447676449406, + "learning_rate": 1.5879785162432777e-06, + "loss": 0.5458, + "step": 14022 + }, + { + "epoch": 3.722686844550644, + "grad_norm": 0.4716302142955136, + "learning_rate": 1.5876534582305306e-06, + "loss": 0.5297, + "step": 14023 + }, + { + "epoch": 3.7229523430240277, + "grad_norm": 0.45977962044863896, + "learning_rate": 1.587328418010775e-06, + "loss": 0.5173, + "step": 14024 + }, + { + "epoch": 3.7232178414974113, + "grad_norm": 0.4757333385130725, + "learning_rate": 1.5870033955903508e-06, + "loss": 0.5558, + "step": 14025 + }, + { + "epoch": 3.7234833399707954, + "grad_norm": 0.4647279888470987, + "learning_rate": 1.5866783909755945e-06, + "loss": 0.5472, + "step": 14026 + }, + { + "epoch": 3.723748838444179, + "grad_norm": 0.4595301093408153, + "learning_rate": 1.5863534041728463e-06, + "loss": 0.5745, + "step": 14027 + }, + { + "epoch": 3.7240143369175627, + "grad_norm": 0.4557547095475187, + "learning_rate": 1.5860284351884433e-06, + "loss": 0.5694, + "step": 14028 + }, + { + "epoch": 3.7242798353909468, + "grad_norm": 0.46598219163401927, + "learning_rate": 1.5857034840287238e-06, + "loss": 0.5221, + "step": 14029 + }, + { + "epoch": 3.7245453338643304, + "grad_norm": 0.46745037414065777, + "learning_rate": 1.5853785507000253e-06, + "loss": 0.5507, + "step": 14030 + }, + { + "epoch": 3.724810832337714, + "grad_norm": 0.4643586203810378, + "learning_rate": 1.585053635208685e-06, + "loss": 0.5454, + "step": 14031 + }, + { + "epoch": 3.7250763308110977, + "grad_norm": 0.4490529417547004, + "learning_rate": 1.5847287375610382e-06, + "loss": 0.5336, + "step": 14032 + }, + { + "epoch": 3.7253418292844818, + "grad_norm": 0.46334427494015706, + "learning_rate": 1.5844038577634218e-06, + "loss": 0.561, + "step": 14033 + }, + { + "epoch": 3.7256073277578654, + "grad_norm": 0.4773383910244735, + "learning_rate": 1.584078995822172e-06, + "loss": 0.591, + "step": 14034 + }, + { + "epoch": 3.725872826231249, + "grad_norm": 0.4521197587887517, + "learning_rate": 1.5837541517436244e-06, + "loss": 0.5427, + "step": 14035 + }, + { + "epoch": 3.7261383247046327, + "grad_norm": 0.47851141726519236, + "learning_rate": 1.5834293255341143e-06, + "loss": 0.5332, + "step": 14036 + }, + { + "epoch": 3.7264038231780168, + "grad_norm": 0.45555362619218864, + "learning_rate": 1.583104517199977e-06, + "loss": 0.5523, + "step": 14037 + }, + { + "epoch": 3.7266693216514004, + "grad_norm": 0.46394069286110046, + "learning_rate": 1.5827797267475458e-06, + "loss": 0.5507, + "step": 14038 + }, + { + "epoch": 3.726934820124784, + "grad_norm": 0.46670052645269156, + "learning_rate": 1.582454954183156e-06, + "loss": 0.5386, + "step": 14039 + }, + { + "epoch": 3.727200318598168, + "grad_norm": 0.4602634805367968, + "learning_rate": 1.582130199513141e-06, + "loss": 0.4983, + "step": 14040 + }, + { + "epoch": 3.727465817071552, + "grad_norm": 0.45986926124912514, + "learning_rate": 1.5818054627438344e-06, + "loss": 0.56, + "step": 14041 + }, + { + "epoch": 3.7277313155449354, + "grad_norm": 0.4807354143039927, + "learning_rate": 1.5814807438815704e-06, + "loss": 0.5327, + "step": 14042 + }, + { + "epoch": 3.7279968140183195, + "grad_norm": 0.44852251511983077, + "learning_rate": 1.58115604293268e-06, + "loss": 0.545, + "step": 14043 + }, + { + "epoch": 3.728262312491703, + "grad_norm": 0.45572005634133306, + "learning_rate": 1.5808313599034963e-06, + "loss": 0.5121, + "step": 14044 + }, + { + "epoch": 3.728527810965087, + "grad_norm": 0.4561595856380745, + "learning_rate": 1.5805066948003518e-06, + "loss": 0.5592, + "step": 14045 + }, + { + "epoch": 3.728793309438471, + "grad_norm": 0.46463527285217193, + "learning_rate": 1.580182047629577e-06, + "loss": 0.5478, + "step": 14046 + }, + { + "epoch": 3.7290588079118545, + "grad_norm": 0.46442425642510066, + "learning_rate": 1.5798574183975052e-06, + "loss": 0.5474, + "step": 14047 + }, + { + "epoch": 3.729324306385238, + "grad_norm": 0.458550221409625, + "learning_rate": 1.5795328071104677e-06, + "loss": 0.5377, + "step": 14048 + }, + { + "epoch": 3.7295898048586222, + "grad_norm": 0.47089280908586767, + "learning_rate": 1.5792082137747932e-06, + "loss": 0.5466, + "step": 14049 + }, + { + "epoch": 3.729855303332006, + "grad_norm": 0.4665835127020925, + "learning_rate": 1.578883638396813e-06, + "loss": 0.5381, + "step": 14050 + }, + { + "epoch": 3.7301208018053895, + "grad_norm": 0.47175018564770754, + "learning_rate": 1.5785590809828572e-06, + "loss": 0.5707, + "step": 14051 + }, + { + "epoch": 3.7303863002787736, + "grad_norm": 0.48065741563771125, + "learning_rate": 1.5782345415392553e-06, + "loss": 0.6159, + "step": 14052 + }, + { + "epoch": 3.7306517987521572, + "grad_norm": 0.4671378933997786, + "learning_rate": 1.5779100200723365e-06, + "loss": 0.5237, + "step": 14053 + }, + { + "epoch": 3.730917297225541, + "grad_norm": 0.46555770384453865, + "learning_rate": 1.5775855165884307e-06, + "loss": 0.5615, + "step": 14054 + }, + { + "epoch": 3.731182795698925, + "grad_norm": 0.4705426754631287, + "learning_rate": 1.5772610310938651e-06, + "loss": 0.5757, + "step": 14055 + }, + { + "epoch": 3.7314482941723086, + "grad_norm": 0.48111052999150883, + "learning_rate": 1.5769365635949691e-06, + "loss": 0.5716, + "step": 14056 + }, + { + "epoch": 3.7317137926456923, + "grad_norm": 0.4783846596911704, + "learning_rate": 1.5766121140980697e-06, + "loss": 0.5615, + "step": 14057 + }, + { + "epoch": 3.7319792911190763, + "grad_norm": 0.45921066186769616, + "learning_rate": 1.5762876826094953e-06, + "loss": 0.5436, + "step": 14058 + }, + { + "epoch": 3.73224478959246, + "grad_norm": 0.4707470663063092, + "learning_rate": 1.5759632691355726e-06, + "loss": 0.5522, + "step": 14059 + }, + { + "epoch": 3.7325102880658436, + "grad_norm": 0.46844028418797196, + "learning_rate": 1.5756388736826294e-06, + "loss": 0.5465, + "step": 14060 + }, + { + "epoch": 3.7327757865392273, + "grad_norm": 0.4761099712401972, + "learning_rate": 1.5753144962569905e-06, + "loss": 0.5535, + "step": 14061 + }, + { + "epoch": 3.7330412850126113, + "grad_norm": 0.46316910287909924, + "learning_rate": 1.5749901368649828e-06, + "loss": 0.5265, + "step": 14062 + }, + { + "epoch": 3.733306783485995, + "grad_norm": 0.4668734526674542, + "learning_rate": 1.5746657955129325e-06, + "loss": 0.5283, + "step": 14063 + }, + { + "epoch": 3.7335722819593786, + "grad_norm": 0.4556384463968969, + "learning_rate": 1.5743414722071646e-06, + "loss": 0.5342, + "step": 14064 + }, + { + "epoch": 3.7338377804327623, + "grad_norm": 0.4632358335100543, + "learning_rate": 1.574017166954005e-06, + "loss": 0.5638, + "step": 14065 + }, + { + "epoch": 3.7341032789061464, + "grad_norm": 0.4545067266750769, + "learning_rate": 1.5736928797597773e-06, + "loss": 0.575, + "step": 14066 + }, + { + "epoch": 3.73436877737953, + "grad_norm": 0.46235884499032825, + "learning_rate": 1.5733686106308063e-06, + "loss": 0.4791, + "step": 14067 + }, + { + "epoch": 3.7346342758529136, + "grad_norm": 0.45747485624648193, + "learning_rate": 1.5730443595734162e-06, + "loss": 0.5536, + "step": 14068 + }, + { + "epoch": 3.7348997743262977, + "grad_norm": 0.4788272081449488, + "learning_rate": 1.572720126593931e-06, + "loss": 0.5459, + "step": 14069 + }, + { + "epoch": 3.7351652727996814, + "grad_norm": 0.4606531577542309, + "learning_rate": 1.5723959116986733e-06, + "loss": 0.5177, + "step": 14070 + }, + { + "epoch": 3.735430771273065, + "grad_norm": 0.44896799254195546, + "learning_rate": 1.5720717148939673e-06, + "loss": 0.5459, + "step": 14071 + }, + { + "epoch": 3.735696269746449, + "grad_norm": 0.47194539036748584, + "learning_rate": 1.5717475361861342e-06, + "loss": 0.5309, + "step": 14072 + }, + { + "epoch": 3.7359617682198327, + "grad_norm": 0.4666282782255182, + "learning_rate": 1.5714233755814969e-06, + "loss": 0.5458, + "step": 14073 + }, + { + "epoch": 3.7362272666932164, + "grad_norm": 0.4514419452370618, + "learning_rate": 1.5710992330863772e-06, + "loss": 0.5094, + "step": 14074 + }, + { + "epoch": 3.7364927651666004, + "grad_norm": 0.47802713437696404, + "learning_rate": 1.5707751087070972e-06, + "loss": 0.5643, + "step": 14075 + }, + { + "epoch": 3.736758263639984, + "grad_norm": 0.4621559787055226, + "learning_rate": 1.5704510024499775e-06, + "loss": 0.5368, + "step": 14076 + }, + { + "epoch": 3.7370237621133677, + "grad_norm": 0.46975279722094376, + "learning_rate": 1.57012691432134e-06, + "loss": 0.5349, + "step": 14077 + }, + { + "epoch": 3.737289260586752, + "grad_norm": 0.462153972407686, + "learning_rate": 1.5698028443275035e-06, + "loss": 0.5185, + "step": 14078 + }, + { + "epoch": 3.7375547590601355, + "grad_norm": 0.45544824401053713, + "learning_rate": 1.569478792474789e-06, + "loss": 0.5101, + "step": 14079 + }, + { + "epoch": 3.737820257533519, + "grad_norm": 0.4614501022541727, + "learning_rate": 1.5691547587695167e-06, + "loss": 0.5149, + "step": 14080 + }, + { + "epoch": 3.738085756006903, + "grad_norm": 0.43899634509229857, + "learning_rate": 1.568830743218005e-06, + "loss": 0.5423, + "step": 14081 + }, + { + "epoch": 3.738351254480287, + "grad_norm": 0.466974910594938, + "learning_rate": 1.5685067458265742e-06, + "loss": 0.5586, + "step": 14082 + }, + { + "epoch": 3.7386167529536705, + "grad_norm": 0.48080311004045906, + "learning_rate": 1.5681827666015431e-06, + "loss": 0.5323, + "step": 14083 + }, + { + "epoch": 3.7388822514270545, + "grad_norm": 0.4610910508519491, + "learning_rate": 1.5678588055492289e-06, + "loss": 0.5819, + "step": 14084 + }, + { + "epoch": 3.739147749900438, + "grad_norm": 0.46625495393892213, + "learning_rate": 1.5675348626759501e-06, + "loss": 0.5376, + "step": 14085 + }, + { + "epoch": 3.739413248373822, + "grad_norm": 0.4692032275781226, + "learning_rate": 1.567210937988025e-06, + "loss": 0.5418, + "step": 14086 + }, + { + "epoch": 3.7396787468472055, + "grad_norm": 0.4626499037264671, + "learning_rate": 1.56688703149177e-06, + "loss": 0.4998, + "step": 14087 + }, + { + "epoch": 3.7399442453205896, + "grad_norm": 0.473183185442196, + "learning_rate": 1.5665631431935036e-06, + "loss": 0.5507, + "step": 14088 + }, + { + "epoch": 3.740209743793973, + "grad_norm": 0.472429269896874, + "learning_rate": 1.5662392730995408e-06, + "loss": 0.4906, + "step": 14089 + }, + { + "epoch": 3.740475242267357, + "grad_norm": 0.46444205147050116, + "learning_rate": 1.565915421216198e-06, + "loss": 0.5397, + "step": 14090 + }, + { + "epoch": 3.7407407407407405, + "grad_norm": 0.4724857395032314, + "learning_rate": 1.5655915875497915e-06, + "loss": 0.5586, + "step": 14091 + }, + { + "epoch": 3.7410062392141246, + "grad_norm": 0.4594644289338305, + "learning_rate": 1.5652677721066364e-06, + "loss": 0.5826, + "step": 14092 + }, + { + "epoch": 3.741271737687508, + "grad_norm": 0.4565426786648952, + "learning_rate": 1.564943974893049e-06, + "loss": 0.5655, + "step": 14093 + }, + { + "epoch": 3.741537236160892, + "grad_norm": 0.4613102449586906, + "learning_rate": 1.5646201959153446e-06, + "loss": 0.5567, + "step": 14094 + }, + { + "epoch": 3.741802734634276, + "grad_norm": 0.4656469888339646, + "learning_rate": 1.5642964351798353e-06, + "loss": 0.5233, + "step": 14095 + }, + { + "epoch": 3.7420682331076596, + "grad_norm": 0.4605834463327008, + "learning_rate": 1.5639726926928368e-06, + "loss": 0.5554, + "step": 14096 + }, + { + "epoch": 3.742333731581043, + "grad_norm": 0.46272121324459853, + "learning_rate": 1.5636489684606624e-06, + "loss": 0.5233, + "step": 14097 + }, + { + "epoch": 3.7425992300544273, + "grad_norm": 0.47381366440702893, + "learning_rate": 1.5633252624896256e-06, + "loss": 0.559, + "step": 14098 + }, + { + "epoch": 3.742864728527811, + "grad_norm": 0.49302717878618785, + "learning_rate": 1.5630015747860397e-06, + "loss": 0.5547, + "step": 14099 + }, + { + "epoch": 3.7431302270011946, + "grad_norm": 0.4692119237585427, + "learning_rate": 1.5626779053562177e-06, + "loss": 0.5499, + "step": 14100 + }, + { + "epoch": 3.7433957254745787, + "grad_norm": 0.4635284235067521, + "learning_rate": 1.5623542542064712e-06, + "loss": 0.5576, + "step": 14101 + }, + { + "epoch": 3.7436612239479623, + "grad_norm": 0.47322606476546286, + "learning_rate": 1.5620306213431121e-06, + "loss": 0.5679, + "step": 14102 + }, + { + "epoch": 3.743926722421346, + "grad_norm": 0.4593292619626133, + "learning_rate": 1.5617070067724528e-06, + "loss": 0.5554, + "step": 14103 + }, + { + "epoch": 3.74419222089473, + "grad_norm": 0.4570012072944531, + "learning_rate": 1.5613834105008042e-06, + "loss": 0.5316, + "step": 14104 + }, + { + "epoch": 3.7444577193681137, + "grad_norm": 0.4836065398740782, + "learning_rate": 1.5610598325344769e-06, + "loss": 0.54, + "step": 14105 + }, + { + "epoch": 3.7447232178414973, + "grad_norm": 0.45267301603514387, + "learning_rate": 1.560736272879783e-06, + "loss": 0.5344, + "step": 14106 + }, + { + "epoch": 3.7449887163148814, + "grad_norm": 0.46849438334900084, + "learning_rate": 1.5604127315430305e-06, + "loss": 0.5504, + "step": 14107 + }, + { + "epoch": 3.745254214788265, + "grad_norm": 0.4598448626115179, + "learning_rate": 1.5600892085305303e-06, + "loss": 0.5826, + "step": 14108 + }, + { + "epoch": 3.7455197132616487, + "grad_norm": 0.47265533526469505, + "learning_rate": 1.5597657038485916e-06, + "loss": 0.567, + "step": 14109 + }, + { + "epoch": 3.7457852117350328, + "grad_norm": 0.4606289040756109, + "learning_rate": 1.559442217503524e-06, + "loss": 0.5088, + "step": 14110 + }, + { + "epoch": 3.7460507102084164, + "grad_norm": 0.45830198382405063, + "learning_rate": 1.559118749501636e-06, + "loss": 0.5757, + "step": 14111 + }, + { + "epoch": 3.7463162086818, + "grad_norm": 0.47692293144030207, + "learning_rate": 1.5587952998492367e-06, + "loss": 0.5799, + "step": 14112 + }, + { + "epoch": 3.746581707155184, + "grad_norm": 0.46547077227637007, + "learning_rate": 1.5584718685526327e-06, + "loss": 0.5538, + "step": 14113 + }, + { + "epoch": 3.7468472056285678, + "grad_norm": 0.4755419270336829, + "learning_rate": 1.5581484556181325e-06, + "loss": 0.5356, + "step": 14114 + }, + { + "epoch": 3.7471127041019514, + "grad_norm": 0.47574451654945993, + "learning_rate": 1.5578250610520437e-06, + "loss": 0.5348, + "step": 14115 + }, + { + "epoch": 3.747378202575335, + "grad_norm": 0.46424667495492333, + "learning_rate": 1.557501684860673e-06, + "loss": 0.5316, + "step": 14116 + }, + { + "epoch": 3.747643701048719, + "grad_norm": 0.4711814640324193, + "learning_rate": 1.5571783270503277e-06, + "loss": 0.5273, + "step": 14117 + }, + { + "epoch": 3.7479091995221028, + "grad_norm": 0.44395035674952754, + "learning_rate": 1.5568549876273126e-06, + "loss": 0.5322, + "step": 14118 + }, + { + "epoch": 3.7481746979954864, + "grad_norm": 0.4692596407197698, + "learning_rate": 1.5565316665979347e-06, + "loss": 0.5172, + "step": 14119 + }, + { + "epoch": 3.74844019646887, + "grad_norm": 0.46053713594499845, + "learning_rate": 1.5562083639684994e-06, + "loss": 0.5385, + "step": 14120 + }, + { + "epoch": 3.748705694942254, + "grad_norm": 0.46262925446653674, + "learning_rate": 1.5558850797453117e-06, + "loss": 0.5616, + "step": 14121 + }, + { + "epoch": 3.748971193415638, + "grad_norm": 0.4699826910517659, + "learning_rate": 1.5555618139346766e-06, + "loss": 0.5239, + "step": 14122 + }, + { + "epoch": 3.7492366918890214, + "grad_norm": 0.4422348623783503, + "learning_rate": 1.5552385665428993e-06, + "loss": 0.5381, + "step": 14123 + }, + { + "epoch": 3.7495021903624055, + "grad_norm": 0.4753881555299518, + "learning_rate": 1.554915337576282e-06, + "loss": 0.5618, + "step": 14124 + }, + { + "epoch": 3.749767688835789, + "grad_norm": 0.4593523001586029, + "learning_rate": 1.5545921270411303e-06, + "loss": 0.5576, + "step": 14125 + }, + { + "epoch": 3.750033187309173, + "grad_norm": 0.4441459213475431, + "learning_rate": 1.5542689349437462e-06, + "loss": 0.5307, + "step": 14126 + }, + { + "epoch": 3.750298685782557, + "grad_norm": 0.4688868136805969, + "learning_rate": 1.5539457612904336e-06, + "loss": 0.5405, + "step": 14127 + }, + { + "epoch": 3.7505641842559405, + "grad_norm": 0.45672135263786906, + "learning_rate": 1.5536226060874948e-06, + "loss": 0.5368, + "step": 14128 + }, + { + "epoch": 3.750829682729324, + "grad_norm": 0.44788237743165515, + "learning_rate": 1.5532994693412335e-06, + "loss": 0.5419, + "step": 14129 + }, + { + "epoch": 3.7510951812027082, + "grad_norm": 0.48443930831327164, + "learning_rate": 1.5529763510579492e-06, + "loss": 0.5385, + "step": 14130 + }, + { + "epoch": 3.751360679676092, + "grad_norm": 0.4491891747531123, + "learning_rate": 1.5526532512439452e-06, + "loss": 0.5346, + "step": 14131 + }, + { + "epoch": 3.7516261781494755, + "grad_norm": 0.4722058708203047, + "learning_rate": 1.552330169905522e-06, + "loss": 0.5445, + "step": 14132 + }, + { + "epoch": 3.7518916766228596, + "grad_norm": 0.45254181154495215, + "learning_rate": 1.5520071070489811e-06, + "loss": 0.5603, + "step": 14133 + }, + { + "epoch": 3.7521571750962432, + "grad_norm": 0.4542584920689708, + "learning_rate": 1.5516840626806226e-06, + "loss": 0.5314, + "step": 14134 + }, + { + "epoch": 3.752422673569627, + "grad_norm": 0.4656317828637685, + "learning_rate": 1.5513610368067478e-06, + "loss": 0.5688, + "step": 14135 + }, + { + "epoch": 3.752688172043011, + "grad_norm": 0.47768548199775884, + "learning_rate": 1.551038029433654e-06, + "loss": 0.5624, + "step": 14136 + }, + { + "epoch": 3.7529536705163946, + "grad_norm": 0.4649648373231188, + "learning_rate": 1.5507150405676427e-06, + "loss": 0.519, + "step": 14137 + }, + { + "epoch": 3.7532191689897783, + "grad_norm": 0.46361827345434214, + "learning_rate": 1.5503920702150115e-06, + "loss": 0.5524, + "step": 14138 + }, + { + "epoch": 3.7534846674631623, + "grad_norm": 0.45863664716041846, + "learning_rate": 1.5500691183820608e-06, + "loss": 0.5646, + "step": 14139 + }, + { + "epoch": 3.753750165936546, + "grad_norm": 0.46648489603091786, + "learning_rate": 1.549746185075089e-06, + "loss": 0.5347, + "step": 14140 + }, + { + "epoch": 3.7540156644099296, + "grad_norm": 0.46623498511021055, + "learning_rate": 1.549423270300392e-06, + "loss": 0.5569, + "step": 14141 + }, + { + "epoch": 3.7542811628833133, + "grad_norm": 0.4563435477628405, + "learning_rate": 1.5491003740642688e-06, + "loss": 0.536, + "step": 14142 + }, + { + "epoch": 3.7545466613566973, + "grad_norm": 0.46742232530729466, + "learning_rate": 1.5487774963730166e-06, + "loss": 0.5531, + "step": 14143 + }, + { + "epoch": 3.754812159830081, + "grad_norm": 0.4687147083898061, + "learning_rate": 1.5484546372329317e-06, + "loss": 0.5503, + "step": 14144 + }, + { + "epoch": 3.7550776583034646, + "grad_norm": 0.4611813122899733, + "learning_rate": 1.5481317966503119e-06, + "loss": 0.5319, + "step": 14145 + }, + { + "epoch": 3.7553431567768483, + "grad_norm": 0.45901687651977074, + "learning_rate": 1.5478089746314529e-06, + "loss": 0.5676, + "step": 14146 + }, + { + "epoch": 3.7556086552502324, + "grad_norm": 0.47304115708617367, + "learning_rate": 1.5474861711826494e-06, + "loss": 0.5584, + "step": 14147 + }, + { + "epoch": 3.755874153723616, + "grad_norm": 0.4492820497942645, + "learning_rate": 1.5471633863101982e-06, + "loss": 0.5333, + "step": 14148 + }, + { + "epoch": 3.7561396521969996, + "grad_norm": 0.4716639682607868, + "learning_rate": 1.5468406200203933e-06, + "loss": 0.5565, + "step": 14149 + }, + { + "epoch": 3.7564051506703837, + "grad_norm": 0.4585976654031539, + "learning_rate": 1.5465178723195306e-06, + "loss": 0.5614, + "step": 14150 + }, + { + "epoch": 3.7566706491437674, + "grad_norm": 0.46439300898643665, + "learning_rate": 1.5461951432139037e-06, + "loss": 0.5398, + "step": 14151 + }, + { + "epoch": 3.756936147617151, + "grad_norm": 0.4691646075945546, + "learning_rate": 1.545872432709808e-06, + "loss": 0.5467, + "step": 14152 + }, + { + "epoch": 3.757201646090535, + "grad_norm": 0.44993226764445804, + "learning_rate": 1.5455497408135348e-06, + "loss": 0.5089, + "step": 14153 + }, + { + "epoch": 3.7574671445639187, + "grad_norm": 0.46664183547218313, + "learning_rate": 1.5452270675313786e-06, + "loss": 0.5428, + "step": 14154 + }, + { + "epoch": 3.7577326430373024, + "grad_norm": 0.46316653873858293, + "learning_rate": 1.5449044128696321e-06, + "loss": 0.5479, + "step": 14155 + }, + { + "epoch": 3.7579981415106865, + "grad_norm": 0.4549140776418115, + "learning_rate": 1.5445817768345883e-06, + "loss": 0.5423, + "step": 14156 + }, + { + "epoch": 3.75826363998407, + "grad_norm": 0.47043199261157453, + "learning_rate": 1.544259159432539e-06, + "loss": 0.5401, + "step": 14157 + }, + { + "epoch": 3.7585291384574537, + "grad_norm": 0.4626591132457924, + "learning_rate": 1.5439365606697767e-06, + "loss": 0.5584, + "step": 14158 + }, + { + "epoch": 3.758794636930838, + "grad_norm": 0.47208030798589096, + "learning_rate": 1.5436139805525918e-06, + "loss": 0.5439, + "step": 14159 + }, + { + "epoch": 3.7590601354042215, + "grad_norm": 0.4614881069001578, + "learning_rate": 1.5432914190872757e-06, + "loss": 0.5792, + "step": 14160 + }, + { + "epoch": 3.759325633877605, + "grad_norm": 0.4641126748050402, + "learning_rate": 1.5429688762801198e-06, + "loss": 0.5078, + "step": 14161 + }, + { + "epoch": 3.759591132350989, + "grad_norm": 0.47313052671372646, + "learning_rate": 1.5426463521374136e-06, + "loss": 0.5379, + "step": 14162 + }, + { + "epoch": 3.759856630824373, + "grad_norm": 0.46400997890945683, + "learning_rate": 1.5423238466654483e-06, + "loss": 0.545, + "step": 14163 + }, + { + "epoch": 3.7601221292977565, + "grad_norm": 0.47074210383803505, + "learning_rate": 1.5420013598705122e-06, + "loss": 0.5482, + "step": 14164 + }, + { + "epoch": 3.7603876277711406, + "grad_norm": 0.4649314673898206, + "learning_rate": 1.5416788917588951e-06, + "loss": 0.5504, + "step": 14165 + }, + { + "epoch": 3.760653126244524, + "grad_norm": 0.4711338780606955, + "learning_rate": 1.541356442336886e-06, + "loss": 0.5329, + "step": 14166 + }, + { + "epoch": 3.760918624717908, + "grad_norm": 0.45930638280344877, + "learning_rate": 1.5410340116107736e-06, + "loss": 0.5123, + "step": 14167 + }, + { + "epoch": 3.761184123191292, + "grad_norm": 0.4383401983935441, + "learning_rate": 1.5407115995868458e-06, + "loss": 0.5594, + "step": 14168 + }, + { + "epoch": 3.7614496216646756, + "grad_norm": 0.46732753969976504, + "learning_rate": 1.5403892062713916e-06, + "loss": 0.5638, + "step": 14169 + }, + { + "epoch": 3.761715120138059, + "grad_norm": 0.4683283767847534, + "learning_rate": 1.5400668316706963e-06, + "loss": 0.5176, + "step": 14170 + }, + { + "epoch": 3.761980618611443, + "grad_norm": 0.47326179173337873, + "learning_rate": 1.5397444757910482e-06, + "loss": 0.518, + "step": 14171 + }, + { + "epoch": 3.762246117084827, + "grad_norm": 0.46933222046778494, + "learning_rate": 1.5394221386387339e-06, + "loss": 0.5051, + "step": 14172 + }, + { + "epoch": 3.7625116155582106, + "grad_norm": 0.5041568566719479, + "learning_rate": 1.5390998202200397e-06, + "loss": 0.5654, + "step": 14173 + }, + { + "epoch": 3.762777114031594, + "grad_norm": 0.455140207780505, + "learning_rate": 1.538777520541252e-06, + "loss": 0.5405, + "step": 14174 + }, + { + "epoch": 3.763042612504978, + "grad_norm": 0.4606004466048283, + "learning_rate": 1.5384552396086564e-06, + "loss": 0.5607, + "step": 14175 + }, + { + "epoch": 3.763308110978362, + "grad_norm": 0.4574048347688159, + "learning_rate": 1.5381329774285375e-06, + "loss": 0.5365, + "step": 14176 + }, + { + "epoch": 3.7635736094517456, + "grad_norm": 0.46712493614989464, + "learning_rate": 1.5378107340071804e-06, + "loss": 0.5343, + "step": 14177 + }, + { + "epoch": 3.763839107925129, + "grad_norm": 0.47263627279934206, + "learning_rate": 1.53748850935087e-06, + "loss": 0.5325, + "step": 14178 + }, + { + "epoch": 3.7641046063985133, + "grad_norm": 0.46263170172867796, + "learning_rate": 1.53716630346589e-06, + "loss": 0.5259, + "step": 14179 + }, + { + "epoch": 3.764370104871897, + "grad_norm": 0.46947621929996913, + "learning_rate": 1.5368441163585251e-06, + "loss": 0.5297, + "step": 14180 + }, + { + "epoch": 3.7646356033452806, + "grad_norm": 0.4754590092064743, + "learning_rate": 1.5365219480350587e-06, + "loss": 0.5163, + "step": 14181 + }, + { + "epoch": 3.7649011018186647, + "grad_norm": 0.45825797284702485, + "learning_rate": 1.5361997985017723e-06, + "loss": 0.5415, + "step": 14182 + }, + { + "epoch": 3.7651666002920483, + "grad_norm": 0.476110609699576, + "learning_rate": 1.5358776677649495e-06, + "loss": 0.5501, + "step": 14183 + }, + { + "epoch": 3.765432098765432, + "grad_norm": 0.4696895496839855, + "learning_rate": 1.5355555558308726e-06, + "loss": 0.5521, + "step": 14184 + }, + { + "epoch": 3.765697597238816, + "grad_norm": 0.47632564194477317, + "learning_rate": 1.5352334627058243e-06, + "loss": 0.5679, + "step": 14185 + }, + { + "epoch": 3.7659630957121997, + "grad_norm": 0.46612802089194644, + "learning_rate": 1.5349113883960865e-06, + "loss": 0.5735, + "step": 14186 + }, + { + "epoch": 3.7662285941855833, + "grad_norm": 0.4490705760323217, + "learning_rate": 1.5345893329079386e-06, + "loss": 0.537, + "step": 14187 + }, + { + "epoch": 3.7664940926589674, + "grad_norm": 0.47574549701559665, + "learning_rate": 1.5342672962476623e-06, + "loss": 0.5508, + "step": 14188 + }, + { + "epoch": 3.766759591132351, + "grad_norm": 0.45127674119658506, + "learning_rate": 1.5339452784215384e-06, + "loss": 0.5109, + "step": 14189 + }, + { + "epoch": 3.7670250896057347, + "grad_norm": 0.4739759112568161, + "learning_rate": 1.5336232794358468e-06, + "loss": 0.5072, + "step": 14190 + }, + { + "epoch": 3.7672905880791188, + "grad_norm": 0.471017294318739, + "learning_rate": 1.5333012992968672e-06, + "loss": 0.536, + "step": 14191 + }, + { + "epoch": 3.7675560865525024, + "grad_norm": 0.47965777557191386, + "learning_rate": 1.53297933801088e-06, + "loss": 0.5463, + "step": 14192 + }, + { + "epoch": 3.767821585025886, + "grad_norm": 0.4730400259524348, + "learning_rate": 1.5326573955841625e-06, + "loss": 0.5848, + "step": 14193 + }, + { + "epoch": 3.76808708349927, + "grad_norm": 0.47093039427389355, + "learning_rate": 1.5323354720229945e-06, + "loss": 0.5341, + "step": 14194 + }, + { + "epoch": 3.7683525819726538, + "grad_norm": 0.46697219926777084, + "learning_rate": 1.532013567333654e-06, + "loss": 0.5548, + "step": 14195 + }, + { + "epoch": 3.7686180804460374, + "grad_norm": 0.46259618436138317, + "learning_rate": 1.5316916815224189e-06, + "loss": 0.5255, + "step": 14196 + }, + { + "epoch": 3.768883578919421, + "grad_norm": 0.4512855912663067, + "learning_rate": 1.531369814595567e-06, + "loss": 0.5383, + "step": 14197 + }, + { + "epoch": 3.769149077392805, + "grad_norm": 0.479558634468816, + "learning_rate": 1.531047966559376e-06, + "loss": 0.5159, + "step": 14198 + }, + { + "epoch": 3.7694145758661888, + "grad_norm": 0.4549524141193485, + "learning_rate": 1.5307261374201216e-06, + "loss": 0.5831, + "step": 14199 + }, + { + "epoch": 3.7696800743395724, + "grad_norm": 0.4701022812441394, + "learning_rate": 1.53040432718408e-06, + "loss": 0.5454, + "step": 14200 + }, + { + "epoch": 3.769945572812956, + "grad_norm": 0.46919677287507405, + "learning_rate": 1.5300825358575286e-06, + "loss": 0.519, + "step": 14201 + }, + { + "epoch": 3.77021107128634, + "grad_norm": 0.45721756471685293, + "learning_rate": 1.5297607634467424e-06, + "loss": 0.5183, + "step": 14202 + }, + { + "epoch": 3.770476569759724, + "grad_norm": 0.45926350050062215, + "learning_rate": 1.5294390099579967e-06, + "loss": 0.5485, + "step": 14203 + }, + { + "epoch": 3.7707420682331074, + "grad_norm": 0.4678462504936208, + "learning_rate": 1.5291172753975672e-06, + "loss": 0.4939, + "step": 14204 + }, + { + "epoch": 3.7710075667064915, + "grad_norm": 0.448983046629253, + "learning_rate": 1.5287955597717274e-06, + "loss": 0.5188, + "step": 14205 + }, + { + "epoch": 3.771273065179875, + "grad_norm": 0.4659836263418916, + "learning_rate": 1.5284738630867525e-06, + "loss": 0.5434, + "step": 14206 + }, + { + "epoch": 3.771538563653259, + "grad_norm": 0.4765383989395894, + "learning_rate": 1.5281521853489154e-06, + "loss": 0.5726, + "step": 14207 + }, + { + "epoch": 3.771804062126643, + "grad_norm": 0.4544630757379573, + "learning_rate": 1.5278305265644905e-06, + "loss": 0.5239, + "step": 14208 + }, + { + "epoch": 3.7720695606000265, + "grad_norm": 0.47659364852386443, + "learning_rate": 1.5275088867397508e-06, + "loss": 0.5611, + "step": 14209 + }, + { + "epoch": 3.77233505907341, + "grad_norm": 0.4650810045165806, + "learning_rate": 1.527187265880969e-06, + "loss": 0.5862, + "step": 14210 + }, + { + "epoch": 3.7726005575467942, + "grad_norm": 0.46150674179319245, + "learning_rate": 1.526865663994417e-06, + "loss": 0.5277, + "step": 14211 + }, + { + "epoch": 3.772866056020178, + "grad_norm": 0.478079467895871, + "learning_rate": 1.526544081086367e-06, + "loss": 0.5698, + "step": 14212 + }, + { + "epoch": 3.7731315544935615, + "grad_norm": 0.45511240665043146, + "learning_rate": 1.5262225171630913e-06, + "loss": 0.5101, + "step": 14213 + }, + { + "epoch": 3.7733970529669456, + "grad_norm": 0.46177868951407436, + "learning_rate": 1.5259009722308605e-06, + "loss": 0.5638, + "step": 14214 + }, + { + "epoch": 3.7736625514403292, + "grad_norm": 0.46710375414889743, + "learning_rate": 1.5255794462959466e-06, + "loss": 0.5647, + "step": 14215 + }, + { + "epoch": 3.773928049913713, + "grad_norm": 0.47510377273736837, + "learning_rate": 1.5252579393646183e-06, + "loss": 0.5521, + "step": 14216 + }, + { + "epoch": 3.774193548387097, + "grad_norm": 0.4732149253766579, + "learning_rate": 1.5249364514431469e-06, + "loss": 0.5121, + "step": 14217 + }, + { + "epoch": 3.7744590468604806, + "grad_norm": 0.44246816831853997, + "learning_rate": 1.524614982537802e-06, + "loss": 0.5275, + "step": 14218 + }, + { + "epoch": 3.7747245453338643, + "grad_norm": 0.4864985393189019, + "learning_rate": 1.524293532654853e-06, + "loss": 0.5385, + "step": 14219 + }, + { + "epoch": 3.7749900438072483, + "grad_norm": 0.45719647036488437, + "learning_rate": 1.5239721018005692e-06, + "loss": 0.5657, + "step": 14220 + }, + { + "epoch": 3.775255542280632, + "grad_norm": 0.46570500618607347, + "learning_rate": 1.5236506899812198e-06, + "loss": 0.5028, + "step": 14221 + }, + { + "epoch": 3.7755210407540156, + "grad_norm": 0.4785090858172073, + "learning_rate": 1.5233292972030717e-06, + "loss": 0.6005, + "step": 14222 + }, + { + "epoch": 3.7757865392273997, + "grad_norm": 0.48188933767877945, + "learning_rate": 1.5230079234723938e-06, + "loss": 0.5435, + "step": 14223 + }, + { + "epoch": 3.7760520377007833, + "grad_norm": 0.48281571172493926, + "learning_rate": 1.5226865687954533e-06, + "loss": 0.5475, + "step": 14224 + }, + { + "epoch": 3.776317536174167, + "grad_norm": 0.4565732545235267, + "learning_rate": 1.5223652331785177e-06, + "loss": 0.5531, + "step": 14225 + }, + { + "epoch": 3.7765830346475506, + "grad_norm": 0.45469069433779674, + "learning_rate": 1.522043916627854e-06, + "loss": 0.5635, + "step": 14226 + }, + { + "epoch": 3.7768485331209347, + "grad_norm": 0.4673266070009971, + "learning_rate": 1.521722619149729e-06, + "loss": 0.554, + "step": 14227 + }, + { + "epoch": 3.7771140315943184, + "grad_norm": 0.4659146112129088, + "learning_rate": 1.521401340750407e-06, + "loss": 0.5406, + "step": 14228 + }, + { + "epoch": 3.777379530067702, + "grad_norm": 0.47131386373090756, + "learning_rate": 1.5210800814361548e-06, + "loss": 0.5622, + "step": 14229 + }, + { + "epoch": 3.7776450285410856, + "grad_norm": 0.4698670457443824, + "learning_rate": 1.520758841213238e-06, + "loss": 0.5275, + "step": 14230 + }, + { + "epoch": 3.7779105270144697, + "grad_norm": 0.461861436313446, + "learning_rate": 1.5204376200879214e-06, + "loss": 0.5373, + "step": 14231 + }, + { + "epoch": 3.7781760254878534, + "grad_norm": 0.4809959313284668, + "learning_rate": 1.52011641806647e-06, + "loss": 0.5679, + "step": 14232 + }, + { + "epoch": 3.778441523961237, + "grad_norm": 0.4645756254962713, + "learning_rate": 1.519795235155148e-06, + "loss": 0.5285, + "step": 14233 + }, + { + "epoch": 3.778707022434621, + "grad_norm": 0.4595264123217648, + "learning_rate": 1.5194740713602184e-06, + "loss": 0.5598, + "step": 14234 + }, + { + "epoch": 3.7789725209080047, + "grad_norm": 0.47212455714652074, + "learning_rate": 1.5191529266879448e-06, + "loss": 0.5596, + "step": 14235 + }, + { + "epoch": 3.7792380193813884, + "grad_norm": 0.48195499968682903, + "learning_rate": 1.5188318011445907e-06, + "loss": 0.5185, + "step": 14236 + }, + { + "epoch": 3.7795035178547725, + "grad_norm": 0.4594241349240103, + "learning_rate": 1.5185106947364191e-06, + "loss": 0.573, + "step": 14237 + }, + { + "epoch": 3.779769016328156, + "grad_norm": 0.4922396420602789, + "learning_rate": 1.5181896074696923e-06, + "loss": 0.5693, + "step": 14238 + }, + { + "epoch": 3.7800345148015397, + "grad_norm": 0.4773497931957487, + "learning_rate": 1.5178685393506714e-06, + "loss": 0.5439, + "step": 14239 + }, + { + "epoch": 3.780300013274924, + "grad_norm": 0.46411952210108837, + "learning_rate": 1.5175474903856191e-06, + "loss": 0.5416, + "step": 14240 + }, + { + "epoch": 3.7805655117483075, + "grad_norm": 0.46802209086818164, + "learning_rate": 1.5172264605807958e-06, + "loss": 0.5345, + "step": 14241 + }, + { + "epoch": 3.780831010221691, + "grad_norm": 0.4620134979726957, + "learning_rate": 1.5169054499424629e-06, + "loss": 0.5326, + "step": 14242 + }, + { + "epoch": 3.781096508695075, + "grad_norm": 0.45728762087002545, + "learning_rate": 1.5165844584768808e-06, + "loss": 0.5618, + "step": 14243 + }, + { + "epoch": 3.781362007168459, + "grad_norm": 0.4631897804995381, + "learning_rate": 1.5162634861903106e-06, + "loss": 0.5408, + "step": 14244 + }, + { + "epoch": 3.7816275056418425, + "grad_norm": 0.45919151059566493, + "learning_rate": 1.5159425330890098e-06, + "loss": 0.5417, + "step": 14245 + }, + { + "epoch": 3.7818930041152266, + "grad_norm": 0.4502948640783779, + "learning_rate": 1.5156215991792394e-06, + "loss": 0.5272, + "step": 14246 + }, + { + "epoch": 3.78215850258861, + "grad_norm": 0.4614817440882876, + "learning_rate": 1.5153006844672573e-06, + "loss": 0.5382, + "step": 14247 + }, + { + "epoch": 3.782424001061994, + "grad_norm": 0.4734694506670941, + "learning_rate": 1.5149797889593237e-06, + "loss": 0.5528, + "step": 14248 + }, + { + "epoch": 3.782689499535378, + "grad_norm": 0.4774470773048459, + "learning_rate": 1.5146589126616953e-06, + "loss": 0.5631, + "step": 14249 + }, + { + "epoch": 3.7829549980087616, + "grad_norm": 0.46553325165739917, + "learning_rate": 1.5143380555806315e-06, + "loss": 0.5799, + "step": 14250 + }, + { + "epoch": 3.783220496482145, + "grad_norm": 0.4704822328945651, + "learning_rate": 1.5140172177223884e-06, + "loss": 0.5686, + "step": 14251 + }, + { + "epoch": 3.783485994955529, + "grad_norm": 0.48043657987176347, + "learning_rate": 1.5136963990932236e-06, + "loss": 0.5594, + "step": 14252 + }, + { + "epoch": 3.783751493428913, + "grad_norm": 0.46528168845864426, + "learning_rate": 1.5133755996993937e-06, + "loss": 0.5512, + "step": 14253 + }, + { + "epoch": 3.7840169919022966, + "grad_norm": 0.4502793248338295, + "learning_rate": 1.5130548195471554e-06, + "loss": 0.5337, + "step": 14254 + }, + { + "epoch": 3.78428249037568, + "grad_norm": 0.4692016784601357, + "learning_rate": 1.5127340586427646e-06, + "loss": 0.5611, + "step": 14255 + }, + { + "epoch": 3.784547988849064, + "grad_norm": 0.4559760124667694, + "learning_rate": 1.5124133169924773e-06, + "loss": 0.5305, + "step": 14256 + }, + { + "epoch": 3.784813487322448, + "grad_norm": 0.4505917176982701, + "learning_rate": 1.5120925946025478e-06, + "loss": 0.5618, + "step": 14257 + }, + { + "epoch": 3.7850789857958316, + "grad_norm": 0.47546895568641556, + "learning_rate": 1.5117718914792318e-06, + "loss": 0.5382, + "step": 14258 + }, + { + "epoch": 3.785344484269215, + "grad_norm": 0.4639670482399162, + "learning_rate": 1.5114512076287835e-06, + "loss": 0.525, + "step": 14259 + }, + { + "epoch": 3.7856099827425993, + "grad_norm": 0.4726281398809493, + "learning_rate": 1.511130543057457e-06, + "loss": 0.5523, + "step": 14260 + }, + { + "epoch": 3.785875481215983, + "grad_norm": 0.4556047756476211, + "learning_rate": 1.5108098977715068e-06, + "loss": 0.5294, + "step": 14261 + }, + { + "epoch": 3.7861409796893666, + "grad_norm": 0.4751286302268576, + "learning_rate": 1.5104892717771846e-06, + "loss": 0.53, + "step": 14262 + }, + { + "epoch": 3.7864064781627507, + "grad_norm": 0.45137645035804386, + "learning_rate": 1.5101686650807446e-06, + "loss": 0.5219, + "step": 14263 + }, + { + "epoch": 3.7866719766361343, + "grad_norm": 0.4679210685989948, + "learning_rate": 1.5098480776884389e-06, + "loss": 0.568, + "step": 14264 + }, + { + "epoch": 3.786937475109518, + "grad_norm": 0.47379446436026784, + "learning_rate": 1.5095275096065203e-06, + "loss": 0.5511, + "step": 14265 + }, + { + "epoch": 3.787202973582902, + "grad_norm": 0.4766575379702056, + "learning_rate": 1.5092069608412402e-06, + "loss": 0.5401, + "step": 14266 + }, + { + "epoch": 3.7874684720562857, + "grad_norm": 0.45344540195607325, + "learning_rate": 1.5088864313988507e-06, + "loss": 0.5734, + "step": 14267 + }, + { + "epoch": 3.7877339705296693, + "grad_norm": 0.47757820917918764, + "learning_rate": 1.508565921285602e-06, + "loss": 0.5169, + "step": 14268 + }, + { + "epoch": 3.7879994690030534, + "grad_norm": 0.4508937528219623, + "learning_rate": 1.5082454305077455e-06, + "loss": 0.5458, + "step": 14269 + }, + { + "epoch": 3.788264967476437, + "grad_norm": 0.46597878656759767, + "learning_rate": 1.5079249590715309e-06, + "loss": 0.5492, + "step": 14270 + }, + { + "epoch": 3.7885304659498207, + "grad_norm": 0.47184756971392877, + "learning_rate": 1.5076045069832088e-06, + "loss": 0.5662, + "step": 14271 + }, + { + "epoch": 3.7887959644232048, + "grad_norm": 0.46757912090755344, + "learning_rate": 1.5072840742490285e-06, + "loss": 0.5609, + "step": 14272 + }, + { + "epoch": 3.7890614628965884, + "grad_norm": 0.4593729915670754, + "learning_rate": 1.5069636608752409e-06, + "loss": 0.5378, + "step": 14273 + }, + { + "epoch": 3.789326961369972, + "grad_norm": 0.4629796176203357, + "learning_rate": 1.5066432668680916e-06, + "loss": 0.5123, + "step": 14274 + }, + { + "epoch": 3.789592459843356, + "grad_norm": 0.4496752974954881, + "learning_rate": 1.5063228922338306e-06, + "loss": 0.5452, + "step": 14275 + }, + { + "epoch": 3.7898579583167398, + "grad_norm": 0.4660184669343393, + "learning_rate": 1.5060025369787065e-06, + "loss": 0.5097, + "step": 14276 + }, + { + "epoch": 3.7901234567901234, + "grad_norm": 0.46449246547749035, + "learning_rate": 1.5056822011089667e-06, + "loss": 0.5428, + "step": 14277 + }, + { + "epoch": 3.7903889552635075, + "grad_norm": 0.44628606804148463, + "learning_rate": 1.5053618846308585e-06, + "loss": 0.5116, + "step": 14278 + }, + { + "epoch": 3.790654453736891, + "grad_norm": 0.4713317079471035, + "learning_rate": 1.5050415875506296e-06, + "loss": 0.5328, + "step": 14279 + }, + { + "epoch": 3.790919952210275, + "grad_norm": 0.47065117455100447, + "learning_rate": 1.5047213098745254e-06, + "loss": 0.5277, + "step": 14280 + }, + { + "epoch": 3.7911854506836584, + "grad_norm": 0.469574895305161, + "learning_rate": 1.504401051608792e-06, + "loss": 0.535, + "step": 14281 + }, + { + "epoch": 3.7914509491570425, + "grad_norm": 0.47241031745461826, + "learning_rate": 1.5040808127596763e-06, + "loss": 0.5509, + "step": 14282 + }, + { + "epoch": 3.791716447630426, + "grad_norm": 0.46563778512404824, + "learning_rate": 1.5037605933334227e-06, + "loss": 0.5386, + "step": 14283 + }, + { + "epoch": 3.79198194610381, + "grad_norm": 0.46433832434161454, + "learning_rate": 1.5034403933362773e-06, + "loss": 0.5645, + "step": 14284 + }, + { + "epoch": 3.7922474445771934, + "grad_norm": 0.469463197434822, + "learning_rate": 1.5031202127744836e-06, + "loss": 0.5554, + "step": 14285 + }, + { + "epoch": 3.7925129430505775, + "grad_norm": 0.4769221326208225, + "learning_rate": 1.5028000516542867e-06, + "loss": 0.5352, + "step": 14286 + }, + { + "epoch": 3.792778441523961, + "grad_norm": 0.4546887213542327, + "learning_rate": 1.5024799099819304e-06, + "loss": 0.5251, + "step": 14287 + }, + { + "epoch": 3.793043939997345, + "grad_norm": 0.46961964453831107, + "learning_rate": 1.5021597877636584e-06, + "loss": 0.5511, + "step": 14288 + }, + { + "epoch": 3.793309438470729, + "grad_norm": 0.46794875897737165, + "learning_rate": 1.5018396850057133e-06, + "loss": 0.4994, + "step": 14289 + }, + { + "epoch": 3.7935749369441125, + "grad_norm": 0.45679025757187486, + "learning_rate": 1.5015196017143397e-06, + "loss": 0.579, + "step": 14290 + }, + { + "epoch": 3.793840435417496, + "grad_norm": 0.4625444253415987, + "learning_rate": 1.5011995378957772e-06, + "loss": 0.5379, + "step": 14291 + }, + { + "epoch": 3.7941059338908802, + "grad_norm": 0.4613853189765263, + "learning_rate": 1.5008794935562693e-06, + "loss": 0.5735, + "step": 14292 + }, + { + "epoch": 3.794371432364264, + "grad_norm": 0.47584492283496405, + "learning_rate": 1.5005594687020576e-06, + "loss": 0.5369, + "step": 14293 + }, + { + "epoch": 3.7946369308376475, + "grad_norm": 0.47583246786974676, + "learning_rate": 1.500239463339383e-06, + "loss": 0.5634, + "step": 14294 + }, + { + "epoch": 3.7949024293110316, + "grad_norm": 0.46178991889172755, + "learning_rate": 1.4999194774744863e-06, + "loss": 0.5425, + "step": 14295 + }, + { + "epoch": 3.7951679277844153, + "grad_norm": 0.47049713903604845, + "learning_rate": 1.4995995111136102e-06, + "loss": 0.5781, + "step": 14296 + }, + { + "epoch": 3.795433426257799, + "grad_norm": 0.46479861695990177, + "learning_rate": 1.499279564262992e-06, + "loss": 0.5482, + "step": 14297 + }, + { + "epoch": 3.795698924731183, + "grad_norm": 0.46861104556202055, + "learning_rate": 1.4989596369288723e-06, + "loss": 0.5763, + "step": 14298 + }, + { + "epoch": 3.7959644232045666, + "grad_norm": 0.4655584797180522, + "learning_rate": 1.4986397291174906e-06, + "loss": 0.5314, + "step": 14299 + }, + { + "epoch": 3.7962299216779503, + "grad_norm": 0.46196263811205207, + "learning_rate": 1.4983198408350863e-06, + "loss": 0.5317, + "step": 14300 + }, + { + "epoch": 3.7964954201513343, + "grad_norm": 0.47418777778015764, + "learning_rate": 1.4979999720878975e-06, + "loss": 0.5712, + "step": 14301 + }, + { + "epoch": 3.796760918624718, + "grad_norm": 0.4710991704508042, + "learning_rate": 1.4976801228821625e-06, + "loss": 0.5443, + "step": 14302 + }, + { + "epoch": 3.7970264170981016, + "grad_norm": 0.4778669367536355, + "learning_rate": 1.4973602932241193e-06, + "loss": 0.5588, + "step": 14303 + }, + { + "epoch": 3.7972919155714857, + "grad_norm": 0.46518698161276734, + "learning_rate": 1.4970404831200053e-06, + "loss": 0.5568, + "step": 14304 + }, + { + "epoch": 3.7975574140448694, + "grad_norm": 0.4664777751508425, + "learning_rate": 1.4967206925760573e-06, + "loss": 0.5429, + "step": 14305 + }, + { + "epoch": 3.797822912518253, + "grad_norm": 0.4754504041047351, + "learning_rate": 1.4964009215985122e-06, + "loss": 0.529, + "step": 14306 + }, + { + "epoch": 3.7980884109916366, + "grad_norm": 0.4626791191768184, + "learning_rate": 1.4960811701936076e-06, + "loss": 0.5474, + "step": 14307 + }, + { + "epoch": 3.7983539094650207, + "grad_norm": 0.46972224717961636, + "learning_rate": 1.495761438367577e-06, + "loss": 0.5648, + "step": 14308 + }, + { + "epoch": 3.7986194079384044, + "grad_norm": 0.459739570489443, + "learning_rate": 1.4954417261266573e-06, + "loss": 0.5488, + "step": 14309 + }, + { + "epoch": 3.798884906411788, + "grad_norm": 0.45959208686586267, + "learning_rate": 1.4951220334770833e-06, + "loss": 0.5212, + "step": 14310 + }, + { + "epoch": 3.7991504048851716, + "grad_norm": 0.461988454472999, + "learning_rate": 1.4948023604250902e-06, + "loss": 0.5728, + "step": 14311 + }, + { + "epoch": 3.7994159033585557, + "grad_norm": 0.4872759434077837, + "learning_rate": 1.4944827069769125e-06, + "loss": 0.553, + "step": 14312 + }, + { + "epoch": 3.7996814018319394, + "grad_norm": 0.46677084580224454, + "learning_rate": 1.494163073138784e-06, + "loss": 0.5814, + "step": 14313 + }, + { + "epoch": 3.799946900305323, + "grad_norm": 0.47477082042944535, + "learning_rate": 1.4938434589169378e-06, + "loss": 0.5039, + "step": 14314 + }, + { + "epoch": 3.800212398778707, + "grad_norm": 0.4519686286655203, + "learning_rate": 1.4935238643176079e-06, + "loss": 0.5228, + "step": 14315 + }, + { + "epoch": 3.8004778972520907, + "grad_norm": 0.4650749605615722, + "learning_rate": 1.4932042893470266e-06, + "loss": 0.5507, + "step": 14316 + }, + { + "epoch": 3.8007433957254744, + "grad_norm": 0.4752879449841521, + "learning_rate": 1.4928847340114271e-06, + "loss": 0.5445, + "step": 14317 + }, + { + "epoch": 3.8010088941988585, + "grad_norm": 0.46376848105078305, + "learning_rate": 1.492565198317041e-06, + "loss": 0.5319, + "step": 14318 + }, + { + "epoch": 3.801274392672242, + "grad_norm": 0.4694493607772262, + "learning_rate": 1.492245682270101e-06, + "loss": 0.5114, + "step": 14319 + }, + { + "epoch": 3.8015398911456257, + "grad_norm": 0.4798675830000061, + "learning_rate": 1.4919261858768363e-06, + "loss": 0.5209, + "step": 14320 + }, + { + "epoch": 3.80180538961901, + "grad_norm": 0.4640277711755632, + "learning_rate": 1.4916067091434795e-06, + "loss": 0.5211, + "step": 14321 + }, + { + "epoch": 3.8020708880923935, + "grad_norm": 0.4529010554991586, + "learning_rate": 1.4912872520762609e-06, + "loss": 0.5402, + "step": 14322 + }, + { + "epoch": 3.802336386565777, + "grad_norm": 0.4575938855109411, + "learning_rate": 1.4909678146814104e-06, + "loss": 0.5277, + "step": 14323 + }, + { + "epoch": 3.802601885039161, + "grad_norm": 0.46197337962599033, + "learning_rate": 1.4906483969651587e-06, + "loss": 0.5639, + "step": 14324 + }, + { + "epoch": 3.802867383512545, + "grad_norm": 0.4823898392660827, + "learning_rate": 1.490328998933735e-06, + "loss": 0.5518, + "step": 14325 + }, + { + "epoch": 3.8031328819859285, + "grad_norm": 0.4675881464292851, + "learning_rate": 1.4900096205933672e-06, + "loss": 0.508, + "step": 14326 + }, + { + "epoch": 3.8033983804593126, + "grad_norm": 0.4625001234238671, + "learning_rate": 1.4896902619502846e-06, + "loss": 0.5519, + "step": 14327 + }, + { + "epoch": 3.803663878932696, + "grad_norm": 0.46875681257116325, + "learning_rate": 1.4893709230107156e-06, + "loss": 0.5654, + "step": 14328 + }, + { + "epoch": 3.80392937740608, + "grad_norm": 0.4633042663645761, + "learning_rate": 1.4890516037808877e-06, + "loss": 0.5288, + "step": 14329 + }, + { + "epoch": 3.804194875879464, + "grad_norm": 0.48152113101093835, + "learning_rate": 1.4887323042670298e-06, + "loss": 0.5548, + "step": 14330 + }, + { + "epoch": 3.8044603743528476, + "grad_norm": 0.4852801034845774, + "learning_rate": 1.488413024475367e-06, + "loss": 0.5584, + "step": 14331 + }, + { + "epoch": 3.804725872826231, + "grad_norm": 0.4689323902391257, + "learning_rate": 1.4880937644121273e-06, + "loss": 0.5496, + "step": 14332 + }, + { + "epoch": 3.8049913712996153, + "grad_norm": 0.4608974987803858, + "learning_rate": 1.4877745240835368e-06, + "loss": 0.561, + "step": 14333 + }, + { + "epoch": 3.805256869772999, + "grad_norm": 0.4780259210240867, + "learning_rate": 1.487455303495821e-06, + "loss": 0.5248, + "step": 14334 + }, + { + "epoch": 3.8055223682463826, + "grad_norm": 0.492456924304905, + "learning_rate": 1.4871361026552067e-06, + "loss": 0.5625, + "step": 14335 + }, + { + "epoch": 3.805787866719766, + "grad_norm": 0.4809124671828694, + "learning_rate": 1.4868169215679188e-06, + "loss": 0.571, + "step": 14336 + }, + { + "epoch": 3.8060533651931503, + "grad_norm": 0.4714769115685962, + "learning_rate": 1.4864977602401806e-06, + "loss": 0.5533, + "step": 14337 + }, + { + "epoch": 3.806318863666534, + "grad_norm": 0.45694315668196317, + "learning_rate": 1.4861786186782176e-06, + "loss": 0.5458, + "step": 14338 + }, + { + "epoch": 3.8065843621399176, + "grad_norm": 0.47207225486480636, + "learning_rate": 1.4858594968882539e-06, + "loss": 0.563, + "step": 14339 + }, + { + "epoch": 3.806849860613301, + "grad_norm": 0.467871781759021, + "learning_rate": 1.4855403948765129e-06, + "loss": 0.5473, + "step": 14340 + }, + { + "epoch": 3.8071153590866853, + "grad_norm": 0.4654050096973252, + "learning_rate": 1.4852213126492177e-06, + "loss": 0.5298, + "step": 14341 + }, + { + "epoch": 3.807380857560069, + "grad_norm": 0.4660229739535231, + "learning_rate": 1.484902250212593e-06, + "loss": 0.5391, + "step": 14342 + }, + { + "epoch": 3.8076463560334526, + "grad_norm": 0.4685184569408039, + "learning_rate": 1.4845832075728589e-06, + "loss": 0.5599, + "step": 14343 + }, + { + "epoch": 3.8079118545068367, + "grad_norm": 0.4748748085920868, + "learning_rate": 1.4842641847362383e-06, + "loss": 0.5434, + "step": 14344 + }, + { + "epoch": 3.8081773529802203, + "grad_norm": 0.4719656503493941, + "learning_rate": 1.4839451817089528e-06, + "loss": 0.4853, + "step": 14345 + }, + { + "epoch": 3.808442851453604, + "grad_norm": 0.456788903507843, + "learning_rate": 1.4836261984972245e-06, + "loss": 0.5652, + "step": 14346 + }, + { + "epoch": 3.808708349926988, + "grad_norm": 0.46988639329071397, + "learning_rate": 1.4833072351072733e-06, + "loss": 0.5671, + "step": 14347 + }, + { + "epoch": 3.8089738484003717, + "grad_norm": 0.4534394757121887, + "learning_rate": 1.482988291545321e-06, + "loss": 0.5445, + "step": 14348 + }, + { + "epoch": 3.8092393468737553, + "grad_norm": 0.47365051182009255, + "learning_rate": 1.4826693678175869e-06, + "loss": 0.587, + "step": 14349 + }, + { + "epoch": 3.8095048453471394, + "grad_norm": 0.47773049958899966, + "learning_rate": 1.4823504639302905e-06, + "loss": 0.5439, + "step": 14350 + }, + { + "epoch": 3.809770343820523, + "grad_norm": 0.4679571382142268, + "learning_rate": 1.482031579889652e-06, + "loss": 0.5477, + "step": 14351 + }, + { + "epoch": 3.8100358422939067, + "grad_norm": 0.46938896335987407, + "learning_rate": 1.48171271570189e-06, + "loss": 0.5648, + "step": 14352 + }, + { + "epoch": 3.8103013407672908, + "grad_norm": 0.4779634706566664, + "learning_rate": 1.4813938713732235e-06, + "loss": 0.5096, + "step": 14353 + }, + { + "epoch": 3.8105668392406744, + "grad_norm": 0.47918547125911526, + "learning_rate": 1.4810750469098711e-06, + "loss": 0.5557, + "step": 14354 + }, + { + "epoch": 3.810832337714058, + "grad_norm": 0.47045561143239895, + "learning_rate": 1.4807562423180494e-06, + "loss": 0.5271, + "step": 14355 + }, + { + "epoch": 3.811097836187442, + "grad_norm": 0.46673636899865867, + "learning_rate": 1.4804374576039763e-06, + "loss": 0.5103, + "step": 14356 + }, + { + "epoch": 3.8113633346608258, + "grad_norm": 0.4624595970867824, + "learning_rate": 1.4801186927738689e-06, + "loss": 0.5065, + "step": 14357 + }, + { + "epoch": 3.8116288331342094, + "grad_norm": 0.46663858806130387, + "learning_rate": 1.4797999478339443e-06, + "loss": 0.553, + "step": 14358 + }, + { + "epoch": 3.8118943316075935, + "grad_norm": 0.48630159158713454, + "learning_rate": 1.4794812227904192e-06, + "loss": 0.5297, + "step": 14359 + }, + { + "epoch": 3.812159830080977, + "grad_norm": 0.4835106063040936, + "learning_rate": 1.4791625176495083e-06, + "loss": 0.5481, + "step": 14360 + }, + { + "epoch": 3.812425328554361, + "grad_norm": 0.4753636835452331, + "learning_rate": 1.478843832417428e-06, + "loss": 0.5501, + "step": 14361 + }, + { + "epoch": 3.812690827027745, + "grad_norm": 0.4651875084848534, + "learning_rate": 1.4785251671003927e-06, + "loss": 0.5615, + "step": 14362 + }, + { + "epoch": 3.8129563255011285, + "grad_norm": 0.46656368627726663, + "learning_rate": 1.4782065217046178e-06, + "loss": 0.5746, + "step": 14363 + }, + { + "epoch": 3.813221823974512, + "grad_norm": 0.47365670200672455, + "learning_rate": 1.4778878962363177e-06, + "loss": 0.542, + "step": 14364 + }, + { + "epoch": 3.813487322447896, + "grad_norm": 0.45628129341498064, + "learning_rate": 1.477569290701707e-06, + "loss": 0.5219, + "step": 14365 + }, + { + "epoch": 3.8137528209212794, + "grad_norm": 0.4669487538842543, + "learning_rate": 1.4772507051069972e-06, + "loss": 0.5857, + "step": 14366 + }, + { + "epoch": 3.8140183193946635, + "grad_norm": 0.47181475862756933, + "learning_rate": 1.476932139458403e-06, + "loss": 0.5461, + "step": 14367 + }, + { + "epoch": 3.814283817868047, + "grad_norm": 0.4630794460422562, + "learning_rate": 1.4766135937621372e-06, + "loss": 0.5225, + "step": 14368 + }, + { + "epoch": 3.814549316341431, + "grad_norm": 0.46823294179306313, + "learning_rate": 1.476295068024412e-06, + "loss": 0.5619, + "step": 14369 + }, + { + "epoch": 3.814814814814815, + "grad_norm": 0.4605088479269549, + "learning_rate": 1.4759765622514394e-06, + "loss": 0.5369, + "step": 14370 + }, + { + "epoch": 3.8150803132881985, + "grad_norm": 0.4821744351837662, + "learning_rate": 1.4756580764494322e-06, + "loss": 0.5418, + "step": 14371 + }, + { + "epoch": 3.815345811761582, + "grad_norm": 0.46880476449760783, + "learning_rate": 1.4753396106245993e-06, + "loss": 0.5569, + "step": 14372 + }, + { + "epoch": 3.8156113102349662, + "grad_norm": 0.4702449836733929, + "learning_rate": 1.4750211647831529e-06, + "loss": 0.5601, + "step": 14373 + }, + { + "epoch": 3.81587680870835, + "grad_norm": 0.465968062237587, + "learning_rate": 1.4747027389313032e-06, + "loss": 0.5523, + "step": 14374 + }, + { + "epoch": 3.8161423071817335, + "grad_norm": 0.46233518110951977, + "learning_rate": 1.4743843330752606e-06, + "loss": 0.5477, + "step": 14375 + }, + { + "epoch": 3.8164078056551176, + "grad_norm": 0.45895099695681163, + "learning_rate": 1.4740659472212347e-06, + "loss": 0.5694, + "step": 14376 + }, + { + "epoch": 3.8166733041285013, + "grad_norm": 0.4638001212321143, + "learning_rate": 1.4737475813754352e-06, + "loss": 0.4955, + "step": 14377 + }, + { + "epoch": 3.816938802601885, + "grad_norm": 0.46641427785805883, + "learning_rate": 1.4734292355440698e-06, + "loss": 0.5025, + "step": 14378 + }, + { + "epoch": 3.817204301075269, + "grad_norm": 0.48551446748419047, + "learning_rate": 1.473110909733348e-06, + "loss": 0.5509, + "step": 14379 + }, + { + "epoch": 3.8174697995486526, + "grad_norm": 0.45394307393484445, + "learning_rate": 1.4727926039494778e-06, + "loss": 0.5308, + "step": 14380 + }, + { + "epoch": 3.8177352980220363, + "grad_norm": 0.46843074556273445, + "learning_rate": 1.4724743181986668e-06, + "loss": 0.5472, + "step": 14381 + }, + { + "epoch": 3.8180007964954203, + "grad_norm": 0.44974838616594165, + "learning_rate": 1.4721560524871237e-06, + "loss": 0.5258, + "step": 14382 + }, + { + "epoch": 3.818266294968804, + "grad_norm": 0.4735508573254619, + "learning_rate": 1.4718378068210525e-06, + "loss": 0.5566, + "step": 14383 + }, + { + "epoch": 3.8185317934421876, + "grad_norm": 0.4700249550404407, + "learning_rate": 1.4715195812066619e-06, + "loss": 0.5414, + "step": 14384 + }, + { + "epoch": 3.8187972919155717, + "grad_norm": 0.4730080691362576, + "learning_rate": 1.4712013756501575e-06, + "loss": 0.5231, + "step": 14385 + }, + { + "epoch": 3.8190627903889554, + "grad_norm": 0.47169148682180734, + "learning_rate": 1.4708831901577453e-06, + "loss": 0.5193, + "step": 14386 + }, + { + "epoch": 3.819328288862339, + "grad_norm": 0.4763434968252582, + "learning_rate": 1.47056502473563e-06, + "loss": 0.5632, + "step": 14387 + }, + { + "epoch": 3.819593787335723, + "grad_norm": 0.4668472113874937, + "learning_rate": 1.4702468793900187e-06, + "loss": 0.5171, + "step": 14388 + }, + { + "epoch": 3.8198592858091067, + "grad_norm": 0.4560642203070396, + "learning_rate": 1.469928754127114e-06, + "loss": 0.5373, + "step": 14389 + }, + { + "epoch": 3.8201247842824904, + "grad_norm": 0.4661632993884505, + "learning_rate": 1.46961064895312e-06, + "loss": 0.523, + "step": 14390 + }, + { + "epoch": 3.820390282755874, + "grad_norm": 0.4498891809085212, + "learning_rate": 1.4692925638742416e-06, + "loss": 0.5621, + "step": 14391 + }, + { + "epoch": 3.820655781229258, + "grad_norm": 0.45782544834191014, + "learning_rate": 1.4689744988966814e-06, + "loss": 0.54, + "step": 14392 + }, + { + "epoch": 3.8209212797026417, + "grad_norm": 0.46138547897797916, + "learning_rate": 1.4686564540266432e-06, + "loss": 0.562, + "step": 14393 + }, + { + "epoch": 3.8211867781760254, + "grad_norm": 0.46067454050847145, + "learning_rate": 1.4683384292703295e-06, + "loss": 0.5297, + "step": 14394 + }, + { + "epoch": 3.821452276649409, + "grad_norm": 0.4772049945599968, + "learning_rate": 1.4680204246339425e-06, + "loss": 0.5597, + "step": 14395 + }, + { + "epoch": 3.821717775122793, + "grad_norm": 0.47401754810777014, + "learning_rate": 1.4677024401236834e-06, + "loss": 0.6031, + "step": 14396 + }, + { + "epoch": 3.8219832735961767, + "grad_norm": 0.4691731186206942, + "learning_rate": 1.4673844757457543e-06, + "loss": 0.527, + "step": 14397 + }, + { + "epoch": 3.8222487720695604, + "grad_norm": 0.4725447736989617, + "learning_rate": 1.467066531506356e-06, + "loss": 0.5449, + "step": 14398 + }, + { + "epoch": 3.8225142705429445, + "grad_norm": 0.4461468753699611, + "learning_rate": 1.4667486074116899e-06, + "loss": 0.5432, + "step": 14399 + }, + { + "epoch": 3.822779769016328, + "grad_norm": 0.4680426203236417, + "learning_rate": 1.4664307034679564e-06, + "loss": 0.558, + "step": 14400 + }, + { + "epoch": 3.8230452674897117, + "grad_norm": 0.4699348262991877, + "learning_rate": 1.466112819681354e-06, + "loss": 0.5149, + "step": 14401 + }, + { + "epoch": 3.823310765963096, + "grad_norm": 0.4559604435265895, + "learning_rate": 1.4657949560580829e-06, + "loss": 0.5156, + "step": 14402 + }, + { + "epoch": 3.8235762644364795, + "grad_norm": 0.45339592668133893, + "learning_rate": 1.465477112604342e-06, + "loss": 0.5373, + "step": 14403 + }, + { + "epoch": 3.823841762909863, + "grad_norm": 0.4636450337425917, + "learning_rate": 1.4651592893263309e-06, + "loss": 0.5356, + "step": 14404 + }, + { + "epoch": 3.824107261383247, + "grad_norm": 0.46785954943298463, + "learning_rate": 1.4648414862302476e-06, + "loss": 0.5522, + "step": 14405 + }, + { + "epoch": 3.824372759856631, + "grad_norm": 0.4645724093299341, + "learning_rate": 1.4645237033222895e-06, + "loss": 0.5796, + "step": 14406 + }, + { + "epoch": 3.8246382583300145, + "grad_norm": 0.4670454227207539, + "learning_rate": 1.4642059406086544e-06, + "loss": 0.5609, + "step": 14407 + }, + { + "epoch": 3.8249037568033986, + "grad_norm": 0.4716641920710068, + "learning_rate": 1.4638881980955394e-06, + "loss": 0.5897, + "step": 14408 + }, + { + "epoch": 3.825169255276782, + "grad_norm": 0.479027020079382, + "learning_rate": 1.4635704757891416e-06, + "loss": 0.592, + "step": 14409 + }, + { + "epoch": 3.825434753750166, + "grad_norm": 0.47218402186689384, + "learning_rate": 1.463252773695657e-06, + "loss": 0.576, + "step": 14410 + }, + { + "epoch": 3.82570025222355, + "grad_norm": 0.4897964541648506, + "learning_rate": 1.462935091821282e-06, + "loss": 0.5628, + "step": 14411 + }, + { + "epoch": 3.8259657506969336, + "grad_norm": 0.47220511644334695, + "learning_rate": 1.4626174301722119e-06, + "loss": 0.539, + "step": 14412 + }, + { + "epoch": 3.826231249170317, + "grad_norm": 0.4679090359651499, + "learning_rate": 1.4622997887546414e-06, + "loss": 0.5251, + "step": 14413 + }, + { + "epoch": 3.8264967476437013, + "grad_norm": 0.46316218365467776, + "learning_rate": 1.4619821675747658e-06, + "loss": 0.5381, + "step": 14414 + }, + { + "epoch": 3.826762246117085, + "grad_norm": 0.4700058145615277, + "learning_rate": 1.4616645666387796e-06, + "loss": 0.5419, + "step": 14415 + }, + { + "epoch": 3.8270277445904686, + "grad_norm": 0.48648563306576964, + "learning_rate": 1.4613469859528767e-06, + "loss": 0.5528, + "step": 14416 + }, + { + "epoch": 3.8272932430638527, + "grad_norm": 0.47850112833147723, + "learning_rate": 1.4610294255232516e-06, + "loss": 0.5497, + "step": 14417 + }, + { + "epoch": 3.8275587415372363, + "grad_norm": 0.4580334723020406, + "learning_rate": 1.4607118853560953e-06, + "loss": 0.5616, + "step": 14418 + }, + { + "epoch": 3.82782424001062, + "grad_norm": 0.48575969235574473, + "learning_rate": 1.460394365457602e-06, + "loss": 0.5387, + "step": 14419 + }, + { + "epoch": 3.8280897384840036, + "grad_norm": 0.463895146831036, + "learning_rate": 1.460076865833964e-06, + "loss": 0.5431, + "step": 14420 + }, + { + "epoch": 3.828355236957387, + "grad_norm": 0.48271634012227266, + "learning_rate": 1.4597593864913734e-06, + "loss": 0.5337, + "step": 14421 + }, + { + "epoch": 3.8286207354307713, + "grad_norm": 0.4651240594002985, + "learning_rate": 1.4594419274360213e-06, + "loss": 0.5602, + "step": 14422 + }, + { + "epoch": 3.828886233904155, + "grad_norm": 0.4602898879836279, + "learning_rate": 1.4591244886741002e-06, + "loss": 0.5161, + "step": 14423 + }, + { + "epoch": 3.8291517323775386, + "grad_norm": 0.47538170474468056, + "learning_rate": 1.4588070702118e-06, + "loss": 0.5979, + "step": 14424 + }, + { + "epoch": 3.8294172308509227, + "grad_norm": 0.4642100688108346, + "learning_rate": 1.45848967205531e-06, + "loss": 0.5545, + "step": 14425 + }, + { + "epoch": 3.8296827293243063, + "grad_norm": 0.5203533782236369, + "learning_rate": 1.4581722942108228e-06, + "loss": 0.545, + "step": 14426 + }, + { + "epoch": 3.82994822779769, + "grad_norm": 0.46207216440036447, + "learning_rate": 1.4578549366845257e-06, + "loss": 0.4999, + "step": 14427 + }, + { + "epoch": 3.830213726271074, + "grad_norm": 0.45854272060469004, + "learning_rate": 1.4575375994826097e-06, + "loss": 0.5281, + "step": 14428 + }, + { + "epoch": 3.8304792247444577, + "grad_norm": 0.45925303277682006, + "learning_rate": 1.4572202826112631e-06, + "loss": 0.5696, + "step": 14429 + }, + { + "epoch": 3.8307447232178413, + "grad_norm": 0.4638681669388345, + "learning_rate": 1.4569029860766728e-06, + "loss": 0.5766, + "step": 14430 + }, + { + "epoch": 3.8310102216912254, + "grad_norm": 0.4749060810405108, + "learning_rate": 1.4565857098850295e-06, + "loss": 0.5734, + "step": 14431 + }, + { + "epoch": 3.831275720164609, + "grad_norm": 0.4651022907169953, + "learning_rate": 1.4562684540425188e-06, + "loss": 0.5415, + "step": 14432 + }, + { + "epoch": 3.8315412186379927, + "grad_norm": 0.46227844972563475, + "learning_rate": 1.4559512185553292e-06, + "loss": 0.5603, + "step": 14433 + }, + { + "epoch": 3.8318067171113768, + "grad_norm": 0.4608024626035467, + "learning_rate": 1.4556340034296474e-06, + "loss": 0.5522, + "step": 14434 + }, + { + "epoch": 3.8320722155847604, + "grad_norm": 0.47195819865298355, + "learning_rate": 1.4553168086716584e-06, + "loss": 0.5318, + "step": 14435 + }, + { + "epoch": 3.832337714058144, + "grad_norm": 0.46974536143754225, + "learning_rate": 1.4549996342875507e-06, + "loss": 0.5412, + "step": 14436 + }, + { + "epoch": 3.832603212531528, + "grad_norm": 0.4698426666976415, + "learning_rate": 1.4546824802835076e-06, + "loss": 0.5366, + "step": 14437 + }, + { + "epoch": 3.8328687110049118, + "grad_norm": 0.4689159660350755, + "learning_rate": 1.4543653466657167e-06, + "loss": 0.535, + "step": 14438 + }, + { + "epoch": 3.8331342094782954, + "grad_norm": 0.4695055979222109, + "learning_rate": 1.4540482334403605e-06, + "loss": 0.5208, + "step": 14439 + }, + { + "epoch": 3.8333997079516795, + "grad_norm": 0.4739325010572114, + "learning_rate": 1.4537311406136267e-06, + "loss": 0.5551, + "step": 14440 + }, + { + "epoch": 3.833665206425063, + "grad_norm": 0.44968479892602603, + "learning_rate": 1.4534140681916955e-06, + "loss": 0.5396, + "step": 14441 + }, + { + "epoch": 3.833930704898447, + "grad_norm": 0.460112160600991, + "learning_rate": 1.4530970161807533e-06, + "loss": 0.5643, + "step": 14442 + }, + { + "epoch": 3.834196203371831, + "grad_norm": 0.4780980853671893, + "learning_rate": 1.4527799845869817e-06, + "loss": 0.5784, + "step": 14443 + }, + { + "epoch": 3.8344617018452145, + "grad_norm": 0.4718784366523411, + "learning_rate": 1.4524629734165652e-06, + "loss": 0.5318, + "step": 14444 + }, + { + "epoch": 3.834727200318598, + "grad_norm": 0.46814092938972407, + "learning_rate": 1.4521459826756847e-06, + "loss": 0.5792, + "step": 14445 + }, + { + "epoch": 3.834992698791982, + "grad_norm": 0.4747068394867822, + "learning_rate": 1.4518290123705242e-06, + "loss": 0.5382, + "step": 14446 + }, + { + "epoch": 3.835258197265366, + "grad_norm": 0.4610826260007812, + "learning_rate": 1.4515120625072642e-06, + "loss": 0.5262, + "step": 14447 + }, + { + "epoch": 3.8355236957387495, + "grad_norm": 0.46857610022842733, + "learning_rate": 1.4511951330920854e-06, + "loss": 0.5127, + "step": 14448 + }, + { + "epoch": 3.835789194212133, + "grad_norm": 0.4536533350890589, + "learning_rate": 1.45087822413117e-06, + "loss": 0.5144, + "step": 14449 + }, + { + "epoch": 3.836054692685517, + "grad_norm": 0.4603118458162056, + "learning_rate": 1.4505613356306969e-06, + "loss": 0.5569, + "step": 14450 + }, + { + "epoch": 3.836320191158901, + "grad_norm": 0.46594922010807804, + "learning_rate": 1.4502444675968496e-06, + "loss": 0.531, + "step": 14451 + }, + { + "epoch": 3.8365856896322845, + "grad_norm": 0.47209478582647557, + "learning_rate": 1.4499276200358031e-06, + "loss": 0.5442, + "step": 14452 + }, + { + "epoch": 3.836851188105668, + "grad_norm": 0.45790910803150287, + "learning_rate": 1.4496107929537403e-06, + "loss": 0.534, + "step": 14453 + }, + { + "epoch": 3.8371166865790522, + "grad_norm": 0.47481131408810984, + "learning_rate": 1.4492939863568379e-06, + "loss": 0.5524, + "step": 14454 + }, + { + "epoch": 3.837382185052436, + "grad_norm": 0.4807311968012792, + "learning_rate": 1.4489772002512748e-06, + "loss": 0.5186, + "step": 14455 + }, + { + "epoch": 3.8376476835258195, + "grad_norm": 0.463560076994564, + "learning_rate": 1.4486604346432311e-06, + "loss": 0.5713, + "step": 14456 + }, + { + "epoch": 3.8379131819992036, + "grad_norm": 0.4739190493513621, + "learning_rate": 1.448343689538883e-06, + "loss": 0.5577, + "step": 14457 + }, + { + "epoch": 3.8381786804725873, + "grad_norm": 0.46897797591620927, + "learning_rate": 1.4480269649444078e-06, + "loss": 0.5099, + "step": 14458 + }, + { + "epoch": 3.838444178945971, + "grad_norm": 0.46093583781014796, + "learning_rate": 1.4477102608659816e-06, + "loss": 0.5237, + "step": 14459 + }, + { + "epoch": 3.838709677419355, + "grad_norm": 0.47881392450239413, + "learning_rate": 1.4473935773097824e-06, + "loss": 0.5502, + "step": 14460 + }, + { + "epoch": 3.8389751758927386, + "grad_norm": 0.47346760678605226, + "learning_rate": 1.4470769142819849e-06, + "loss": 0.5349, + "step": 14461 + }, + { + "epoch": 3.8392406743661223, + "grad_norm": 0.45337099630836036, + "learning_rate": 1.4467602717887667e-06, + "loss": 0.5477, + "step": 14462 + }, + { + "epoch": 3.8395061728395063, + "grad_norm": 0.4790657068986263, + "learning_rate": 1.4464436498363015e-06, + "loss": 0.5595, + "step": 14463 + }, + { + "epoch": 3.83977167131289, + "grad_norm": 0.47294628174352765, + "learning_rate": 1.4461270484307644e-06, + "loss": 0.5556, + "step": 14464 + }, + { + "epoch": 3.8400371697862736, + "grad_norm": 0.47324777868539475, + "learning_rate": 1.4458104675783308e-06, + "loss": 0.5304, + "step": 14465 + }, + { + "epoch": 3.8403026682596577, + "grad_norm": 0.47758335482577563, + "learning_rate": 1.445493907285173e-06, + "loss": 0.5674, + "step": 14466 + }, + { + "epoch": 3.8405681667330414, + "grad_norm": 0.47106589828473805, + "learning_rate": 1.445177367557467e-06, + "loss": 0.5449, + "step": 14467 + }, + { + "epoch": 3.840833665206425, + "grad_norm": 0.48431801935508106, + "learning_rate": 1.444860848401384e-06, + "loss": 0.5494, + "step": 14468 + }, + { + "epoch": 3.841099163679809, + "grad_norm": 0.4636042267192663, + "learning_rate": 1.4445443498231e-06, + "loss": 0.5735, + "step": 14469 + }, + { + "epoch": 3.8413646621531927, + "grad_norm": 0.47590359672857124, + "learning_rate": 1.4442278718287833e-06, + "loss": 0.589, + "step": 14470 + }, + { + "epoch": 3.8416301606265764, + "grad_norm": 0.4802790373688087, + "learning_rate": 1.443911414424609e-06, + "loss": 0.5282, + "step": 14471 + }, + { + "epoch": 3.8418956590999604, + "grad_norm": 0.46361325091453537, + "learning_rate": 1.4435949776167468e-06, + "loss": 0.5439, + "step": 14472 + }, + { + "epoch": 3.842161157573344, + "grad_norm": 0.46770859599294196, + "learning_rate": 1.4432785614113703e-06, + "loss": 0.5363, + "step": 14473 + }, + { + "epoch": 3.8424266560467277, + "grad_norm": 0.4585079705100281, + "learning_rate": 1.4429621658146482e-06, + "loss": 0.5165, + "step": 14474 + }, + { + "epoch": 3.8426921545201114, + "grad_norm": 0.46219808819634706, + "learning_rate": 1.4426457908327527e-06, + "loss": 0.5386, + "step": 14475 + }, + { + "epoch": 3.842957652993495, + "grad_norm": 0.4623280415737434, + "learning_rate": 1.4423294364718532e-06, + "loss": 0.5155, + "step": 14476 + }, + { + "epoch": 3.843223151466879, + "grad_norm": 0.46750998376957675, + "learning_rate": 1.4420131027381185e-06, + "loss": 0.5372, + "step": 14477 + }, + { + "epoch": 3.8434886499402627, + "grad_norm": 0.4801609342687644, + "learning_rate": 1.4416967896377192e-06, + "loss": 0.5085, + "step": 14478 + }, + { + "epoch": 3.8437541484136464, + "grad_norm": 0.451489283015974, + "learning_rate": 1.441380497176823e-06, + "loss": 0.56, + "step": 14479 + }, + { + "epoch": 3.8440196468870305, + "grad_norm": 0.45443244011911504, + "learning_rate": 1.4410642253616003e-06, + "loss": 0.5495, + "step": 14480 + }, + { + "epoch": 3.844285145360414, + "grad_norm": 0.48046781862994653, + "learning_rate": 1.4407479741982178e-06, + "loss": 0.5564, + "step": 14481 + }, + { + "epoch": 3.8445506438337977, + "grad_norm": 0.48152291067862224, + "learning_rate": 1.4404317436928423e-06, + "loss": 0.5564, + "step": 14482 + }, + { + "epoch": 3.844816142307182, + "grad_norm": 0.46260963055464, + "learning_rate": 1.4401155338516426e-06, + "loss": 0.5201, + "step": 14483 + }, + { + "epoch": 3.8450816407805655, + "grad_norm": 0.4682342478224062, + "learning_rate": 1.4397993446807844e-06, + "loss": 0.5263, + "step": 14484 + }, + { + "epoch": 3.845347139253949, + "grad_norm": 0.47557628930474133, + "learning_rate": 1.4394831761864359e-06, + "loss": 0.5634, + "step": 14485 + }, + { + "epoch": 3.845612637727333, + "grad_norm": 0.4670738646762263, + "learning_rate": 1.4391670283747618e-06, + "loss": 0.538, + "step": 14486 + }, + { + "epoch": 3.845878136200717, + "grad_norm": 0.4715891982322148, + "learning_rate": 1.4388509012519283e-06, + "loss": 0.554, + "step": 14487 + }, + { + "epoch": 3.8461436346741005, + "grad_norm": 0.45574556400426836, + "learning_rate": 1.438534794824099e-06, + "loss": 0.5299, + "step": 14488 + }, + { + "epoch": 3.8464091331474846, + "grad_norm": 0.4861468355702461, + "learning_rate": 1.4382187090974415e-06, + "loss": 0.5455, + "step": 14489 + }, + { + "epoch": 3.846674631620868, + "grad_norm": 0.45165950114669745, + "learning_rate": 1.437902644078118e-06, + "loss": 0.5726, + "step": 14490 + }, + { + "epoch": 3.846940130094252, + "grad_norm": 0.48619679649492226, + "learning_rate": 1.4375865997722937e-06, + "loss": 0.5475, + "step": 14491 + }, + { + "epoch": 3.847205628567636, + "grad_norm": 0.4780104130389211, + "learning_rate": 1.4372705761861325e-06, + "loss": 0.4853, + "step": 14492 + }, + { + "epoch": 3.8474711270410196, + "grad_norm": 0.4660728869490942, + "learning_rate": 1.4369545733257962e-06, + "loss": 0.5354, + "step": 14493 + }, + { + "epoch": 3.847736625514403, + "grad_norm": 0.4690404047889851, + "learning_rate": 1.436638591197449e-06, + "loss": 0.5351, + "step": 14494 + }, + { + "epoch": 3.8480021239877873, + "grad_norm": 0.455333183407981, + "learning_rate": 1.436322629807252e-06, + "loss": 0.5556, + "step": 14495 + }, + { + "epoch": 3.848267622461171, + "grad_norm": 0.4628174230390237, + "learning_rate": 1.4360066891613692e-06, + "loss": 0.5633, + "step": 14496 + }, + { + "epoch": 3.8485331209345546, + "grad_norm": 0.49102227684238586, + "learning_rate": 1.43569076926596e-06, + "loss": 0.5201, + "step": 14497 + }, + { + "epoch": 3.8487986194079387, + "grad_norm": 0.46614854090669466, + "learning_rate": 1.4353748701271885e-06, + "loss": 0.5479, + "step": 14498 + }, + { + "epoch": 3.8490641178813223, + "grad_norm": 0.491352301238081, + "learning_rate": 1.435058991751212e-06, + "loss": 0.5702, + "step": 14499 + }, + { + "epoch": 3.849329616354706, + "grad_norm": 0.4706510491831853, + "learning_rate": 1.4347431341441934e-06, + "loss": 0.4972, + "step": 14500 + }, + { + "epoch": 3.8495951148280896, + "grad_norm": 0.4646121401158364, + "learning_rate": 1.4344272973122909e-06, + "loss": 0.5461, + "step": 14501 + }, + { + "epoch": 3.8498606133014737, + "grad_norm": 0.46455496969584625, + "learning_rate": 1.434111481261665e-06, + "loss": 0.5474, + "step": 14502 + }, + { + "epoch": 3.8501261117748573, + "grad_norm": 0.4709938083723083, + "learning_rate": 1.433795685998477e-06, + "loss": 0.5204, + "step": 14503 + }, + { + "epoch": 3.850391610248241, + "grad_norm": 0.4970623959685443, + "learning_rate": 1.4334799115288816e-06, + "loss": 0.537, + "step": 14504 + }, + { + "epoch": 3.8506571087216246, + "grad_norm": 0.45683009922022233, + "learning_rate": 1.4331641578590405e-06, + "loss": 0.5064, + "step": 14505 + }, + { + "epoch": 3.8509226071950087, + "grad_norm": 0.47260997375881325, + "learning_rate": 1.432848424995109e-06, + "loss": 0.5559, + "step": 14506 + }, + { + "epoch": 3.8511881056683923, + "grad_norm": 0.47397194433524387, + "learning_rate": 1.4325327129432473e-06, + "loss": 0.5455, + "step": 14507 + }, + { + "epoch": 3.851453604141776, + "grad_norm": 0.48146172107955476, + "learning_rate": 1.4322170217096104e-06, + "loss": 0.5526, + "step": 14508 + }, + { + "epoch": 3.85171910261516, + "grad_norm": 0.452216970845305, + "learning_rate": 1.4319013513003565e-06, + "loss": 0.5486, + "step": 14509 + }, + { + "epoch": 3.8519846010885437, + "grad_norm": 0.48138527483949667, + "learning_rate": 1.4315857017216417e-06, + "loss": 0.558, + "step": 14510 + }, + { + "epoch": 3.8522500995619273, + "grad_norm": 0.47406008339555655, + "learning_rate": 1.4312700729796207e-06, + "loss": 0.5268, + "step": 14511 + }, + { + "epoch": 3.8525155980353114, + "grad_norm": 0.4519140120162252, + "learning_rate": 1.430954465080451e-06, + "loss": 0.5437, + "step": 14512 + }, + { + "epoch": 3.852781096508695, + "grad_norm": 0.46678061348580907, + "learning_rate": 1.4306388780302858e-06, + "loss": 0.5261, + "step": 14513 + }, + { + "epoch": 3.8530465949820787, + "grad_norm": 0.457884201921897, + "learning_rate": 1.4303233118352817e-06, + "loss": 0.5604, + "step": 14514 + }, + { + "epoch": 3.8533120934554628, + "grad_norm": 0.469108932427907, + "learning_rate": 1.4300077665015916e-06, + "loss": 0.5744, + "step": 14515 + }, + { + "epoch": 3.8535775919288464, + "grad_norm": 0.4762336786578173, + "learning_rate": 1.4296922420353705e-06, + "loss": 0.58, + "step": 14516 + }, + { + "epoch": 3.85384309040223, + "grad_norm": 0.47588247918152304, + "learning_rate": 1.4293767384427695e-06, + "loss": 0.5452, + "step": 14517 + }, + { + "epoch": 3.854108588875614, + "grad_norm": 0.4676911749308556, + "learning_rate": 1.429061255729945e-06, + "loss": 0.5294, + "step": 14518 + }, + { + "epoch": 3.854374087348998, + "grad_norm": 0.4799964495116778, + "learning_rate": 1.428745793903047e-06, + "loss": 0.5888, + "step": 14519 + }, + { + "epoch": 3.8546395858223814, + "grad_norm": 0.47179741497921146, + "learning_rate": 1.4284303529682297e-06, + "loss": 0.556, + "step": 14520 + }, + { + "epoch": 3.8549050842957655, + "grad_norm": 0.4669240938625809, + "learning_rate": 1.4281149329316442e-06, + "loss": 0.5177, + "step": 14521 + }, + { + "epoch": 3.855170582769149, + "grad_norm": 0.4641420336555434, + "learning_rate": 1.427799533799441e-06, + "loss": 0.5428, + "step": 14522 + }, + { + "epoch": 3.855436081242533, + "grad_norm": 0.4555473662121771, + "learning_rate": 1.4274841555777736e-06, + "loss": 0.5606, + "step": 14523 + }, + { + "epoch": 3.855701579715917, + "grad_norm": 0.47527342394282035, + "learning_rate": 1.4271687982727895e-06, + "loss": 0.5639, + "step": 14524 + }, + { + "epoch": 3.8559670781893005, + "grad_norm": 0.4533830929855282, + "learning_rate": 1.4268534618906415e-06, + "loss": 0.557, + "step": 14525 + }, + { + "epoch": 3.856232576662684, + "grad_norm": 0.4658551494045167, + "learning_rate": 1.4265381464374789e-06, + "loss": 0.5647, + "step": 14526 + }, + { + "epoch": 3.8564980751360682, + "grad_norm": 0.47112906264934346, + "learning_rate": 1.4262228519194498e-06, + "loss": 0.5317, + "step": 14527 + }, + { + "epoch": 3.856763573609452, + "grad_norm": 0.4556243613220827, + "learning_rate": 1.4259075783427048e-06, + "loss": 0.5391, + "step": 14528 + }, + { + "epoch": 3.8570290720828355, + "grad_norm": 0.4713431254775322, + "learning_rate": 1.425592325713391e-06, + "loss": 0.5436, + "step": 14529 + }, + { + "epoch": 3.857294570556219, + "grad_norm": 0.46917277435205174, + "learning_rate": 1.4252770940376587e-06, + "loss": 0.5559, + "step": 14530 + }, + { + "epoch": 3.8575600690296032, + "grad_norm": 0.460098556294013, + "learning_rate": 1.424961883321653e-06, + "loss": 0.5274, + "step": 14531 + }, + { + "epoch": 3.857825567502987, + "grad_norm": 0.4604312877004099, + "learning_rate": 1.424646693571525e-06, + "loss": 0.567, + "step": 14532 + }, + { + "epoch": 3.8580910659763705, + "grad_norm": 0.4749245116778108, + "learning_rate": 1.4243315247934173e-06, + "loss": 0.5543, + "step": 14533 + }, + { + "epoch": 3.858356564449754, + "grad_norm": 0.4753517947610403, + "learning_rate": 1.4240163769934796e-06, + "loss": 0.5548, + "step": 14534 + }, + { + "epoch": 3.8586220629231383, + "grad_norm": 0.46057543412474194, + "learning_rate": 1.4237012501778559e-06, + "loss": 0.5679, + "step": 14535 + }, + { + "epoch": 3.858887561396522, + "grad_norm": 0.46831632774078913, + "learning_rate": 1.4233861443526943e-06, + "loss": 0.5124, + "step": 14536 + }, + { + "epoch": 3.8591530598699055, + "grad_norm": 0.4625166646183859, + "learning_rate": 1.4230710595241376e-06, + "loss": 0.528, + "step": 14537 + }, + { + "epoch": 3.8594185583432896, + "grad_norm": 0.4579865720526916, + "learning_rate": 1.4227559956983332e-06, + "loss": 0.5123, + "step": 14538 + }, + { + "epoch": 3.8596840568166733, + "grad_norm": 0.4577274472522476, + "learning_rate": 1.4224409528814242e-06, + "loss": 0.58, + "step": 14539 + }, + { + "epoch": 3.859949555290057, + "grad_norm": 0.4635938974773529, + "learning_rate": 1.4221259310795542e-06, + "loss": 0.5502, + "step": 14540 + }, + { + "epoch": 3.860215053763441, + "grad_norm": 0.4669266858180894, + "learning_rate": 1.4218109302988687e-06, + "loss": 0.5685, + "step": 14541 + }, + { + "epoch": 3.8604805522368246, + "grad_norm": 0.4815379442617971, + "learning_rate": 1.421495950545509e-06, + "loss": 0.5319, + "step": 14542 + }, + { + "epoch": 3.8607460507102083, + "grad_norm": 0.46579120182554096, + "learning_rate": 1.4211809918256196e-06, + "loss": 0.5804, + "step": 14543 + }, + { + "epoch": 3.8610115491835924, + "grad_norm": 0.467292638195511, + "learning_rate": 1.4208660541453423e-06, + "loss": 0.569, + "step": 14544 + }, + { + "epoch": 3.861277047656976, + "grad_norm": 0.48179160825088924, + "learning_rate": 1.4205511375108188e-06, + "loss": 0.5557, + "step": 14545 + }, + { + "epoch": 3.8615425461303596, + "grad_norm": 0.4779179731973171, + "learning_rate": 1.4202362419281906e-06, + "loss": 0.569, + "step": 14546 + }, + { + "epoch": 3.8618080446037437, + "grad_norm": 0.457104363064252, + "learning_rate": 1.4199213674035994e-06, + "loss": 0.5423, + "step": 14547 + }, + { + "epoch": 3.8620735430771274, + "grad_norm": 0.46092670192163904, + "learning_rate": 1.4196065139431866e-06, + "loss": 0.5332, + "step": 14548 + }, + { + "epoch": 3.862339041550511, + "grad_norm": 0.4585835145816026, + "learning_rate": 1.4192916815530925e-06, + "loss": 0.5568, + "step": 14549 + }, + { + "epoch": 3.862604540023895, + "grad_norm": 0.4620136357378521, + "learning_rate": 1.4189768702394563e-06, + "loss": 0.519, + "step": 14550 + }, + { + "epoch": 3.8628700384972787, + "grad_norm": 0.45864056388057356, + "learning_rate": 1.4186620800084171e-06, + "loss": 0.5414, + "step": 14551 + }, + { + "epoch": 3.8631355369706624, + "grad_norm": 0.44807336374252454, + "learning_rate": 1.4183473108661159e-06, + "loss": 0.5161, + "step": 14552 + }, + { + "epoch": 3.8634010354440464, + "grad_norm": 0.48061621217202666, + "learning_rate": 1.4180325628186895e-06, + "loss": 0.5386, + "step": 14553 + }, + { + "epoch": 3.86366653391743, + "grad_norm": 0.4469776231650078, + "learning_rate": 1.4177178358722781e-06, + "loss": 0.5439, + "step": 14554 + }, + { + "epoch": 3.8639320323908137, + "grad_norm": 0.4700133305226297, + "learning_rate": 1.4174031300330187e-06, + "loss": 0.5669, + "step": 14555 + }, + { + "epoch": 3.8641975308641974, + "grad_norm": 0.469667653125786, + "learning_rate": 1.4170884453070482e-06, + "loss": 0.535, + "step": 14556 + }, + { + "epoch": 3.8644630293375815, + "grad_norm": 0.46710397011748295, + "learning_rate": 1.4167737817005054e-06, + "loss": 0.5358, + "step": 14557 + }, + { + "epoch": 3.864728527810965, + "grad_norm": 0.4688581128704027, + "learning_rate": 1.416459139219525e-06, + "loss": 0.5406, + "step": 14558 + }, + { + "epoch": 3.8649940262843487, + "grad_norm": 0.44771648381199225, + "learning_rate": 1.4161445178702454e-06, + "loss": 0.5444, + "step": 14559 + }, + { + "epoch": 3.8652595247577324, + "grad_norm": 0.49076397048980513, + "learning_rate": 1.4158299176588003e-06, + "loss": 0.5365, + "step": 14560 + }, + { + "epoch": 3.8655250232311165, + "grad_norm": 0.457903973364067, + "learning_rate": 1.4155153385913283e-06, + "loss": 0.5712, + "step": 14561 + }, + { + "epoch": 3.8657905217045, + "grad_norm": 0.46776419291963567, + "learning_rate": 1.4152007806739607e-06, + "loss": 0.525, + "step": 14562 + }, + { + "epoch": 3.8660560201778837, + "grad_norm": 0.4759045900853393, + "learning_rate": 1.4148862439128347e-06, + "loss": 0.5245, + "step": 14563 + }, + { + "epoch": 3.866321518651268, + "grad_norm": 0.46393932652777425, + "learning_rate": 1.4145717283140827e-06, + "loss": 0.5413, + "step": 14564 + }, + { + "epoch": 3.8665870171246515, + "grad_norm": 0.4684092891614069, + "learning_rate": 1.414257233883841e-06, + "loss": 0.5894, + "step": 14565 + }, + { + "epoch": 3.866852515598035, + "grad_norm": 0.48128400002742555, + "learning_rate": 1.4139427606282401e-06, + "loss": 0.5629, + "step": 14566 + }, + { + "epoch": 3.867118014071419, + "grad_norm": 0.4720521679192385, + "learning_rate": 1.4136283085534158e-06, + "loss": 0.586, + "step": 14567 + }, + { + "epoch": 3.867383512544803, + "grad_norm": 0.4698173089740331, + "learning_rate": 1.4133138776654993e-06, + "loss": 0.5533, + "step": 14568 + }, + { + "epoch": 3.8676490110181865, + "grad_norm": 0.4584665536373183, + "learning_rate": 1.412999467970622e-06, + "loss": 0.5095, + "step": 14569 + }, + { + "epoch": 3.8679145094915706, + "grad_norm": 0.46109944385299584, + "learning_rate": 1.4126850794749172e-06, + "loss": 0.55, + "step": 14570 + }, + { + "epoch": 3.868180007964954, + "grad_norm": 0.4626015167052272, + "learning_rate": 1.4123707121845148e-06, + "loss": 0.5653, + "step": 14571 + }, + { + "epoch": 3.868445506438338, + "grad_norm": 0.45827894692358045, + "learning_rate": 1.4120563661055472e-06, + "loss": 0.5439, + "step": 14572 + }, + { + "epoch": 3.868711004911722, + "grad_norm": 0.47474894652575855, + "learning_rate": 1.4117420412441447e-06, + "loss": 0.566, + "step": 14573 + }, + { + "epoch": 3.8689765033851056, + "grad_norm": 0.4647975689649319, + "learning_rate": 1.4114277376064357e-06, + "loss": 0.5378, + "step": 14574 + }, + { + "epoch": 3.869242001858489, + "grad_norm": 0.46221238287730937, + "learning_rate": 1.4111134551985516e-06, + "loss": 0.5716, + "step": 14575 + }, + { + "epoch": 3.8695075003318733, + "grad_norm": 0.47946702977695727, + "learning_rate": 1.4107991940266207e-06, + "loss": 0.5619, + "step": 14576 + }, + { + "epoch": 3.869772998805257, + "grad_norm": 0.4725087688956933, + "learning_rate": 1.4104849540967732e-06, + "loss": 0.5568, + "step": 14577 + }, + { + "epoch": 3.8700384972786406, + "grad_norm": 0.46783068147796747, + "learning_rate": 1.4101707354151367e-06, + "loss": 0.556, + "step": 14578 + }, + { + "epoch": 3.8703039957520247, + "grad_norm": 0.48317973303036454, + "learning_rate": 1.4098565379878393e-06, + "loss": 0.5337, + "step": 14579 + }, + { + "epoch": 3.8705694942254083, + "grad_norm": 0.47190204917526046, + "learning_rate": 1.4095423618210075e-06, + "loss": 0.5494, + "step": 14580 + }, + { + "epoch": 3.870834992698792, + "grad_norm": 0.4710787389006093, + "learning_rate": 1.4092282069207706e-06, + "loss": 0.5438, + "step": 14581 + }, + { + "epoch": 3.871100491172176, + "grad_norm": 0.45138610046654265, + "learning_rate": 1.4089140732932533e-06, + "loss": 0.5271, + "step": 14582 + }, + { + "epoch": 3.8713659896455597, + "grad_norm": 0.4514330753956673, + "learning_rate": 1.4085999609445838e-06, + "loss": 0.5455, + "step": 14583 + }, + { + "epoch": 3.8716314881189433, + "grad_norm": 0.4695342622796812, + "learning_rate": 1.4082858698808876e-06, + "loss": 0.5377, + "step": 14584 + }, + { + "epoch": 3.871896986592327, + "grad_norm": 0.4630429783151699, + "learning_rate": 1.4079718001082888e-06, + "loss": 0.5425, + "step": 14585 + }, + { + "epoch": 3.872162485065711, + "grad_norm": 0.482889101243248, + "learning_rate": 1.4076577516329143e-06, + "loss": 0.5619, + "step": 14586 + }, + { + "epoch": 3.8724279835390947, + "grad_norm": 0.4606513131518334, + "learning_rate": 1.4073437244608878e-06, + "loss": 0.5299, + "step": 14587 + }, + { + "epoch": 3.8726934820124783, + "grad_norm": 0.4626473748109309, + "learning_rate": 1.4070297185983344e-06, + "loss": 0.5556, + "step": 14588 + }, + { + "epoch": 3.872958980485862, + "grad_norm": 0.45200956060469094, + "learning_rate": 1.4067157340513767e-06, + "loss": 0.5275, + "step": 14589 + }, + { + "epoch": 3.873224478959246, + "grad_norm": 0.4723308718625095, + "learning_rate": 1.4064017708261414e-06, + "loss": 0.5583, + "step": 14590 + }, + { + "epoch": 3.8734899774326297, + "grad_norm": 0.47588599151662264, + "learning_rate": 1.4060878289287467e-06, + "loss": 0.5638, + "step": 14591 + }, + { + "epoch": 3.8737554759060133, + "grad_norm": 0.47446160133502857, + "learning_rate": 1.4057739083653176e-06, + "loss": 0.5717, + "step": 14592 + }, + { + "epoch": 3.8740209743793974, + "grad_norm": 0.48387548471050923, + "learning_rate": 1.405460009141978e-06, + "loss": 0.5557, + "step": 14593 + }, + { + "epoch": 3.874286472852781, + "grad_norm": 0.45927624963912456, + "learning_rate": 1.4051461312648465e-06, + "loss": 0.5574, + "step": 14594 + }, + { + "epoch": 3.8745519713261647, + "grad_norm": 0.45610238706752826, + "learning_rate": 1.4048322747400472e-06, + "loss": 0.5347, + "step": 14595 + }, + { + "epoch": 3.8748174697995488, + "grad_norm": 0.4680973971337468, + "learning_rate": 1.4045184395737004e-06, + "loss": 0.5308, + "step": 14596 + }, + { + "epoch": 3.8750829682729324, + "grad_norm": 0.4571717387097576, + "learning_rate": 1.404204625771926e-06, + "loss": 0.573, + "step": 14597 + }, + { + "epoch": 3.875348466746316, + "grad_norm": 0.4759557863354639, + "learning_rate": 1.4038908333408432e-06, + "loss": 0.5791, + "step": 14598 + }, + { + "epoch": 3.8756139652197, + "grad_norm": 0.46506543551314217, + "learning_rate": 1.4035770622865741e-06, + "loss": 0.5415, + "step": 14599 + }, + { + "epoch": 3.875879463693084, + "grad_norm": 0.4679457645941911, + "learning_rate": 1.4032633126152357e-06, + "loss": 0.5566, + "step": 14600 + }, + { + "epoch": 3.8761449621664674, + "grad_norm": 0.44938418925165097, + "learning_rate": 1.4029495843329488e-06, + "loss": 0.549, + "step": 14601 + }, + { + "epoch": 3.8764104606398515, + "grad_norm": 0.45220553531263163, + "learning_rate": 1.4026358774458315e-06, + "loss": 0.5227, + "step": 14602 + }, + { + "epoch": 3.876675959113235, + "grad_norm": 0.46582176291676886, + "learning_rate": 1.40232219196e-06, + "loss": 0.5303, + "step": 14603 + }, + { + "epoch": 3.876941457586619, + "grad_norm": 0.4619228670997488, + "learning_rate": 1.4020085278815745e-06, + "loss": 0.5704, + "step": 14604 + }, + { + "epoch": 3.877206956060003, + "grad_norm": 0.47270917262846524, + "learning_rate": 1.40169488521667e-06, + "loss": 0.5606, + "step": 14605 + }, + { + "epoch": 3.8774724545333865, + "grad_norm": 0.45156577527593345, + "learning_rate": 1.4013812639714053e-06, + "loss": 0.5514, + "step": 14606 + }, + { + "epoch": 3.87773795300677, + "grad_norm": 0.4591387232748298, + "learning_rate": 1.4010676641518961e-06, + "loss": 0.5178, + "step": 14607 + }, + { + "epoch": 3.8780034514801542, + "grad_norm": 0.4598402067380847, + "learning_rate": 1.4007540857642577e-06, + "loss": 0.526, + "step": 14608 + }, + { + "epoch": 3.878268949953538, + "grad_norm": 0.4567263766755188, + "learning_rate": 1.4004405288146056e-06, + "loss": 0.5452, + "step": 14609 + }, + { + "epoch": 3.8785344484269215, + "grad_norm": 0.47399802519519074, + "learning_rate": 1.4001269933090558e-06, + "loss": 0.574, + "step": 14610 + }, + { + "epoch": 3.878799946900305, + "grad_norm": 0.4832410721172336, + "learning_rate": 1.399813479253722e-06, + "loss": 0.5753, + "step": 14611 + }, + { + "epoch": 3.8790654453736892, + "grad_norm": 0.4605716009281784, + "learning_rate": 1.3994999866547199e-06, + "loss": 0.5551, + "step": 14612 + }, + { + "epoch": 3.879330943847073, + "grad_norm": 0.47817868653202517, + "learning_rate": 1.3991865155181622e-06, + "loss": 0.5477, + "step": 14613 + }, + { + "epoch": 3.8795964423204565, + "grad_norm": 0.4807410134645125, + "learning_rate": 1.398873065850162e-06, + "loss": 0.5419, + "step": 14614 + }, + { + "epoch": 3.87986194079384, + "grad_norm": 0.4635422867732951, + "learning_rate": 1.3985596376568345e-06, + "loss": 0.5799, + "step": 14615 + }, + { + "epoch": 3.8801274392672243, + "grad_norm": 0.49090881298409933, + "learning_rate": 1.398246230944289e-06, + "loss": 0.5194, + "step": 14616 + }, + { + "epoch": 3.880392937740608, + "grad_norm": 0.47559051214039766, + "learning_rate": 1.3979328457186414e-06, + "loss": 0.5652, + "step": 14617 + }, + { + "epoch": 3.8806584362139915, + "grad_norm": 0.46196078691913994, + "learning_rate": 1.3976194819860002e-06, + "loss": 0.5202, + "step": 14618 + }, + { + "epoch": 3.8809239346873756, + "grad_norm": 0.46768709326986047, + "learning_rate": 1.397306139752479e-06, + "loss": 0.5331, + "step": 14619 + }, + { + "epoch": 3.8811894331607593, + "grad_norm": 0.4955741192236411, + "learning_rate": 1.3969928190241883e-06, + "loss": 0.5208, + "step": 14620 + }, + { + "epoch": 3.881454931634143, + "grad_norm": 0.5233763873355358, + "learning_rate": 1.3966795198072368e-06, + "loss": 0.5526, + "step": 14621 + }, + { + "epoch": 3.881720430107527, + "grad_norm": 0.4970407519276715, + "learning_rate": 1.3963662421077375e-06, + "loss": 0.5559, + "step": 14622 + }, + { + "epoch": 3.8819859285809106, + "grad_norm": 0.4418444968684636, + "learning_rate": 1.3960529859317975e-06, + "loss": 0.4978, + "step": 14623 + }, + { + "epoch": 3.8822514270542943, + "grad_norm": 0.4728863184738252, + "learning_rate": 1.3957397512855292e-06, + "loss": 0.5227, + "step": 14624 + }, + { + "epoch": 3.8825169255276784, + "grad_norm": 0.49615760807246234, + "learning_rate": 1.3954265381750376e-06, + "loss": 0.537, + "step": 14625 + }, + { + "epoch": 3.882782424001062, + "grad_norm": 0.47594930561159454, + "learning_rate": 1.3951133466064338e-06, + "loss": 0.5344, + "step": 14626 + }, + { + "epoch": 3.8830479224744456, + "grad_norm": 0.43243043685319377, + "learning_rate": 1.394800176585824e-06, + "loss": 0.529, + "step": 14627 + }, + { + "epoch": 3.8833134209478297, + "grad_norm": 0.4769097540956097, + "learning_rate": 1.3944870281193178e-06, + "loss": 0.5297, + "step": 14628 + }, + { + "epoch": 3.8835789194212134, + "grad_norm": 0.46903828762799354, + "learning_rate": 1.3941739012130201e-06, + "loss": 0.5988, + "step": 14629 + }, + { + "epoch": 3.883844417894597, + "grad_norm": 0.48002657148983124, + "learning_rate": 1.3938607958730401e-06, + "loss": 0.5218, + "step": 14630 + }, + { + "epoch": 3.884109916367981, + "grad_norm": 0.45161501279156036, + "learning_rate": 1.393547712105483e-06, + "loss": 0.5094, + "step": 14631 + }, + { + "epoch": 3.8843754148413647, + "grad_norm": 0.4631617045904441, + "learning_rate": 1.3932346499164535e-06, + "loss": 0.5152, + "step": 14632 + }, + { + "epoch": 3.8846409133147484, + "grad_norm": 0.47897668056463677, + "learning_rate": 1.3929216093120588e-06, + "loss": 0.5548, + "step": 14633 + }, + { + "epoch": 3.8849064117881325, + "grad_norm": 0.4564437826538093, + "learning_rate": 1.3926085902984028e-06, + "loss": 0.5471, + "step": 14634 + }, + { + "epoch": 3.885171910261516, + "grad_norm": 0.47695954296621607, + "learning_rate": 1.3922955928815911e-06, + "loss": 0.5581, + "step": 14635 + }, + { + "epoch": 3.8854374087348997, + "grad_norm": 0.46393842216661885, + "learning_rate": 1.3919826170677278e-06, + "loss": 0.5384, + "step": 14636 + }, + { + "epoch": 3.885702907208284, + "grad_norm": 0.4725223226359163, + "learning_rate": 1.3916696628629153e-06, + "loss": 0.5502, + "step": 14637 + }, + { + "epoch": 3.8859684056816675, + "grad_norm": 0.46169768282044543, + "learning_rate": 1.3913567302732588e-06, + "loss": 0.5725, + "step": 14638 + }, + { + "epoch": 3.886233904155051, + "grad_norm": 0.48043285542458675, + "learning_rate": 1.3910438193048598e-06, + "loss": 0.5453, + "step": 14639 + }, + { + "epoch": 3.8864994026284347, + "grad_norm": 0.460069084328353, + "learning_rate": 1.3907309299638229e-06, + "loss": 0.5177, + "step": 14640 + }, + { + "epoch": 3.886764901101819, + "grad_norm": 0.4806198299168726, + "learning_rate": 1.390418062256247e-06, + "loss": 0.5744, + "step": 14641 + }, + { + "epoch": 3.8870303995752025, + "grad_norm": 0.4625539970677221, + "learning_rate": 1.3901052161882382e-06, + "loss": 0.5472, + "step": 14642 + }, + { + "epoch": 3.887295898048586, + "grad_norm": 0.4662855328497534, + "learning_rate": 1.389792391765893e-06, + "loss": 0.562, + "step": 14643 + }, + { + "epoch": 3.8875613965219697, + "grad_norm": 0.46775846636456314, + "learning_rate": 1.3894795889953152e-06, + "loss": 0.5752, + "step": 14644 + }, + { + "epoch": 3.887826894995354, + "grad_norm": 0.47299443247000866, + "learning_rate": 1.3891668078826036e-06, + "loss": 0.5658, + "step": 14645 + }, + { + "epoch": 3.8880923934687375, + "grad_norm": 0.46865238246738633, + "learning_rate": 1.3888540484338604e-06, + "loss": 0.5484, + "step": 14646 + }, + { + "epoch": 3.888357891942121, + "grad_norm": 0.45843471154084525, + "learning_rate": 1.3885413106551832e-06, + "loss": 0.5454, + "step": 14647 + }, + { + "epoch": 3.888623390415505, + "grad_norm": 0.4532309740362265, + "learning_rate": 1.388228594552671e-06, + "loss": 0.5506, + "step": 14648 + }, + { + "epoch": 3.888888888888889, + "grad_norm": 0.4717129902575639, + "learning_rate": 1.3879159001324244e-06, + "loss": 0.5379, + "step": 14649 + }, + { + "epoch": 3.8891543873622725, + "grad_norm": 0.4661341063159216, + "learning_rate": 1.3876032274005397e-06, + "loss": 0.5183, + "step": 14650 + }, + { + "epoch": 3.8894198858356566, + "grad_norm": 0.46499082008360926, + "learning_rate": 1.3872905763631162e-06, + "loss": 0.5495, + "step": 14651 + }, + { + "epoch": 3.88968538430904, + "grad_norm": 0.4603289353528457, + "learning_rate": 1.3869779470262501e-06, + "loss": 0.5331, + "step": 14652 + }, + { + "epoch": 3.889950882782424, + "grad_norm": 0.47228495258726216, + "learning_rate": 1.3866653393960414e-06, + "loss": 0.5774, + "step": 14653 + }, + { + "epoch": 3.890216381255808, + "grad_norm": 0.45824429091559105, + "learning_rate": 1.3863527534785823e-06, + "loss": 0.5507, + "step": 14654 + }, + { + "epoch": 3.8904818797291916, + "grad_norm": 0.47484075317806473, + "learning_rate": 1.386040189279972e-06, + "loss": 0.5446, + "step": 14655 + }, + { + "epoch": 3.890747378202575, + "grad_norm": 0.45598208266014634, + "learning_rate": 1.3857276468063047e-06, + "loss": 0.5555, + "step": 14656 + }, + { + "epoch": 3.8910128766759593, + "grad_norm": 0.47959725027240613, + "learning_rate": 1.3854151260636777e-06, + "loss": 0.5282, + "step": 14657 + }, + { + "epoch": 3.891278375149343, + "grad_norm": 0.46013834574993473, + "learning_rate": 1.3851026270581835e-06, + "loss": 0.5549, + "step": 14658 + }, + { + "epoch": 3.8915438736227266, + "grad_norm": 0.47313477684873456, + "learning_rate": 1.3847901497959188e-06, + "loss": 0.5429, + "step": 14659 + }, + { + "epoch": 3.8918093720961107, + "grad_norm": 0.476838494579563, + "learning_rate": 1.3844776942829766e-06, + "loss": 0.5231, + "step": 14660 + }, + { + "epoch": 3.8920748705694943, + "grad_norm": 0.47064964081203203, + "learning_rate": 1.3841652605254497e-06, + "loss": 0.5802, + "step": 14661 + }, + { + "epoch": 3.892340369042878, + "grad_norm": 0.46724405130390656, + "learning_rate": 1.3838528485294333e-06, + "loss": 0.5878, + "step": 14662 + }, + { + "epoch": 3.892605867516262, + "grad_norm": 0.469479644671558, + "learning_rate": 1.3835404583010181e-06, + "loss": 0.5687, + "step": 14663 + }, + { + "epoch": 3.8928713659896457, + "grad_norm": 0.48722486070806914, + "learning_rate": 1.3832280898462984e-06, + "loss": 0.5779, + "step": 14664 + }, + { + "epoch": 3.8931368644630293, + "grad_norm": 0.4824443942501269, + "learning_rate": 1.3829157431713655e-06, + "loss": 0.5495, + "step": 14665 + }, + { + "epoch": 3.893402362936413, + "grad_norm": 0.46940619450233945, + "learning_rate": 1.3826034182823095e-06, + "loss": 0.5599, + "step": 14666 + }, + { + "epoch": 3.893667861409797, + "grad_norm": 0.47881562288195634, + "learning_rate": 1.3822911151852238e-06, + "loss": 0.5768, + "step": 14667 + }, + { + "epoch": 3.8939333598831807, + "grad_norm": 0.47096488523874175, + "learning_rate": 1.3819788338861973e-06, + "loss": 0.5698, + "step": 14668 + }, + { + "epoch": 3.8941988583565643, + "grad_norm": 0.48081977540503884, + "learning_rate": 1.3816665743913215e-06, + "loss": 0.5291, + "step": 14669 + }, + { + "epoch": 3.894464356829948, + "grad_norm": 0.47604222578883354, + "learning_rate": 1.3813543367066857e-06, + "loss": 0.5644, + "step": 14670 + }, + { + "epoch": 3.894729855303332, + "grad_norm": 0.4716669152894754, + "learning_rate": 1.3810421208383796e-06, + "loss": 0.5548, + "step": 14671 + }, + { + "epoch": 3.8949953537767157, + "grad_norm": 0.459527186825024, + "learning_rate": 1.3807299267924903e-06, + "loss": 0.558, + "step": 14672 + }, + { + "epoch": 3.8952608522500993, + "grad_norm": 0.47570077369854546, + "learning_rate": 1.380417754575109e-06, + "loss": 0.5201, + "step": 14673 + }, + { + "epoch": 3.8955263507234834, + "grad_norm": 0.47964365079636573, + "learning_rate": 1.380105604192322e-06, + "loss": 0.5423, + "step": 14674 + }, + { + "epoch": 3.895791849196867, + "grad_norm": 0.47107753609713254, + "learning_rate": 1.3797934756502185e-06, + "loss": 0.5801, + "step": 14675 + }, + { + "epoch": 3.8960573476702507, + "grad_norm": 0.4734145644408046, + "learning_rate": 1.3794813689548846e-06, + "loss": 0.5708, + "step": 14676 + }, + { + "epoch": 3.8963228461436348, + "grad_norm": 0.45209016135240243, + "learning_rate": 1.3791692841124072e-06, + "loss": 0.4716, + "step": 14677 + }, + { + "epoch": 3.8965883446170184, + "grad_norm": 0.4763351749587214, + "learning_rate": 1.3788572211288737e-06, + "loss": 0.5699, + "step": 14678 + }, + { + "epoch": 3.896853843090402, + "grad_norm": 0.4622684200439051, + "learning_rate": 1.3785451800103683e-06, + "loss": 0.5181, + "step": 14679 + }, + { + "epoch": 3.897119341563786, + "grad_norm": 0.45861001528541717, + "learning_rate": 1.378233160762979e-06, + "loss": 0.5743, + "step": 14680 + }, + { + "epoch": 3.89738484003717, + "grad_norm": 0.4876186019673994, + "learning_rate": 1.3779211633927886e-06, + "loss": 0.5922, + "step": 14681 + }, + { + "epoch": 3.8976503385105534, + "grad_norm": 0.46860008132698133, + "learning_rate": 1.3776091879058845e-06, + "loss": 0.5393, + "step": 14682 + }, + { + "epoch": 3.8979158369839375, + "grad_norm": 0.459396805439591, + "learning_rate": 1.3772972343083474e-06, + "loss": 0.5375, + "step": 14683 + }, + { + "epoch": 3.898181335457321, + "grad_norm": 0.4767621731472709, + "learning_rate": 1.3769853026062631e-06, + "loss": 0.5705, + "step": 14684 + }, + { + "epoch": 3.898446833930705, + "grad_norm": 0.47236867147316836, + "learning_rate": 1.3766733928057163e-06, + "loss": 0.5424, + "step": 14685 + }, + { + "epoch": 3.898712332404089, + "grad_norm": 0.4659832277113816, + "learning_rate": 1.3763615049127877e-06, + "loss": 0.5331, + "step": 14686 + }, + { + "epoch": 3.8989778308774725, + "grad_norm": 0.47957665241764624, + "learning_rate": 1.3760496389335615e-06, + "loss": 0.5672, + "step": 14687 + }, + { + "epoch": 3.899243329350856, + "grad_norm": 0.4621717170143549, + "learning_rate": 1.3757377948741193e-06, + "loss": 0.5577, + "step": 14688 + }, + { + "epoch": 3.8995088278242402, + "grad_norm": 0.4773324531027125, + "learning_rate": 1.3754259727405429e-06, + "loss": 0.5652, + "step": 14689 + }, + { + "epoch": 3.899774326297624, + "grad_norm": 0.4391500050693001, + "learning_rate": 1.3751141725389128e-06, + "loss": 0.5528, + "step": 14690 + }, + { + "epoch": 3.9000398247710075, + "grad_norm": 0.4710193575485356, + "learning_rate": 1.3748023942753109e-06, + "loss": 0.5366, + "step": 14691 + }, + { + "epoch": 3.9003053232443916, + "grad_norm": 0.4924782222550652, + "learning_rate": 1.3744906379558165e-06, + "loss": 0.526, + "step": 14692 + }, + { + "epoch": 3.9005708217177752, + "grad_norm": 0.48019451054364637, + "learning_rate": 1.3741789035865116e-06, + "loss": 0.5331, + "step": 14693 + }, + { + "epoch": 3.900836320191159, + "grad_norm": 0.4737861642442203, + "learning_rate": 1.3738671911734747e-06, + "loss": 0.5509, + "step": 14694 + }, + { + "epoch": 3.9011018186645425, + "grad_norm": 0.4821552896576482, + "learning_rate": 1.3735555007227832e-06, + "loss": 0.5284, + "step": 14695 + }, + { + "epoch": 3.9013673171379266, + "grad_norm": 0.46493916851530914, + "learning_rate": 1.373243832240519e-06, + "loss": 0.5074, + "step": 14696 + }, + { + "epoch": 3.9016328156113103, + "grad_norm": 0.48142185346293015, + "learning_rate": 1.3729321857327576e-06, + "loss": 0.5471, + "step": 14697 + }, + { + "epoch": 3.901898314084694, + "grad_norm": 0.4679003214755217, + "learning_rate": 1.3726205612055793e-06, + "loss": 0.5292, + "step": 14698 + }, + { + "epoch": 3.9021638125580775, + "grad_norm": 0.46199303878001674, + "learning_rate": 1.37230895866506e-06, + "loss": 0.5425, + "step": 14699 + }, + { + "epoch": 3.9024293110314616, + "grad_norm": 0.4650254977365548, + "learning_rate": 1.3719973781172774e-06, + "loss": 0.5266, + "step": 14700 + }, + { + "epoch": 3.9026948095048453, + "grad_norm": 0.4599529816970895, + "learning_rate": 1.3716858195683064e-06, + "loss": 0.5501, + "step": 14701 + }, + { + "epoch": 3.902960307978229, + "grad_norm": 0.47174524987245536, + "learning_rate": 1.371374283024226e-06, + "loss": 0.5196, + "step": 14702 + }, + { + "epoch": 3.903225806451613, + "grad_norm": 0.4860061819626319, + "learning_rate": 1.3710627684911092e-06, + "loss": 0.5455, + "step": 14703 + }, + { + "epoch": 3.9034913049249966, + "grad_norm": 0.4647455081266664, + "learning_rate": 1.3707512759750324e-06, + "loss": 0.5011, + "step": 14704 + }, + { + "epoch": 3.9037568033983803, + "grad_norm": 0.4632936640194412, + "learning_rate": 1.3704398054820728e-06, + "loss": 0.5645, + "step": 14705 + }, + { + "epoch": 3.9040223018717644, + "grad_norm": 0.47138748801159297, + "learning_rate": 1.3701283570183004e-06, + "loss": 0.5351, + "step": 14706 + }, + { + "epoch": 3.904287800345148, + "grad_norm": 0.4748584263480536, + "learning_rate": 1.3698169305897926e-06, + "loss": 0.5416, + "step": 14707 + }, + { + "epoch": 3.9045532988185316, + "grad_norm": 0.4731263686579328, + "learning_rate": 1.3695055262026208e-06, + "loss": 0.5676, + "step": 14708 + }, + { + "epoch": 3.9048187972919157, + "grad_norm": 0.45485223735229724, + "learning_rate": 1.3691941438628604e-06, + "loss": 0.5306, + "step": 14709 + }, + { + "epoch": 3.9050842957652994, + "grad_norm": 0.46877002053759415, + "learning_rate": 1.3688827835765817e-06, + "loss": 0.531, + "step": 14710 + }, + { + "epoch": 3.905349794238683, + "grad_norm": 0.46882009389509993, + "learning_rate": 1.3685714453498589e-06, + "loss": 0.5317, + "step": 14711 + }, + { + "epoch": 3.905615292712067, + "grad_norm": 0.4927628202076177, + "learning_rate": 1.3682601291887632e-06, + "loss": 0.5696, + "step": 14712 + }, + { + "epoch": 3.9058807911854507, + "grad_norm": 0.46806782904094213, + "learning_rate": 1.3679488350993653e-06, + "loss": 0.5307, + "step": 14713 + }, + { + "epoch": 3.9061462896588344, + "grad_norm": 0.4659189979546424, + "learning_rate": 1.3676375630877376e-06, + "loss": 0.5101, + "step": 14714 + }, + { + "epoch": 3.9064117881322185, + "grad_norm": 0.46741460357093256, + "learning_rate": 1.3673263131599491e-06, + "loss": 0.5593, + "step": 14715 + }, + { + "epoch": 3.906677286605602, + "grad_norm": 0.4683662134335134, + "learning_rate": 1.3670150853220716e-06, + "loss": 0.5499, + "step": 14716 + }, + { + "epoch": 3.9069427850789857, + "grad_norm": 0.48825242576534766, + "learning_rate": 1.3667038795801741e-06, + "loss": 0.5333, + "step": 14717 + }, + { + "epoch": 3.90720828355237, + "grad_norm": 0.467194376278856, + "learning_rate": 1.3663926959403256e-06, + "loss": 0.551, + "step": 14718 + }, + { + "epoch": 3.9074737820257535, + "grad_norm": 0.4707464622147138, + "learning_rate": 1.366081534408594e-06, + "loss": 0.5506, + "step": 14719 + }, + { + "epoch": 3.907739280499137, + "grad_norm": 0.4633981052191705, + "learning_rate": 1.36577039499105e-06, + "loss": 0.5324, + "step": 14720 + }, + { + "epoch": 3.9080047789725207, + "grad_norm": 0.49972504157934, + "learning_rate": 1.3654592776937592e-06, + "loss": 0.5666, + "step": 14721 + }, + { + "epoch": 3.908270277445905, + "grad_norm": 0.48421220256866543, + "learning_rate": 1.3651481825227914e-06, + "loss": 0.5377, + "step": 14722 + }, + { + "epoch": 3.9085357759192885, + "grad_norm": 0.46252709166324507, + "learning_rate": 1.3648371094842128e-06, + "loss": 0.5494, + "step": 14723 + }, + { + "epoch": 3.908801274392672, + "grad_norm": 0.4655571775981846, + "learning_rate": 1.3645260585840886e-06, + "loss": 0.5442, + "step": 14724 + }, + { + "epoch": 3.9090667728660558, + "grad_norm": 0.45749525360659066, + "learning_rate": 1.3642150298284876e-06, + "loss": 0.5408, + "step": 14725 + }, + { + "epoch": 3.90933227133944, + "grad_norm": 0.47346948334622846, + "learning_rate": 1.3639040232234734e-06, + "loss": 0.547, + "step": 14726 + }, + { + "epoch": 3.9095977698128235, + "grad_norm": 0.45563439821141155, + "learning_rate": 1.3635930387751134e-06, + "loss": 0.5444, + "step": 14727 + }, + { + "epoch": 3.909863268286207, + "grad_norm": 0.47627641554470057, + "learning_rate": 1.3632820764894715e-06, + "loss": 0.5614, + "step": 14728 + }, + { + "epoch": 3.910128766759591, + "grad_norm": 0.46449916185390694, + "learning_rate": 1.362971136372611e-06, + "loss": 0.5546, + "step": 14729 + }, + { + "epoch": 3.910394265232975, + "grad_norm": 0.47105299584319793, + "learning_rate": 1.3626602184305987e-06, + "loss": 0.5421, + "step": 14730 + }, + { + "epoch": 3.9106597637063585, + "grad_norm": 0.46411543357985235, + "learning_rate": 1.3623493226694955e-06, + "loss": 0.542, + "step": 14731 + }, + { + "epoch": 3.9109252621797426, + "grad_norm": 0.4534898392466617, + "learning_rate": 1.3620384490953669e-06, + "loss": 0.5155, + "step": 14732 + }, + { + "epoch": 3.911190760653126, + "grad_norm": 0.4691154059876752, + "learning_rate": 1.3617275977142742e-06, + "loss": 0.5414, + "step": 14733 + }, + { + "epoch": 3.91145625912651, + "grad_norm": 0.47389648575515175, + "learning_rate": 1.3614167685322821e-06, + "loss": 0.5665, + "step": 14734 + }, + { + "epoch": 3.911721757599894, + "grad_norm": 0.47715441810130527, + "learning_rate": 1.3611059615554487e-06, + "loss": 0.565, + "step": 14735 + }, + { + "epoch": 3.9119872560732776, + "grad_norm": 0.470702503721107, + "learning_rate": 1.3607951767898386e-06, + "loss": 0.5513, + "step": 14736 + }, + { + "epoch": 3.912252754546661, + "grad_norm": 0.4606711704956551, + "learning_rate": 1.3604844142415108e-06, + "loss": 0.5188, + "step": 14737 + }, + { + "epoch": 3.9125182530200453, + "grad_norm": 0.4805378115092284, + "learning_rate": 1.3601736739165278e-06, + "loss": 0.5436, + "step": 14738 + }, + { + "epoch": 3.912783751493429, + "grad_norm": 0.4682016375021728, + "learning_rate": 1.359862955820948e-06, + "loss": 0.5516, + "step": 14739 + }, + { + "epoch": 3.9130492499668126, + "grad_norm": 0.46870657749539757, + "learning_rate": 1.3595522599608333e-06, + "loss": 0.5714, + "step": 14740 + }, + { + "epoch": 3.9133147484401967, + "grad_norm": 0.47012056161416405, + "learning_rate": 1.3592415863422415e-06, + "loss": 0.5523, + "step": 14741 + }, + { + "epoch": 3.9135802469135803, + "grad_norm": 0.4606365008269365, + "learning_rate": 1.358930934971231e-06, + "loss": 0.5331, + "step": 14742 + }, + { + "epoch": 3.913845745386964, + "grad_norm": 0.45945994973378745, + "learning_rate": 1.3586203058538622e-06, + "loss": 0.5457, + "step": 14743 + }, + { + "epoch": 3.914111243860348, + "grad_norm": 0.4517789628173039, + "learning_rate": 1.3583096989961908e-06, + "loss": 0.5674, + "step": 14744 + }, + { + "epoch": 3.9143767423337317, + "grad_norm": 0.4793254762824073, + "learning_rate": 1.3579991144042775e-06, + "loss": 0.5808, + "step": 14745 + }, + { + "epoch": 3.9146422408071153, + "grad_norm": 0.4649389067770687, + "learning_rate": 1.357688552084176e-06, + "loss": 0.5673, + "step": 14746 + }, + { + "epoch": 3.9149077392804994, + "grad_norm": 0.45690113761182305, + "learning_rate": 1.3573780120419455e-06, + "loss": 0.5459, + "step": 14747 + }, + { + "epoch": 3.915173237753883, + "grad_norm": 0.4545940354472747, + "learning_rate": 1.3570674942836401e-06, + "loss": 0.5472, + "step": 14748 + }, + { + "epoch": 3.9154387362272667, + "grad_norm": 0.4878460082851874, + "learning_rate": 1.3567569988153172e-06, + "loss": 0.5089, + "step": 14749 + }, + { + "epoch": 3.9157042347006503, + "grad_norm": 0.44989938921248396, + "learning_rate": 1.3564465256430328e-06, + "loss": 0.5473, + "step": 14750 + }, + { + "epoch": 3.9159697331740344, + "grad_norm": 0.4681212086887664, + "learning_rate": 1.3561360747728411e-06, + "loss": 0.5552, + "step": 14751 + }, + { + "epoch": 3.916235231647418, + "grad_norm": 0.47618175085230485, + "learning_rate": 1.3558256462107965e-06, + "loss": 0.5645, + "step": 14752 + }, + { + "epoch": 3.9165007301208017, + "grad_norm": 0.46681277787031755, + "learning_rate": 1.3555152399629523e-06, + "loss": 0.5805, + "step": 14753 + }, + { + "epoch": 3.9167662285941853, + "grad_norm": 0.475974934513959, + "learning_rate": 1.3552048560353643e-06, + "loss": 0.5596, + "step": 14754 + }, + { + "epoch": 3.9170317270675694, + "grad_norm": 0.46687103599044416, + "learning_rate": 1.3548944944340833e-06, + "loss": 0.5534, + "step": 14755 + }, + { + "epoch": 3.917297225540953, + "grad_norm": 0.4638734034082637, + "learning_rate": 1.3545841551651646e-06, + "loss": 0.5303, + "step": 14756 + }, + { + "epoch": 3.9175627240143367, + "grad_norm": 0.463631788461443, + "learning_rate": 1.3542738382346593e-06, + "loss": 0.5287, + "step": 14757 + }, + { + "epoch": 3.917828222487721, + "grad_norm": 0.45519936138630906, + "learning_rate": 1.353963543648618e-06, + "loss": 0.5065, + "step": 14758 + }, + { + "epoch": 3.9180937209611044, + "grad_norm": 0.4549791289533537, + "learning_rate": 1.353653271413095e-06, + "loss": 0.5528, + "step": 14759 + }, + { + "epoch": 3.918359219434488, + "grad_norm": 0.47460011697691484, + "learning_rate": 1.3533430215341387e-06, + "loss": 0.5686, + "step": 14760 + }, + { + "epoch": 3.918624717907872, + "grad_norm": 0.4656560956477604, + "learning_rate": 1.353032794017802e-06, + "loss": 0.5851, + "step": 14761 + }, + { + "epoch": 3.918890216381256, + "grad_norm": 0.47292410819616754, + "learning_rate": 1.3527225888701334e-06, + "loss": 0.5917, + "step": 14762 + }, + { + "epoch": 3.9191557148546394, + "grad_norm": 0.47564106618319935, + "learning_rate": 1.352412406097185e-06, + "loss": 0.5342, + "step": 14763 + }, + { + "epoch": 3.9194212133280235, + "grad_norm": 0.4676455558627603, + "learning_rate": 1.3521022457050026e-06, + "loss": 0.5223, + "step": 14764 + }, + { + "epoch": 3.919686711801407, + "grad_norm": 0.4618649659818899, + "learning_rate": 1.3517921076996375e-06, + "loss": 0.5328, + "step": 14765 + }, + { + "epoch": 3.919952210274791, + "grad_norm": 0.47475299638691565, + "learning_rate": 1.3514819920871372e-06, + "loss": 0.5189, + "step": 14766 + }, + { + "epoch": 3.920217708748175, + "grad_norm": 0.4695573020525525, + "learning_rate": 1.3511718988735508e-06, + "loss": 0.5367, + "step": 14767 + }, + { + "epoch": 3.9204832072215585, + "grad_norm": 0.47557159012312633, + "learning_rate": 1.3508618280649256e-06, + "loss": 0.5404, + "step": 14768 + }, + { + "epoch": 3.920748705694942, + "grad_norm": 0.45567423353409175, + "learning_rate": 1.3505517796673069e-06, + "loss": 0.5524, + "step": 14769 + }, + { + "epoch": 3.9210142041683262, + "grad_norm": 0.4498274454675176, + "learning_rate": 1.350241753686744e-06, + "loss": 0.5504, + "step": 14770 + }, + { + "epoch": 3.92127970264171, + "grad_norm": 0.4709544892520749, + "learning_rate": 1.3499317501292807e-06, + "loss": 0.5184, + "step": 14771 + }, + { + "epoch": 3.9215452011150935, + "grad_norm": 0.46477447907187447, + "learning_rate": 1.3496217690009655e-06, + "loss": 0.5624, + "step": 14772 + }, + { + "epoch": 3.9218106995884776, + "grad_norm": 0.4641581477946725, + "learning_rate": 1.3493118103078412e-06, + "loss": 0.5492, + "step": 14773 + }, + { + "epoch": 3.9220761980618613, + "grad_norm": 0.4661662264609392, + "learning_rate": 1.3490018740559551e-06, + "loss": 0.5399, + "step": 14774 + }, + { + "epoch": 3.922341696535245, + "grad_norm": 0.4507233777564372, + "learning_rate": 1.3486919602513504e-06, + "loss": 0.5526, + "step": 14775 + }, + { + "epoch": 3.922607195008629, + "grad_norm": 0.45844797168259865, + "learning_rate": 1.3483820689000706e-06, + "loss": 0.5573, + "step": 14776 + }, + { + "epoch": 3.9228726934820126, + "grad_norm": 0.46032682770825356, + "learning_rate": 1.3480722000081612e-06, + "loss": 0.5215, + "step": 14777 + }, + { + "epoch": 3.9231381919553963, + "grad_norm": 0.47644789528767656, + "learning_rate": 1.347762353581663e-06, + "loss": 0.5343, + "step": 14778 + }, + { + "epoch": 3.92340369042878, + "grad_norm": 0.4741474173426853, + "learning_rate": 1.347452529626621e-06, + "loss": 0.5337, + "step": 14779 + }, + { + "epoch": 3.9236691889021635, + "grad_norm": 0.4470769954143039, + "learning_rate": 1.3471427281490767e-06, + "loss": 0.5311, + "step": 14780 + }, + { + "epoch": 3.9239346873755476, + "grad_norm": 0.47366138218918785, + "learning_rate": 1.3468329491550722e-06, + "loss": 0.556, + "step": 14781 + }, + { + "epoch": 3.9242001858489313, + "grad_norm": 0.4716855584503836, + "learning_rate": 1.3465231926506473e-06, + "loss": 0.5453, + "step": 14782 + }, + { + "epoch": 3.924465684322315, + "grad_norm": 0.45839694715983687, + "learning_rate": 1.3462134586418454e-06, + "loss": 0.5579, + "step": 14783 + }, + { + "epoch": 3.924731182795699, + "grad_norm": 0.4626636308750909, + "learning_rate": 1.3459037471347054e-06, + "loss": 0.5474, + "step": 14784 + }, + { + "epoch": 3.9249966812690826, + "grad_norm": 0.46704971921087174, + "learning_rate": 1.3455940581352683e-06, + "loss": 0.5748, + "step": 14785 + }, + { + "epoch": 3.9252621797424663, + "grad_norm": 0.4697042045144857, + "learning_rate": 1.345284391649574e-06, + "loss": 0.5486, + "step": 14786 + }, + { + "epoch": 3.9255276782158504, + "grad_norm": 0.46126506591276045, + "learning_rate": 1.3449747476836602e-06, + "loss": 0.5347, + "step": 14787 + }, + { + "epoch": 3.925793176689234, + "grad_norm": 0.4645857555559977, + "learning_rate": 1.3446651262435679e-06, + "loss": 0.5461, + "step": 14788 + }, + { + "epoch": 3.9260586751626176, + "grad_norm": 0.45783179615857134, + "learning_rate": 1.344355527335333e-06, + "loss": 0.5377, + "step": 14789 + }, + { + "epoch": 3.9263241736360017, + "grad_norm": 0.4597795593447878, + "learning_rate": 1.3440459509649967e-06, + "loss": 0.5544, + "step": 14790 + }, + { + "epoch": 3.9265896721093854, + "grad_norm": 0.49152681822810573, + "learning_rate": 1.3437363971385938e-06, + "loss": 0.5651, + "step": 14791 + }, + { + "epoch": 3.926855170582769, + "grad_norm": 0.4806215672257861, + "learning_rate": 1.3434268658621625e-06, + "loss": 0.5487, + "step": 14792 + }, + { + "epoch": 3.927120669056153, + "grad_norm": 0.47156812211848087, + "learning_rate": 1.3431173571417381e-06, + "loss": 0.5393, + "step": 14793 + }, + { + "epoch": 3.9273861675295367, + "grad_norm": 0.4699178651649868, + "learning_rate": 1.3428078709833587e-06, + "loss": 0.5408, + "step": 14794 + }, + { + "epoch": 3.9276516660029204, + "grad_norm": 0.45337841094996384, + "learning_rate": 1.3424984073930581e-06, + "loss": 0.5187, + "step": 14795 + }, + { + "epoch": 3.9279171644763045, + "grad_norm": 0.4676407777429846, + "learning_rate": 1.3421889663768725e-06, + "loss": 0.5569, + "step": 14796 + }, + { + "epoch": 3.928182662949688, + "grad_norm": 0.47774782040438807, + "learning_rate": 1.3418795479408391e-06, + "loss": 0.538, + "step": 14797 + }, + { + "epoch": 3.9284481614230717, + "grad_norm": 0.4696848613453608, + "learning_rate": 1.341570152090988e-06, + "loss": 0.5259, + "step": 14798 + }, + { + "epoch": 3.928713659896456, + "grad_norm": 0.47483248781232507, + "learning_rate": 1.3412607788333561e-06, + "loss": 0.5616, + "step": 14799 + }, + { + "epoch": 3.9289791583698395, + "grad_norm": 0.47468798862305933, + "learning_rate": 1.3409514281739752e-06, + "loss": 0.5367, + "step": 14800 + }, + { + "epoch": 3.929244656843223, + "grad_norm": 0.4693627597834553, + "learning_rate": 1.3406421001188804e-06, + "loss": 0.5768, + "step": 14801 + }, + { + "epoch": 3.929510155316607, + "grad_norm": 0.45419436746230885, + "learning_rate": 1.3403327946741023e-06, + "loss": 0.5147, + "step": 14802 + }, + { + "epoch": 3.929775653789991, + "grad_norm": 0.4703471378904216, + "learning_rate": 1.3400235118456747e-06, + "loss": 0.5283, + "step": 14803 + }, + { + "epoch": 3.9300411522633745, + "grad_norm": 0.47465765235788776, + "learning_rate": 1.3397142516396289e-06, + "loss": 0.5827, + "step": 14804 + }, + { + "epoch": 3.930306650736758, + "grad_norm": 0.45880215706656197, + "learning_rate": 1.3394050140619952e-06, + "loss": 0.5509, + "step": 14805 + }, + { + "epoch": 3.930572149210142, + "grad_norm": 0.47061397682799333, + "learning_rate": 1.3390957991188065e-06, + "loss": 0.5093, + "step": 14806 + }, + { + "epoch": 3.930837647683526, + "grad_norm": 0.45233954420363803, + "learning_rate": 1.3387866068160907e-06, + "loss": 0.5772, + "step": 14807 + }, + { + "epoch": 3.9311031461569095, + "grad_norm": 0.4668690771195901, + "learning_rate": 1.3384774371598802e-06, + "loss": 0.5532, + "step": 14808 + }, + { + "epoch": 3.931368644630293, + "grad_norm": 0.4758249101545327, + "learning_rate": 1.3381682901562038e-06, + "loss": 0.5659, + "step": 14809 + }, + { + "epoch": 3.931634143103677, + "grad_norm": 0.4665533378575868, + "learning_rate": 1.3378591658110907e-06, + "loss": 0.5389, + "step": 14810 + }, + { + "epoch": 3.931899641577061, + "grad_norm": 0.48120507322810896, + "learning_rate": 1.3375500641305678e-06, + "loss": 0.5306, + "step": 14811 + }, + { + "epoch": 3.9321651400504445, + "grad_norm": 0.46340656197819297, + "learning_rate": 1.3372409851206658e-06, + "loss": 0.5434, + "step": 14812 + }, + { + "epoch": 3.9324306385238286, + "grad_norm": 0.46759445951567835, + "learning_rate": 1.3369319287874106e-06, + "loss": 0.5323, + "step": 14813 + }, + { + "epoch": 3.932696136997212, + "grad_norm": 0.4651530615615993, + "learning_rate": 1.3366228951368316e-06, + "loss": 0.5463, + "step": 14814 + }, + { + "epoch": 3.932961635470596, + "grad_norm": 0.4687370250929926, + "learning_rate": 1.3363138841749546e-06, + "loss": 0.5649, + "step": 14815 + }, + { + "epoch": 3.93322713394398, + "grad_norm": 0.46066991619179465, + "learning_rate": 1.3360048959078049e-06, + "loss": 0.5304, + "step": 14816 + }, + { + "epoch": 3.9334926324173636, + "grad_norm": 0.47275272449684125, + "learning_rate": 1.3356959303414109e-06, + "loss": 0.5398, + "step": 14817 + }, + { + "epoch": 3.933758130890747, + "grad_norm": 0.47817314307800457, + "learning_rate": 1.3353869874817956e-06, + "loss": 0.5679, + "step": 14818 + }, + { + "epoch": 3.9340236293641313, + "grad_norm": 0.4664786438407297, + "learning_rate": 1.3350780673349866e-06, + "loss": 0.5856, + "step": 14819 + }, + { + "epoch": 3.934289127837515, + "grad_norm": 0.4720284762677622, + "learning_rate": 1.3347691699070077e-06, + "loss": 0.5802, + "step": 14820 + }, + { + "epoch": 3.9345546263108986, + "grad_norm": 0.48540750881622635, + "learning_rate": 1.334460295203882e-06, + "loss": 0.5127, + "step": 14821 + }, + { + "epoch": 3.9348201247842827, + "grad_norm": 0.4601365605090677, + "learning_rate": 1.3341514432316351e-06, + "loss": 0.5423, + "step": 14822 + }, + { + "epoch": 3.9350856232576663, + "grad_norm": 0.4896311309951293, + "learning_rate": 1.3338426139962888e-06, + "loss": 0.5681, + "step": 14823 + }, + { + "epoch": 3.93535112173105, + "grad_norm": 0.4595978496339517, + "learning_rate": 1.3335338075038672e-06, + "loss": 0.5158, + "step": 14824 + }, + { + "epoch": 3.935616620204434, + "grad_norm": 0.45037210659350246, + "learning_rate": 1.333225023760392e-06, + "loss": 0.5377, + "step": 14825 + }, + { + "epoch": 3.9358821186778177, + "grad_norm": 0.4667936830525237, + "learning_rate": 1.3329162627718872e-06, + "loss": 0.564, + "step": 14826 + }, + { + "epoch": 3.9361476171512013, + "grad_norm": 0.47828288652236456, + "learning_rate": 1.3326075245443709e-06, + "loss": 0.542, + "step": 14827 + }, + { + "epoch": 3.9364131156245854, + "grad_norm": 0.4642667444604403, + "learning_rate": 1.3322988090838672e-06, + "loss": 0.5373, + "step": 14828 + }, + { + "epoch": 3.936678614097969, + "grad_norm": 0.47312198338014666, + "learning_rate": 1.331990116396395e-06, + "loss": 0.518, + "step": 14829 + }, + { + "epoch": 3.9369441125713527, + "grad_norm": 0.4551803703513252, + "learning_rate": 1.3316814464879756e-06, + "loss": 0.5149, + "step": 14830 + }, + { + "epoch": 3.9372096110447368, + "grad_norm": 0.4794636626720055, + "learning_rate": 1.331372799364628e-06, + "loss": 0.5473, + "step": 14831 + }, + { + "epoch": 3.9374751095181204, + "grad_norm": 0.4654946039303813, + "learning_rate": 1.3310641750323732e-06, + "loss": 0.5383, + "step": 14832 + }, + { + "epoch": 3.937740607991504, + "grad_norm": 0.4664094002493494, + "learning_rate": 1.3307555734972288e-06, + "loss": 0.5724, + "step": 14833 + }, + { + "epoch": 3.9380061064648877, + "grad_norm": 0.4765294250005343, + "learning_rate": 1.3304469947652127e-06, + "loss": 0.5212, + "step": 14834 + }, + { + "epoch": 3.9382716049382713, + "grad_norm": 0.45930543403932333, + "learning_rate": 1.330138438842345e-06, + "loss": 0.5675, + "step": 14835 + }, + { + "epoch": 3.9385371034116554, + "grad_norm": 0.4677371969994075, + "learning_rate": 1.3298299057346407e-06, + "loss": 0.5308, + "step": 14836 + }, + { + "epoch": 3.938802601885039, + "grad_norm": 0.48130753586316255, + "learning_rate": 1.3295213954481193e-06, + "loss": 0.5784, + "step": 14837 + }, + { + "epoch": 3.9390681003584227, + "grad_norm": 0.4631457997059472, + "learning_rate": 1.3292129079887967e-06, + "loss": 0.5505, + "step": 14838 + }, + { + "epoch": 3.939333598831807, + "grad_norm": 0.46810729642748544, + "learning_rate": 1.3289044433626885e-06, + "loss": 0.549, + "step": 14839 + }, + { + "epoch": 3.9395990973051904, + "grad_norm": 0.497236890113771, + "learning_rate": 1.3285960015758103e-06, + "loss": 0.5839, + "step": 14840 + }, + { + "epoch": 3.939864595778574, + "grad_norm": 0.47780628487422905, + "learning_rate": 1.3282875826341786e-06, + "loss": 0.5642, + "step": 14841 + }, + { + "epoch": 3.940130094251958, + "grad_norm": 0.47180685368458763, + "learning_rate": 1.3279791865438082e-06, + "loss": 0.5336, + "step": 14842 + }, + { + "epoch": 3.940395592725342, + "grad_norm": 0.46569956576017574, + "learning_rate": 1.3276708133107136e-06, + "loss": 0.5616, + "step": 14843 + }, + { + "epoch": 3.9406610911987254, + "grad_norm": 0.48243148254923435, + "learning_rate": 1.3273624629409081e-06, + "loss": 0.5652, + "step": 14844 + }, + { + "epoch": 3.9409265896721095, + "grad_norm": 0.4682108082491541, + "learning_rate": 1.3270541354404054e-06, + "loss": 0.5018, + "step": 14845 + }, + { + "epoch": 3.941192088145493, + "grad_norm": 0.4657765859010105, + "learning_rate": 1.3267458308152192e-06, + "loss": 0.5737, + "step": 14846 + }, + { + "epoch": 3.941457586618877, + "grad_norm": 0.47517881991646244, + "learning_rate": 1.3264375490713615e-06, + "loss": 0.5452, + "step": 14847 + }, + { + "epoch": 3.941723085092261, + "grad_norm": 0.4720445690260556, + "learning_rate": 1.3261292902148457e-06, + "loss": 0.5613, + "step": 14848 + }, + { + "epoch": 3.9419885835656445, + "grad_norm": 0.46282798434916744, + "learning_rate": 1.3258210542516831e-06, + "loss": 0.529, + "step": 14849 + }, + { + "epoch": 3.942254082039028, + "grad_norm": 0.45637987901777416, + "learning_rate": 1.3255128411878835e-06, + "loss": 0.5737, + "step": 14850 + }, + { + "epoch": 3.9425195805124122, + "grad_norm": 0.46718630484378726, + "learning_rate": 1.3252046510294603e-06, + "loss": 0.5242, + "step": 14851 + }, + { + "epoch": 3.942785078985796, + "grad_norm": 0.45520849313014605, + "learning_rate": 1.3248964837824218e-06, + "loss": 0.5079, + "step": 14852 + }, + { + "epoch": 3.9430505774591795, + "grad_norm": 0.46631529817604855, + "learning_rate": 1.3245883394527797e-06, + "loss": 0.5518, + "step": 14853 + }, + { + "epoch": 3.9433160759325636, + "grad_norm": 0.468082249143158, + "learning_rate": 1.3242802180465425e-06, + "loss": 0.5505, + "step": 14854 + }, + { + "epoch": 3.9435815744059473, + "grad_norm": 0.4736161409528284, + "learning_rate": 1.3239721195697215e-06, + "loss": 0.5504, + "step": 14855 + }, + { + "epoch": 3.943847072879331, + "grad_norm": 0.46790978106992953, + "learning_rate": 1.3236640440283211e-06, + "loss": 0.5733, + "step": 14856 + }, + { + "epoch": 3.944112571352715, + "grad_norm": 0.4571746779654453, + "learning_rate": 1.3233559914283533e-06, + "loss": 0.5464, + "step": 14857 + }, + { + "epoch": 3.9443780698260986, + "grad_norm": 0.47775425692843776, + "learning_rate": 1.3230479617758238e-06, + "loss": 0.591, + "step": 14858 + }, + { + "epoch": 3.9446435682994823, + "grad_norm": 0.46886693288058695, + "learning_rate": 1.3227399550767417e-06, + "loss": 0.5693, + "step": 14859 + }, + { + "epoch": 3.944909066772866, + "grad_norm": 0.46013332659341644, + "learning_rate": 1.3224319713371118e-06, + "loss": 0.5235, + "step": 14860 + }, + { + "epoch": 3.94517456524625, + "grad_norm": 0.47029919247162005, + "learning_rate": 1.3221240105629424e-06, + "loss": 0.5676, + "step": 14861 + }, + { + "epoch": 3.9454400637196336, + "grad_norm": 0.44522102222662774, + "learning_rate": 1.3218160727602392e-06, + "loss": 0.5397, + "step": 14862 + }, + { + "epoch": 3.9457055621930173, + "grad_norm": 0.47232099266320243, + "learning_rate": 1.3215081579350058e-06, + "loss": 0.5517, + "step": 14863 + }, + { + "epoch": 3.945971060666401, + "grad_norm": 0.4769886166124681, + "learning_rate": 1.32120026609325e-06, + "loss": 0.5228, + "step": 14864 + }, + { + "epoch": 3.946236559139785, + "grad_norm": 0.45065780731769306, + "learning_rate": 1.3208923972409743e-06, + "loss": 0.5311, + "step": 14865 + }, + { + "epoch": 3.9465020576131686, + "grad_norm": 0.46535236450105755, + "learning_rate": 1.3205845513841848e-06, + "loss": 0.5565, + "step": 14866 + }, + { + "epoch": 3.9467675560865523, + "grad_norm": 0.4617883519259185, + "learning_rate": 1.3202767285288841e-06, + "loss": 0.5249, + "step": 14867 + }, + { + "epoch": 3.9470330545599364, + "grad_norm": 0.4476884485491734, + "learning_rate": 1.3199689286810746e-06, + "loss": 0.5244, + "step": 14868 + }, + { + "epoch": 3.94729855303332, + "grad_norm": 0.4544271027766419, + "learning_rate": 1.3196611518467613e-06, + "loss": 0.5102, + "step": 14869 + }, + { + "epoch": 3.9475640515067036, + "grad_norm": 0.4662108009250808, + "learning_rate": 1.3193533980319445e-06, + "loss": 0.5111, + "step": 14870 + }, + { + "epoch": 3.9478295499800877, + "grad_norm": 0.4597279757957401, + "learning_rate": 1.3190456672426283e-06, + "loss": 0.5281, + "step": 14871 + }, + { + "epoch": 3.9480950484534714, + "grad_norm": 0.48514579912030736, + "learning_rate": 1.3187379594848132e-06, + "loss": 0.547, + "step": 14872 + }, + { + "epoch": 3.948360546926855, + "grad_norm": 0.46871236579097586, + "learning_rate": 1.3184302747644998e-06, + "loss": 0.5188, + "step": 14873 + }, + { + "epoch": 3.948626045400239, + "grad_norm": 0.4611095522184463, + "learning_rate": 1.318122613087688e-06, + "loss": 0.5167, + "step": 14874 + }, + { + "epoch": 3.9488915438736227, + "grad_norm": 0.4490969768247677, + "learning_rate": 1.3178149744603802e-06, + "loss": 0.5404, + "step": 14875 + }, + { + "epoch": 3.9491570423470064, + "grad_norm": 0.4764462021797949, + "learning_rate": 1.3175073588885737e-06, + "loss": 0.5569, + "step": 14876 + }, + { + "epoch": 3.9494225408203905, + "grad_norm": 0.4723204586935236, + "learning_rate": 1.31719976637827e-06, + "loss": 0.5439, + "step": 14877 + }, + { + "epoch": 3.949688039293774, + "grad_norm": 0.45915051903743664, + "learning_rate": 1.316892196935467e-06, + "loss": 0.5253, + "step": 14878 + }, + { + "epoch": 3.9499535377671577, + "grad_norm": 0.4696321657211413, + "learning_rate": 1.3165846505661617e-06, + "loss": 0.5399, + "step": 14879 + }, + { + "epoch": 3.950219036240542, + "grad_norm": 0.46455405135823113, + "learning_rate": 1.3162771272763542e-06, + "loss": 0.5546, + "step": 14880 + }, + { + "epoch": 3.9504845347139255, + "grad_norm": 0.48124207104601524, + "learning_rate": 1.3159696270720402e-06, + "loss": 0.5493, + "step": 14881 + }, + { + "epoch": 3.950750033187309, + "grad_norm": 0.46823800725655346, + "learning_rate": 1.3156621499592182e-06, + "loss": 0.5516, + "step": 14882 + }, + { + "epoch": 3.951015531660693, + "grad_norm": 0.4669344561071254, + "learning_rate": 1.3153546959438831e-06, + "loss": 0.5164, + "step": 14883 + }, + { + "epoch": 3.951281030134077, + "grad_norm": 0.46787232763637854, + "learning_rate": 1.3150472650320339e-06, + "loss": 0.5513, + "step": 14884 + }, + { + "epoch": 3.9515465286074605, + "grad_norm": 0.47087851483888055, + "learning_rate": 1.3147398572296628e-06, + "loss": 0.5198, + "step": 14885 + }, + { + "epoch": 3.9518120270808446, + "grad_norm": 0.4630743249748499, + "learning_rate": 1.314432472542766e-06, + "loss": 0.5308, + "step": 14886 + }, + { + "epoch": 3.952077525554228, + "grad_norm": 0.4823119321843204, + "learning_rate": 1.3141251109773395e-06, + "loss": 0.5175, + "step": 14887 + }, + { + "epoch": 3.952343024027612, + "grad_norm": 0.473496364292403, + "learning_rate": 1.3138177725393763e-06, + "loss": 0.568, + "step": 14888 + }, + { + "epoch": 3.9526085225009955, + "grad_norm": 0.46097811089796303, + "learning_rate": 1.3135104572348728e-06, + "loss": 0.5209, + "step": 14889 + }, + { + "epoch": 3.952874020974379, + "grad_norm": 0.47341156683093305, + "learning_rate": 1.3132031650698185e-06, + "loss": 0.5668, + "step": 14890 + }, + { + "epoch": 3.953139519447763, + "grad_norm": 0.4823496185058247, + "learning_rate": 1.3128958960502093e-06, + "loss": 0.5578, + "step": 14891 + }, + { + "epoch": 3.953405017921147, + "grad_norm": 0.46678230106189345, + "learning_rate": 1.312588650182035e-06, + "loss": 0.5444, + "step": 14892 + }, + { + "epoch": 3.9536705163945305, + "grad_norm": 0.47024215035505973, + "learning_rate": 1.3122814274712908e-06, + "loss": 0.541, + "step": 14893 + }, + { + "epoch": 3.9539360148679146, + "grad_norm": 0.47079368322296195, + "learning_rate": 1.3119742279239659e-06, + "loss": 0.5531, + "step": 14894 + }, + { + "epoch": 3.954201513341298, + "grad_norm": 0.4667568399265366, + "learning_rate": 1.3116670515460534e-06, + "loss": 0.5166, + "step": 14895 + }, + { + "epoch": 3.954467011814682, + "grad_norm": 0.4520028673063646, + "learning_rate": 1.3113598983435427e-06, + "loss": 0.5665, + "step": 14896 + }, + { + "epoch": 3.954732510288066, + "grad_norm": 0.4652620205820074, + "learning_rate": 1.3110527683224233e-06, + "loss": 0.5506, + "step": 14897 + }, + { + "epoch": 3.9549980087614496, + "grad_norm": 0.4703129809452641, + "learning_rate": 1.310745661488687e-06, + "loss": 0.5401, + "step": 14898 + }, + { + "epoch": 3.955263507234833, + "grad_norm": 0.4537487879806315, + "learning_rate": 1.310438577848321e-06, + "loss": 0.5349, + "step": 14899 + }, + { + "epoch": 3.9555290057082173, + "grad_norm": 0.47308000784751225, + "learning_rate": 1.3101315174073162e-06, + "loss": 0.552, + "step": 14900 + }, + { + "epoch": 3.955794504181601, + "grad_norm": 0.4599804241241482, + "learning_rate": 1.3098244801716603e-06, + "loss": 0.5603, + "step": 14901 + }, + { + "epoch": 3.9560600026549846, + "grad_norm": 0.4780432011214476, + "learning_rate": 1.309517466147341e-06, + "loss": 0.5734, + "step": 14902 + }, + { + "epoch": 3.9563255011283687, + "grad_norm": 0.468186307602429, + "learning_rate": 1.3092104753403451e-06, + "loss": 0.5691, + "step": 14903 + }, + { + "epoch": 3.9565909996017523, + "grad_norm": 0.4796090794452198, + "learning_rate": 1.3089035077566615e-06, + "loss": 0.5492, + "step": 14904 + }, + { + "epoch": 3.956856498075136, + "grad_norm": 0.47630735178455125, + "learning_rate": 1.3085965634022746e-06, + "loss": 0.5589, + "step": 14905 + }, + { + "epoch": 3.95712199654852, + "grad_norm": 0.49052019406847824, + "learning_rate": 1.3082896422831726e-06, + "loss": 0.541, + "step": 14906 + }, + { + "epoch": 3.9573874950219037, + "grad_norm": 0.4753799756949209, + "learning_rate": 1.3079827444053405e-06, + "loss": 0.5609, + "step": 14907 + }, + { + "epoch": 3.9576529934952873, + "grad_norm": 0.46578493267809273, + "learning_rate": 1.3076758697747626e-06, + "loss": 0.5229, + "step": 14908 + }, + { + "epoch": 3.9579184919686714, + "grad_norm": 0.4879936706424716, + "learning_rate": 1.3073690183974258e-06, + "loss": 0.5238, + "step": 14909 + }, + { + "epoch": 3.958183990442055, + "grad_norm": 0.45275372421366017, + "learning_rate": 1.3070621902793119e-06, + "loss": 0.5357, + "step": 14910 + }, + { + "epoch": 3.9584494889154387, + "grad_norm": 0.47478716552696326, + "learning_rate": 1.306755385426407e-06, + "loss": 0.5709, + "step": 14911 + }, + { + "epoch": 3.9587149873888228, + "grad_norm": 0.4625179402739653, + "learning_rate": 1.3064486038446938e-06, + "loss": 0.5445, + "step": 14912 + }, + { + "epoch": 3.9589804858622064, + "grad_norm": 0.46130592873113957, + "learning_rate": 1.3061418455401541e-06, + "loss": 0.5689, + "step": 14913 + }, + { + "epoch": 3.95924598433559, + "grad_norm": 0.4688052922858833, + "learning_rate": 1.3058351105187727e-06, + "loss": 0.5602, + "step": 14914 + }, + { + "epoch": 3.9595114828089737, + "grad_norm": 0.4755720544845919, + "learning_rate": 1.305528398786529e-06, + "loss": 0.5277, + "step": 14915 + }, + { + "epoch": 3.9597769812823578, + "grad_norm": 0.4722655047576312, + "learning_rate": 1.3052217103494074e-06, + "loss": 0.5287, + "step": 14916 + }, + { + "epoch": 3.9600424797557414, + "grad_norm": 0.4686952058619256, + "learning_rate": 1.304915045213387e-06, + "loss": 0.5177, + "step": 14917 + }, + { + "epoch": 3.960307978229125, + "grad_norm": 0.4668946603789969, + "learning_rate": 1.3046084033844507e-06, + "loss": 0.5323, + "step": 14918 + }, + { + "epoch": 3.9605734767025087, + "grad_norm": 0.47850541577127204, + "learning_rate": 1.3043017848685758e-06, + "loss": 0.5769, + "step": 14919 + }, + { + "epoch": 3.960838975175893, + "grad_norm": 0.4711221477497711, + "learning_rate": 1.3039951896717445e-06, + "loss": 0.5267, + "step": 14920 + }, + { + "epoch": 3.9611044736492764, + "grad_norm": 0.46058899888001475, + "learning_rate": 1.3036886177999347e-06, + "loss": 0.5534, + "step": 14921 + }, + { + "epoch": 3.96136997212266, + "grad_norm": 0.45203158357220796, + "learning_rate": 1.3033820692591264e-06, + "loss": 0.5408, + "step": 14922 + }, + { + "epoch": 3.961635470596044, + "grad_norm": 0.46265349759709323, + "learning_rate": 1.3030755440552967e-06, + "loss": 0.5162, + "step": 14923 + }, + { + "epoch": 3.961900969069428, + "grad_norm": 0.47067238410677087, + "learning_rate": 1.3027690421944256e-06, + "loss": 0.5659, + "step": 14924 + }, + { + "epoch": 3.9621664675428114, + "grad_norm": 0.46148835914390723, + "learning_rate": 1.3024625636824894e-06, + "loss": 0.4911, + "step": 14925 + }, + { + "epoch": 3.9624319660161955, + "grad_norm": 0.46504065583258425, + "learning_rate": 1.3021561085254643e-06, + "loss": 0.5517, + "step": 14926 + }, + { + "epoch": 3.962697464489579, + "grad_norm": 0.46869581892077433, + "learning_rate": 1.3018496767293292e-06, + "loss": 0.5475, + "step": 14927 + }, + { + "epoch": 3.962962962962963, + "grad_norm": 0.46919287656944525, + "learning_rate": 1.3015432683000578e-06, + "loss": 0.5306, + "step": 14928 + }, + { + "epoch": 3.963228461436347, + "grad_norm": 0.485851738415879, + "learning_rate": 1.3012368832436278e-06, + "loss": 0.5858, + "step": 14929 + }, + { + "epoch": 3.9634939599097305, + "grad_norm": 0.4732987072397958, + "learning_rate": 1.3009305215660139e-06, + "loss": 0.5535, + "step": 14930 + }, + { + "epoch": 3.963759458383114, + "grad_norm": 0.450174190873984, + "learning_rate": 1.3006241832731902e-06, + "loss": 0.5323, + "step": 14931 + }, + { + "epoch": 3.9640249568564982, + "grad_norm": 0.46949930360077097, + "learning_rate": 1.300317868371131e-06, + "loss": 0.5209, + "step": 14932 + }, + { + "epoch": 3.964290455329882, + "grad_norm": 0.4676639960211, + "learning_rate": 1.3000115768658105e-06, + "loss": 0.5845, + "step": 14933 + }, + { + "epoch": 3.9645559538032655, + "grad_norm": 0.4719369244599207, + "learning_rate": 1.2997053087632034e-06, + "loss": 0.5388, + "step": 14934 + }, + { + "epoch": 3.9648214522766496, + "grad_norm": 0.470898942578198, + "learning_rate": 1.2993990640692805e-06, + "loss": 0.526, + "step": 14935 + }, + { + "epoch": 3.9650869507500333, + "grad_norm": 0.466199010049318, + "learning_rate": 1.2990928427900175e-06, + "loss": 0.5942, + "step": 14936 + }, + { + "epoch": 3.965352449223417, + "grad_norm": 0.47872100441615095, + "learning_rate": 1.2987866449313824e-06, + "loss": 0.5379, + "step": 14937 + }, + { + "epoch": 3.965617947696801, + "grad_norm": 0.4752506107634398, + "learning_rate": 1.2984804704993499e-06, + "loss": 0.5446, + "step": 14938 + }, + { + "epoch": 3.9658834461701846, + "grad_norm": 0.4659055309987489, + "learning_rate": 1.2981743194998892e-06, + "loss": 0.5641, + "step": 14939 + }, + { + "epoch": 3.9661489446435683, + "grad_norm": 0.48334892369584376, + "learning_rate": 1.2978681919389725e-06, + "loss": 0.5439, + "step": 14940 + }, + { + "epoch": 3.9664144431169523, + "grad_norm": 0.46241064877041416, + "learning_rate": 1.2975620878225692e-06, + "loss": 0.5573, + "step": 14941 + }, + { + "epoch": 3.966679941590336, + "grad_norm": 0.46948984537769817, + "learning_rate": 1.2972560071566486e-06, + "loss": 0.5261, + "step": 14942 + }, + { + "epoch": 3.9669454400637196, + "grad_norm": 0.4523405533912353, + "learning_rate": 1.2969499499471815e-06, + "loss": 0.5463, + "step": 14943 + }, + { + "epoch": 3.9672109385371033, + "grad_norm": 0.47618601543911376, + "learning_rate": 1.2966439162001353e-06, + "loss": 0.5717, + "step": 14944 + }, + { + "epoch": 3.9674764370104874, + "grad_norm": 0.46217312074765254, + "learning_rate": 1.2963379059214799e-06, + "loss": 0.5502, + "step": 14945 + }, + { + "epoch": 3.967741935483871, + "grad_norm": 0.4691137261536149, + "learning_rate": 1.2960319191171813e-06, + "loss": 0.5652, + "step": 14946 + }, + { + "epoch": 3.9680074339572546, + "grad_norm": 0.4852501491985933, + "learning_rate": 1.29572595579321e-06, + "loss": 0.5465, + "step": 14947 + }, + { + "epoch": 3.9682729324306383, + "grad_norm": 0.46379310370950805, + "learning_rate": 1.2954200159555294e-06, + "loss": 0.5306, + "step": 14948 + }, + { + "epoch": 3.9685384309040224, + "grad_norm": 0.47226271747624765, + "learning_rate": 1.2951140996101086e-06, + "loss": 0.5799, + "step": 14949 + }, + { + "epoch": 3.968803929377406, + "grad_norm": 0.47750893448024595, + "learning_rate": 1.294808206762912e-06, + "loss": 0.5478, + "step": 14950 + }, + { + "epoch": 3.9690694278507896, + "grad_norm": 0.47414409582408085, + "learning_rate": 1.2945023374199072e-06, + "loss": 0.5465, + "step": 14951 + }, + { + "epoch": 3.9693349263241737, + "grad_norm": 0.477569394229894, + "learning_rate": 1.2941964915870575e-06, + "loss": 0.5238, + "step": 14952 + }, + { + "epoch": 3.9696004247975574, + "grad_norm": 0.464624804336812, + "learning_rate": 1.2938906692703297e-06, + "loss": 0.5759, + "step": 14953 + }, + { + "epoch": 3.969865923270941, + "grad_norm": 0.4703693306690422, + "learning_rate": 1.2935848704756865e-06, + "loss": 0.5454, + "step": 14954 + }, + { + "epoch": 3.970131421744325, + "grad_norm": 0.454985436313049, + "learning_rate": 1.2932790952090912e-06, + "loss": 0.563, + "step": 14955 + }, + { + "epoch": 3.9703969202177087, + "grad_norm": 0.46422809659587905, + "learning_rate": 1.292973343476509e-06, + "loss": 0.5229, + "step": 14956 + }, + { + "epoch": 3.9706624186910924, + "grad_norm": 0.4841446626267229, + "learning_rate": 1.2926676152839014e-06, + "loss": 0.5696, + "step": 14957 + }, + { + "epoch": 3.9709279171644765, + "grad_norm": 0.4814343609958354, + "learning_rate": 1.292361910637232e-06, + "loss": 0.5497, + "step": 14958 + }, + { + "epoch": 3.97119341563786, + "grad_norm": 0.45941633575722013, + "learning_rate": 1.2920562295424622e-06, + "loss": 0.561, + "step": 14959 + }, + { + "epoch": 3.9714589141112437, + "grad_norm": 0.5050150602135764, + "learning_rate": 1.2917505720055522e-06, + "loss": 0.531, + "step": 14960 + }, + { + "epoch": 3.971724412584628, + "grad_norm": 0.47499322069921385, + "learning_rate": 1.2914449380324657e-06, + "loss": 0.5448, + "step": 14961 + }, + { + "epoch": 3.9719899110580115, + "grad_norm": 0.4837694118754429, + "learning_rate": 1.291139327629161e-06, + "loss": 0.5768, + "step": 14962 + }, + { + "epoch": 3.972255409531395, + "grad_norm": 0.4642834733446551, + "learning_rate": 1.2908337408016002e-06, + "loss": 0.5143, + "step": 14963 + }, + { + "epoch": 3.972520908004779, + "grad_norm": 0.45152121745984114, + "learning_rate": 1.290528177555742e-06, + "loss": 0.5475, + "step": 14964 + }, + { + "epoch": 3.972786406478163, + "grad_norm": 0.47164458270950205, + "learning_rate": 1.2902226378975451e-06, + "loss": 0.5346, + "step": 14965 + }, + { + "epoch": 3.9730519049515465, + "grad_norm": 0.49289374882653286, + "learning_rate": 1.2899171218329687e-06, + "loss": 0.5592, + "step": 14966 + }, + { + "epoch": 3.9733174034249306, + "grad_norm": 0.4620583919626006, + "learning_rate": 1.2896116293679717e-06, + "loss": 0.5454, + "step": 14967 + }, + { + "epoch": 3.973582901898314, + "grad_norm": 0.4750846373561971, + "learning_rate": 1.2893061605085105e-06, + "loss": 0.516, + "step": 14968 + }, + { + "epoch": 3.973848400371698, + "grad_norm": 0.4451672458331521, + "learning_rate": 1.2890007152605444e-06, + "loss": 0.5264, + "step": 14969 + }, + { + "epoch": 3.9741138988450815, + "grad_norm": 0.46290862552706813, + "learning_rate": 1.2886952936300295e-06, + "loss": 0.5601, + "step": 14970 + }, + { + "epoch": 3.9743793973184656, + "grad_norm": 0.47511836888988485, + "learning_rate": 1.2883898956229213e-06, + "loss": 0.6266, + "step": 14971 + }, + { + "epoch": 3.974644895791849, + "grad_norm": 0.46549697888156516, + "learning_rate": 1.2880845212451776e-06, + "loss": 0.547, + "step": 14972 + }, + { + "epoch": 3.974910394265233, + "grad_norm": 0.47350507962105876, + "learning_rate": 1.2877791705027518e-06, + "loss": 0.5252, + "step": 14973 + }, + { + "epoch": 3.9751758927386165, + "grad_norm": 0.47653616983064395, + "learning_rate": 1.2874738434016012e-06, + "loss": 0.5274, + "step": 14974 + }, + { + "epoch": 3.9754413912120006, + "grad_norm": 0.4801727463890366, + "learning_rate": 1.2871685399476785e-06, + "loss": 0.5597, + "step": 14975 + }, + { + "epoch": 3.975706889685384, + "grad_norm": 0.4916498242613426, + "learning_rate": 1.286863260146941e-06, + "loss": 0.5626, + "step": 14976 + }, + { + "epoch": 3.975972388158768, + "grad_norm": 0.47674732949084403, + "learning_rate": 1.2865580040053382e-06, + "loss": 0.5693, + "step": 14977 + }, + { + "epoch": 3.976237886632152, + "grad_norm": 0.47814701520184716, + "learning_rate": 1.286252771528825e-06, + "loss": 0.5456, + "step": 14978 + }, + { + "epoch": 3.9765033851055356, + "grad_norm": 0.46373599533236476, + "learning_rate": 1.2859475627233558e-06, + "loss": 0.5501, + "step": 14979 + }, + { + "epoch": 3.9767688835789192, + "grad_norm": 0.4740841867568828, + "learning_rate": 1.2856423775948807e-06, + "loss": 0.5269, + "step": 14980 + }, + { + "epoch": 3.9770343820523033, + "grad_norm": 0.47173562719011025, + "learning_rate": 1.2853372161493532e-06, + "loss": 0.5736, + "step": 14981 + }, + { + "epoch": 3.977299880525687, + "grad_norm": 0.4696597120038557, + "learning_rate": 1.2850320783927242e-06, + "loss": 0.5306, + "step": 14982 + }, + { + "epoch": 3.9775653789990706, + "grad_norm": 0.47349760083705583, + "learning_rate": 1.2847269643309444e-06, + "loss": 0.5907, + "step": 14983 + }, + { + "epoch": 3.9778308774724547, + "grad_norm": 0.4584450553247524, + "learning_rate": 1.2844218739699632e-06, + "loss": 0.5323, + "step": 14984 + }, + { + "epoch": 3.9780963759458383, + "grad_norm": 0.46733540990008776, + "learning_rate": 1.2841168073157328e-06, + "loss": 0.5579, + "step": 14985 + }, + { + "epoch": 3.978361874419222, + "grad_norm": 0.4630328162399076, + "learning_rate": 1.2838117643742007e-06, + "loss": 0.5377, + "step": 14986 + }, + { + "epoch": 3.978627372892606, + "grad_norm": 0.45743296090341, + "learning_rate": 1.2835067451513179e-06, + "loss": 0.5568, + "step": 14987 + }, + { + "epoch": 3.9788928713659897, + "grad_norm": 0.4764792511234045, + "learning_rate": 1.2832017496530319e-06, + "loss": 0.5369, + "step": 14988 + }, + { + "epoch": 3.9791583698393733, + "grad_norm": 0.4631299068274557, + "learning_rate": 1.2828967778852901e-06, + "loss": 0.5591, + "step": 14989 + }, + { + "epoch": 3.9794238683127574, + "grad_norm": 0.46736424303170726, + "learning_rate": 1.2825918298540425e-06, + "loss": 0.5734, + "step": 14990 + }, + { + "epoch": 3.979689366786141, + "grad_norm": 0.4596673588404188, + "learning_rate": 1.2822869055652332e-06, + "loss": 0.5685, + "step": 14991 + }, + { + "epoch": 3.9799548652595247, + "grad_norm": 0.4608041842215645, + "learning_rate": 1.2819820050248122e-06, + "loss": 0.5536, + "step": 14992 + }, + { + "epoch": 3.9802203637329088, + "grad_norm": 0.45844389242061234, + "learning_rate": 1.2816771282387241e-06, + "loss": 0.5664, + "step": 14993 + }, + { + "epoch": 3.9804858622062924, + "grad_norm": 0.46642773693792194, + "learning_rate": 1.2813722752129153e-06, + "loss": 0.5438, + "step": 14994 + }, + { + "epoch": 3.980751360679676, + "grad_norm": 0.47945142635099863, + "learning_rate": 1.2810674459533295e-06, + "loss": 0.6066, + "step": 14995 + }, + { + "epoch": 3.98101685915306, + "grad_norm": 0.47017956493759977, + "learning_rate": 1.2807626404659144e-06, + "loss": 0.5233, + "step": 14996 + }, + { + "epoch": 3.981282357626444, + "grad_norm": 0.46503565794306523, + "learning_rate": 1.2804578587566118e-06, + "loss": 0.5322, + "step": 14997 + }, + { + "epoch": 3.9815478560998274, + "grad_norm": 0.46604491644306034, + "learning_rate": 1.2801531008313666e-06, + "loss": 0.5379, + "step": 14998 + }, + { + "epoch": 3.981813354573211, + "grad_norm": 0.4813322584832262, + "learning_rate": 1.279848366696125e-06, + "loss": 0.5515, + "step": 14999 + }, + { + "epoch": 3.982078853046595, + "grad_norm": 0.47208275209553235, + "learning_rate": 1.2795436563568255e-06, + "loss": 0.5398, + "step": 15000 + }, + { + "epoch": 3.982344351519979, + "grad_norm": 0.46783170853735734, + "learning_rate": 1.2792389698194141e-06, + "loss": 0.5711, + "step": 15001 + }, + { + "epoch": 3.9826098499933624, + "grad_norm": 0.48102809369045024, + "learning_rate": 1.2789343070898308e-06, + "loss": 0.55, + "step": 15002 + }, + { + "epoch": 3.982875348466746, + "grad_norm": 0.44590997140997696, + "learning_rate": 1.2786296681740187e-06, + "loss": 0.4976, + "step": 15003 + }, + { + "epoch": 3.98314084694013, + "grad_norm": 0.46793353801810605, + "learning_rate": 1.278325053077918e-06, + "loss": 0.5645, + "step": 15004 + }, + { + "epoch": 3.983406345413514, + "grad_norm": 0.46773767470305805, + "learning_rate": 1.2780204618074707e-06, + "loss": 0.5299, + "step": 15005 + }, + { + "epoch": 3.9836718438868974, + "grad_norm": 0.476223994819842, + "learning_rate": 1.2777158943686164e-06, + "loss": 0.5807, + "step": 15006 + }, + { + "epoch": 3.9839373423602815, + "grad_norm": 0.46343285836038395, + "learning_rate": 1.2774113507672936e-06, + "loss": 0.5488, + "step": 15007 + }, + { + "epoch": 3.984202840833665, + "grad_norm": 0.4797275722706083, + "learning_rate": 1.2771068310094441e-06, + "loss": 0.5641, + "step": 15008 + }, + { + "epoch": 3.984468339307049, + "grad_norm": 0.48851460862904245, + "learning_rate": 1.2768023351010045e-06, + "loss": 0.5685, + "step": 15009 + }, + { + "epoch": 3.984733837780433, + "grad_norm": 0.4870580072755374, + "learning_rate": 1.2764978630479164e-06, + "loss": 0.5024, + "step": 15010 + }, + { + "epoch": 3.9849993362538165, + "grad_norm": 0.47268997662237905, + "learning_rate": 1.2761934148561133e-06, + "loss": 0.546, + "step": 15011 + }, + { + "epoch": 3.9852648347272, + "grad_norm": 0.4644151461883268, + "learning_rate": 1.2758889905315363e-06, + "loss": 0.5472, + "step": 15012 + }, + { + "epoch": 3.9855303332005843, + "grad_norm": 0.4591858538447491, + "learning_rate": 1.2755845900801201e-06, + "loss": 0.5828, + "step": 15013 + }, + { + "epoch": 3.985795831673968, + "grad_norm": 0.4801824961065117, + "learning_rate": 1.2752802135078032e-06, + "loss": 0.5231, + "step": 15014 + }, + { + "epoch": 3.9860613301473515, + "grad_norm": 0.4550210477955791, + "learning_rate": 1.27497586082052e-06, + "loss": 0.5197, + "step": 15015 + }, + { + "epoch": 3.9863268286207356, + "grad_norm": 0.4669918544578851, + "learning_rate": 1.2746715320242075e-06, + "loss": 0.5582, + "step": 15016 + }, + { + "epoch": 3.9865923270941193, + "grad_norm": 0.46599598017282245, + "learning_rate": 1.2743672271248004e-06, + "loss": 0.5379, + "step": 15017 + }, + { + "epoch": 3.986857825567503, + "grad_norm": 0.463843518734699, + "learning_rate": 1.2740629461282322e-06, + "loss": 0.554, + "step": 15018 + }, + { + "epoch": 3.987123324040887, + "grad_norm": 0.46760077274773915, + "learning_rate": 1.273758689040439e-06, + "loss": 0.5255, + "step": 15019 + }, + { + "epoch": 3.9873888225142706, + "grad_norm": 0.4580637302811964, + "learning_rate": 1.2734544558673533e-06, + "loss": 0.5602, + "step": 15020 + }, + { + "epoch": 3.9876543209876543, + "grad_norm": 0.456662076845957, + "learning_rate": 1.2731502466149094e-06, + "loss": 0.5572, + "step": 15021 + }, + { + "epoch": 3.9879198194610384, + "grad_norm": 0.47140501833254544, + "learning_rate": 1.2728460612890399e-06, + "loss": 0.5251, + "step": 15022 + }, + { + "epoch": 3.988185317934422, + "grad_norm": 0.4619975230201168, + "learning_rate": 1.272541899895675e-06, + "loss": 0.5672, + "step": 15023 + }, + { + "epoch": 3.9884508164078056, + "grad_norm": 0.47328048651979565, + "learning_rate": 1.27223776244075e-06, + "loss": 0.5725, + "step": 15024 + }, + { + "epoch": 3.9887163148811893, + "grad_norm": 0.4639850826698106, + "learning_rate": 1.2719336489301937e-06, + "loss": 0.5766, + "step": 15025 + }, + { + "epoch": 3.9889818133545734, + "grad_norm": 0.4516415805619876, + "learning_rate": 1.271629559369939e-06, + "loss": 0.5498, + "step": 15026 + }, + { + "epoch": 3.989247311827957, + "grad_norm": 0.4725439799571295, + "learning_rate": 1.2713254937659141e-06, + "loss": 0.5433, + "step": 15027 + }, + { + "epoch": 3.9895128103013406, + "grad_norm": 0.4716593087307472, + "learning_rate": 1.2710214521240527e-06, + "loss": 0.5836, + "step": 15028 + }, + { + "epoch": 3.9897783087747243, + "grad_norm": 0.47480872975186994, + "learning_rate": 1.2707174344502805e-06, + "loss": 0.5435, + "step": 15029 + }, + { + "epoch": 3.9900438072481084, + "grad_norm": 0.4723830975557132, + "learning_rate": 1.2704134407505284e-06, + "loss": 0.5226, + "step": 15030 + }, + { + "epoch": 3.990309305721492, + "grad_norm": 0.4740560231250832, + "learning_rate": 1.270109471030724e-06, + "loss": 0.5487, + "step": 15031 + }, + { + "epoch": 3.9905748041948756, + "grad_norm": 0.46974667786352736, + "learning_rate": 1.2698055252967973e-06, + "loss": 0.5332, + "step": 15032 + }, + { + "epoch": 3.9908403026682597, + "grad_norm": 0.47602224874885063, + "learning_rate": 1.2695016035546747e-06, + "loss": 0.5454, + "step": 15033 + }, + { + "epoch": 3.9911058011416434, + "grad_norm": 0.4649963303079685, + "learning_rate": 1.2691977058102828e-06, + "loss": 0.5499, + "step": 15034 + }, + { + "epoch": 3.991371299615027, + "grad_norm": 0.470827609554469, + "learning_rate": 1.2688938320695502e-06, + "loss": 0.5511, + "step": 15035 + }, + { + "epoch": 3.991636798088411, + "grad_norm": 0.4825681075893088, + "learning_rate": 1.268589982338401e-06, + "loss": 0.501, + "step": 15036 + }, + { + "epoch": 3.9919022965617947, + "grad_norm": 0.4691903735719901, + "learning_rate": 1.2682861566227634e-06, + "loss": 0.5257, + "step": 15037 + }, + { + "epoch": 3.9921677950351784, + "grad_norm": 0.4551943054152704, + "learning_rate": 1.2679823549285602e-06, + "loss": 0.5284, + "step": 15038 + }, + { + "epoch": 3.9924332935085625, + "grad_norm": 0.4912882594930225, + "learning_rate": 1.2676785772617195e-06, + "loss": 0.5639, + "step": 15039 + }, + { + "epoch": 3.992698791981946, + "grad_norm": 0.46242044827996776, + "learning_rate": 1.267374823628162e-06, + "loss": 0.5562, + "step": 15040 + }, + { + "epoch": 3.9929642904553297, + "grad_norm": 0.46519884382487803, + "learning_rate": 1.2670710940338143e-06, + "loss": 0.5559, + "step": 15041 + }, + { + "epoch": 3.993229788928714, + "grad_norm": 0.48517177385218785, + "learning_rate": 1.2667673884845983e-06, + "loss": 0.5084, + "step": 15042 + }, + { + "epoch": 3.9934952874020975, + "grad_norm": 0.4565881485044334, + "learning_rate": 1.2664637069864382e-06, + "loss": 0.5008, + "step": 15043 + }, + { + "epoch": 3.993760785875481, + "grad_norm": 0.46227386453857205, + "learning_rate": 1.2661600495452553e-06, + "loss": 0.5433, + "step": 15044 + }, + { + "epoch": 3.994026284348865, + "grad_norm": 0.47903784432868496, + "learning_rate": 1.2658564161669734e-06, + "loss": 0.5318, + "step": 15045 + }, + { + "epoch": 3.994291782822249, + "grad_norm": 0.4802638886121846, + "learning_rate": 1.2655528068575134e-06, + "loss": 0.585, + "step": 15046 + }, + { + "epoch": 3.9945572812956325, + "grad_norm": 0.48037249518518654, + "learning_rate": 1.2652492216227946e-06, + "loss": 0.5503, + "step": 15047 + }, + { + "epoch": 3.9948227797690166, + "grad_norm": 0.465752790651287, + "learning_rate": 1.2649456604687404e-06, + "loss": 0.5192, + "step": 15048 + }, + { + "epoch": 3.9950882782424, + "grad_norm": 0.47675762816693384, + "learning_rate": 1.264642123401269e-06, + "loss": 0.5089, + "step": 15049 + }, + { + "epoch": 3.995353776715784, + "grad_norm": 0.47698388151398013, + "learning_rate": 1.2643386104263015e-06, + "loss": 0.5611, + "step": 15050 + }, + { + "epoch": 3.995619275189168, + "grad_norm": 0.457349939900204, + "learning_rate": 1.2640351215497565e-06, + "loss": 0.5281, + "step": 15051 + }, + { + "epoch": 3.9958847736625516, + "grad_norm": 0.47369061902029114, + "learning_rate": 1.263731656777552e-06, + "loss": 0.5078, + "step": 15052 + }, + { + "epoch": 3.996150272135935, + "grad_norm": 0.46872906783928264, + "learning_rate": 1.263428216115608e-06, + "loss": 0.589, + "step": 15053 + }, + { + "epoch": 3.996415770609319, + "grad_norm": 0.4674385960335175, + "learning_rate": 1.2631247995698405e-06, + "loss": 0.5777, + "step": 15054 + }, + { + "epoch": 3.996681269082703, + "grad_norm": 0.4657049807108756, + "learning_rate": 1.2628214071461687e-06, + "loss": 0.5381, + "step": 15055 + }, + { + "epoch": 3.9969467675560866, + "grad_norm": 0.46820083907544646, + "learning_rate": 1.2625180388505077e-06, + "loss": 0.5603, + "step": 15056 + }, + { + "epoch": 3.99721226602947, + "grad_norm": 0.48812374655270796, + "learning_rate": 1.2622146946887768e-06, + "loss": 0.5025, + "step": 15057 + }, + { + "epoch": 3.997477764502854, + "grad_norm": 0.4760098176879443, + "learning_rate": 1.261911374666888e-06, + "loss": 0.508, + "step": 15058 + }, + { + "epoch": 3.997743262976238, + "grad_norm": 0.47084740892680127, + "learning_rate": 1.2616080787907597e-06, + "loss": 0.5239, + "step": 15059 + }, + { + "epoch": 3.9980087614496216, + "grad_norm": 0.4669291354264763, + "learning_rate": 1.2613048070663048e-06, + "loss": 0.5397, + "step": 15060 + }, + { + "epoch": 3.9982742599230052, + "grad_norm": 0.4676787332570303, + "learning_rate": 1.2610015594994402e-06, + "loss": 0.5899, + "step": 15061 + }, + { + "epoch": 3.9985397583963893, + "grad_norm": 0.4840774397232102, + "learning_rate": 1.260698336096079e-06, + "loss": 0.55, + "step": 15062 + }, + { + "epoch": 3.998805256869773, + "grad_norm": 0.4788547395594691, + "learning_rate": 1.2603951368621332e-06, + "loss": 0.5546, + "step": 15063 + }, + { + "epoch": 3.9990707553431566, + "grad_norm": 0.47469291867159735, + "learning_rate": 1.2600919618035185e-06, + "loss": 0.5581, + "step": 15064 + }, + { + "epoch": 3.9993362538165407, + "grad_norm": 0.4670355853769135, + "learning_rate": 1.2597888109261452e-06, + "loss": 0.4904, + "step": 15065 + }, + { + "epoch": 3.9996017522899243, + "grad_norm": 0.4522452935609553, + "learning_rate": 1.2594856842359276e-06, + "loss": 0.5373, + "step": 15066 + }, + { + "epoch": 3.999867250763308, + "grad_norm": 0.48647357772196476, + "learning_rate": 1.259182581738776e-06, + "loss": 0.5413, + "step": 15067 + }, + { + "epoch": 4.0, + "grad_norm": 0.7339422262195487, + "learning_rate": 1.2588795034406037e-06, + "loss": 0.524, + "step": 15068 + }, + { + "epoch": 4.000265498473384, + "grad_norm": 0.4820985888355415, + "learning_rate": 1.258576449347318e-06, + "loss": 0.544, + "step": 15069 + }, + { + "epoch": 4.000530996946767, + "grad_norm": 0.4891848268480109, + "learning_rate": 1.2582734194648308e-06, + "loss": 0.5128, + "step": 15070 + }, + { + "epoch": 4.000796495420151, + "grad_norm": 0.4645609964529606, + "learning_rate": 1.2579704137990534e-06, + "loss": 0.5411, + "step": 15071 + }, + { + "epoch": 4.0010619938935355, + "grad_norm": 0.46660115952006626, + "learning_rate": 1.257667432355893e-06, + "loss": 0.5325, + "step": 15072 + }, + { + "epoch": 4.001327492366919, + "grad_norm": 0.47860768609232207, + "learning_rate": 1.2573644751412601e-06, + "loss": 0.5332, + "step": 15073 + }, + { + "epoch": 4.001592990840303, + "grad_norm": 0.4596484348263277, + "learning_rate": 1.2570615421610627e-06, + "loss": 0.538, + "step": 15074 + }, + { + "epoch": 4.001858489313687, + "grad_norm": 0.4729811199921599, + "learning_rate": 1.2567586334212085e-06, + "loss": 0.5362, + "step": 15075 + }, + { + "epoch": 4.00212398778707, + "grad_norm": 0.45889927094779487, + "learning_rate": 1.2564557489276035e-06, + "loss": 0.5362, + "step": 15076 + }, + { + "epoch": 4.002389486260454, + "grad_norm": 0.47692005688045225, + "learning_rate": 1.2561528886861576e-06, + "loss": 0.5034, + "step": 15077 + }, + { + "epoch": 4.002654984733838, + "grad_norm": 0.4838636670985966, + "learning_rate": 1.2558500527027748e-06, + "loss": 0.5512, + "step": 15078 + }, + { + "epoch": 4.002920483207221, + "grad_norm": 0.469017416043095, + "learning_rate": 1.255547240983363e-06, + "loss": 0.5283, + "step": 15079 + }, + { + "epoch": 4.0031859816806055, + "grad_norm": 0.4784606122633153, + "learning_rate": 1.2552444535338271e-06, + "loss": 0.5405, + "step": 15080 + }, + { + "epoch": 4.0034514801539896, + "grad_norm": 0.4615472487565943, + "learning_rate": 1.2549416903600708e-06, + "loss": 0.562, + "step": 15081 + }, + { + "epoch": 4.003716978627373, + "grad_norm": 0.47166247930936844, + "learning_rate": 1.2546389514680013e-06, + "loss": 0.514, + "step": 15082 + }, + { + "epoch": 4.003982477100757, + "grad_norm": 0.46931310789938635, + "learning_rate": 1.2543362368635204e-06, + "loss": 0.5201, + "step": 15083 + }, + { + "epoch": 4.00424797557414, + "grad_norm": 0.4601523589558971, + "learning_rate": 1.2540335465525333e-06, + "loss": 0.56, + "step": 15084 + }, + { + "epoch": 4.004513474047524, + "grad_norm": 0.4647278613682582, + "learning_rate": 1.2537308805409431e-06, + "loss": 0.5088, + "step": 15085 + }, + { + "epoch": 4.004778972520908, + "grad_norm": 0.46579187163982266, + "learning_rate": 1.253428238834652e-06, + "loss": 0.5327, + "step": 15086 + }, + { + "epoch": 4.005044470994291, + "grad_norm": 0.4699192785317288, + "learning_rate": 1.2531256214395616e-06, + "loss": 0.5437, + "step": 15087 + }, + { + "epoch": 4.0053099694676755, + "grad_norm": 0.46590889105783856, + "learning_rate": 1.2528230283615753e-06, + "loss": 0.5602, + "step": 15088 + }, + { + "epoch": 4.00557546794106, + "grad_norm": 0.4694549150317479, + "learning_rate": 1.2525204596065926e-06, + "loss": 0.5317, + "step": 15089 + }, + { + "epoch": 4.005840966414443, + "grad_norm": 0.4659588151797014, + "learning_rate": 1.2522179151805154e-06, + "loss": 0.5257, + "step": 15090 + }, + { + "epoch": 4.006106464887827, + "grad_norm": 0.4769916576213578, + "learning_rate": 1.2519153950892454e-06, + "loss": 0.5546, + "step": 15091 + }, + { + "epoch": 4.006371963361211, + "grad_norm": 0.47532170357266684, + "learning_rate": 1.2516128993386795e-06, + "loss": 0.5268, + "step": 15092 + }, + { + "epoch": 4.006637461834594, + "grad_norm": 0.47856842474950184, + "learning_rate": 1.2513104279347197e-06, + "loss": 0.5472, + "step": 15093 + }, + { + "epoch": 4.006902960307978, + "grad_norm": 0.4747897205521734, + "learning_rate": 1.2510079808832626e-06, + "loss": 0.5702, + "step": 15094 + }, + { + "epoch": 4.007168458781362, + "grad_norm": 0.466727683258946, + "learning_rate": 1.250705558190209e-06, + "loss": 0.5452, + "step": 15095 + }, + { + "epoch": 4.0074339572547455, + "grad_norm": 0.4694928059112087, + "learning_rate": 1.2504031598614552e-06, + "loss": 0.522, + "step": 15096 + }, + { + "epoch": 4.00769945572813, + "grad_norm": 0.47282426766526553, + "learning_rate": 1.2501007859029e-06, + "loss": 0.5473, + "step": 15097 + }, + { + "epoch": 4.007964954201514, + "grad_norm": 0.4700057955716974, + "learning_rate": 1.2497984363204397e-06, + "loss": 0.5521, + "step": 15098 + }, + { + "epoch": 4.008230452674897, + "grad_norm": 0.4748939460167583, + "learning_rate": 1.24949611111997e-06, + "loss": 0.5592, + "step": 15099 + }, + { + "epoch": 4.008495951148281, + "grad_norm": 0.4658720456268867, + "learning_rate": 1.249193810307389e-06, + "loss": 0.5385, + "step": 15100 + }, + { + "epoch": 4.008761449621665, + "grad_norm": 0.46925426369022755, + "learning_rate": 1.2488915338885898e-06, + "loss": 0.5587, + "step": 15101 + }, + { + "epoch": 4.009026948095048, + "grad_norm": 0.4739052126587744, + "learning_rate": 1.2485892818694704e-06, + "loss": 0.5338, + "step": 15102 + }, + { + "epoch": 4.009292446568432, + "grad_norm": 0.46871425812520534, + "learning_rate": 1.2482870542559236e-06, + "loss": 0.505, + "step": 15103 + }, + { + "epoch": 4.009557945041816, + "grad_norm": 0.48031954924964076, + "learning_rate": 1.2479848510538441e-06, + "loss": 0.5658, + "step": 15104 + }, + { + "epoch": 4.0098234435152, + "grad_norm": 0.4663280610617128, + "learning_rate": 1.2476826722691246e-06, + "loss": 0.5169, + "step": 15105 + }, + { + "epoch": 4.010088941988584, + "grad_norm": 0.48017454406500715, + "learning_rate": 1.2473805179076602e-06, + "loss": 0.5135, + "step": 15106 + }, + { + "epoch": 4.010354440461968, + "grad_norm": 0.4582409422880802, + "learning_rate": 1.2470783879753414e-06, + "loss": 0.4932, + "step": 15107 + }, + { + "epoch": 4.010619938935351, + "grad_norm": 0.4767150972651666, + "learning_rate": 1.246776282478063e-06, + "loss": 0.5328, + "step": 15108 + }, + { + "epoch": 4.010885437408735, + "grad_norm": 0.4710208021070399, + "learning_rate": 1.2464742014217154e-06, + "loss": 0.5288, + "step": 15109 + }, + { + "epoch": 4.011150935882118, + "grad_norm": 0.4479582410856575, + "learning_rate": 1.2461721448121892e-06, + "loss": 0.497, + "step": 15110 + }, + { + "epoch": 4.011416434355502, + "grad_norm": 0.4666377704642966, + "learning_rate": 1.245870112655377e-06, + "loss": 0.5223, + "step": 15111 + }, + { + "epoch": 4.011681932828886, + "grad_norm": 0.4743029599968229, + "learning_rate": 1.2455681049571668e-06, + "loss": 0.5302, + "step": 15112 + }, + { + "epoch": 4.01194743130227, + "grad_norm": 0.46113639576866716, + "learning_rate": 1.2452661217234513e-06, + "loss": 0.5272, + "step": 15113 + }, + { + "epoch": 4.012212929775654, + "grad_norm": 0.45353049843610055, + "learning_rate": 1.2449641629601186e-06, + "loss": 0.5247, + "step": 15114 + }, + { + "epoch": 4.012478428249038, + "grad_norm": 0.482724336445895, + "learning_rate": 1.2446622286730565e-06, + "loss": 0.5622, + "step": 15115 + }, + { + "epoch": 4.012743926722421, + "grad_norm": 0.4593623862960477, + "learning_rate": 1.2443603188681556e-06, + "loss": 0.5417, + "step": 15116 + }, + { + "epoch": 4.013009425195805, + "grad_norm": 0.48565089234772596, + "learning_rate": 1.2440584335513017e-06, + "loss": 0.5334, + "step": 15117 + }, + { + "epoch": 4.013274923669189, + "grad_norm": 0.4694511639189563, + "learning_rate": 1.2437565727283847e-06, + "loss": 0.5215, + "step": 15118 + }, + { + "epoch": 4.013540422142572, + "grad_norm": 0.4795164205350878, + "learning_rate": 1.243454736405289e-06, + "loss": 0.525, + "step": 15119 + }, + { + "epoch": 4.013805920615956, + "grad_norm": 0.46760623786188343, + "learning_rate": 1.2431529245879045e-06, + "loss": 0.5537, + "step": 15120 + }, + { + "epoch": 4.0140714190893405, + "grad_norm": 0.4650283214255255, + "learning_rate": 1.2428511372821133e-06, + "loss": 0.5345, + "step": 15121 + }, + { + "epoch": 4.014336917562724, + "grad_norm": 0.4582910710954033, + "learning_rate": 1.242549374493804e-06, + "loss": 0.5226, + "step": 15122 + }, + { + "epoch": 4.014602416036108, + "grad_norm": 0.45872486497099324, + "learning_rate": 1.24224763622886e-06, + "loss": 0.5354, + "step": 15123 + }, + { + "epoch": 4.014867914509492, + "grad_norm": 0.47317562291897564, + "learning_rate": 1.241945922493167e-06, + "loss": 0.5488, + "step": 15124 + }, + { + "epoch": 4.015133412982875, + "grad_norm": 0.4632405809174931, + "learning_rate": 1.2416442332926081e-06, + "loss": 0.5217, + "step": 15125 + }, + { + "epoch": 4.015398911456259, + "grad_norm": 0.4828355395608374, + "learning_rate": 1.2413425686330687e-06, + "loss": 0.5224, + "step": 15126 + }, + { + "epoch": 4.015664409929643, + "grad_norm": 0.46521468550106915, + "learning_rate": 1.2410409285204309e-06, + "loss": 0.5824, + "step": 15127 + }, + { + "epoch": 4.015929908403026, + "grad_norm": 0.4609770207623812, + "learning_rate": 1.2407393129605764e-06, + "loss": 0.5386, + "step": 15128 + }, + { + "epoch": 4.0161954068764105, + "grad_norm": 0.4727625267966508, + "learning_rate": 1.2404377219593895e-06, + "loss": 0.5392, + "step": 15129 + }, + { + "epoch": 4.016460905349795, + "grad_norm": 0.4572189493513875, + "learning_rate": 1.2401361555227497e-06, + "loss": 0.5321, + "step": 15130 + }, + { + "epoch": 4.016726403823178, + "grad_norm": 0.4690048434692796, + "learning_rate": 1.239834613656542e-06, + "loss": 0.5074, + "step": 15131 + }, + { + "epoch": 4.016991902296562, + "grad_norm": 0.4808546755740111, + "learning_rate": 1.2395330963666424e-06, + "loss": 0.5854, + "step": 15132 + }, + { + "epoch": 4.017257400769946, + "grad_norm": 0.46795790230096174, + "learning_rate": 1.2392316036589348e-06, + "loss": 0.4958, + "step": 15133 + }, + { + "epoch": 4.017522899243329, + "grad_norm": 0.4572411191246579, + "learning_rate": 1.2389301355392968e-06, + "loss": 0.5234, + "step": 15134 + }, + { + "epoch": 4.017788397716713, + "grad_norm": 0.4737647827584667, + "learning_rate": 1.2386286920136086e-06, + "loss": 0.5432, + "step": 15135 + }, + { + "epoch": 4.018053896190097, + "grad_norm": 0.47081635468680044, + "learning_rate": 1.2383272730877505e-06, + "loss": 0.5435, + "step": 15136 + }, + { + "epoch": 4.0183193946634805, + "grad_norm": 0.47403332627906536, + "learning_rate": 1.2380258787675994e-06, + "loss": 0.5403, + "step": 15137 + }, + { + "epoch": 4.018584893136865, + "grad_norm": 0.4787354795810244, + "learning_rate": 1.237724509059034e-06, + "loss": 0.5667, + "step": 15138 + }, + { + "epoch": 4.018850391610248, + "grad_norm": 0.4631558901475824, + "learning_rate": 1.2374231639679298e-06, + "loss": 0.5407, + "step": 15139 + }, + { + "epoch": 4.019115890083632, + "grad_norm": 0.4698161161490426, + "learning_rate": 1.2371218435001663e-06, + "loss": 0.5459, + "step": 15140 + }, + { + "epoch": 4.019381388557016, + "grad_norm": 0.4717195378694964, + "learning_rate": 1.2368205476616183e-06, + "loss": 0.5237, + "step": 15141 + }, + { + "epoch": 4.019646887030399, + "grad_norm": 0.4537909073316982, + "learning_rate": 1.2365192764581629e-06, + "loss": 0.4669, + "step": 15142 + }, + { + "epoch": 4.019912385503783, + "grad_norm": 0.4721326414633325, + "learning_rate": 1.2362180298956753e-06, + "loss": 0.5559, + "step": 15143 + }, + { + "epoch": 4.020177883977167, + "grad_norm": 0.46193782272043576, + "learning_rate": 1.2359168079800296e-06, + "loss": 0.5116, + "step": 15144 + }, + { + "epoch": 4.0204433824505506, + "grad_norm": 0.48091718268308714, + "learning_rate": 1.2356156107171019e-06, + "loss": 0.5592, + "step": 15145 + }, + { + "epoch": 4.020708880923935, + "grad_norm": 0.470809424251723, + "learning_rate": 1.2353144381127646e-06, + "loss": 0.5357, + "step": 15146 + }, + { + "epoch": 4.020974379397319, + "grad_norm": 0.4566358104233879, + "learning_rate": 1.2350132901728934e-06, + "loss": 0.4914, + "step": 15147 + }, + { + "epoch": 4.021239877870702, + "grad_norm": 0.47403974634995716, + "learning_rate": 1.234712166903359e-06, + "loss": 0.4885, + "step": 15148 + }, + { + "epoch": 4.021505376344086, + "grad_norm": 0.4812993924959656, + "learning_rate": 1.2344110683100375e-06, + "loss": 0.5493, + "step": 15149 + }, + { + "epoch": 4.02177087481747, + "grad_norm": 0.4767209232024363, + "learning_rate": 1.2341099943987969e-06, + "loss": 0.5397, + "step": 15150 + }, + { + "epoch": 4.022036373290853, + "grad_norm": 0.4911823060173162, + "learning_rate": 1.2338089451755117e-06, + "loss": 0.5044, + "step": 15151 + }, + { + "epoch": 4.022301871764237, + "grad_norm": 0.49424609422786137, + "learning_rate": 1.2335079206460512e-06, + "loss": 0.5445, + "step": 15152 + }, + { + "epoch": 4.0225673702376215, + "grad_norm": 0.4806444626602993, + "learning_rate": 1.2332069208162884e-06, + "loss": 0.5049, + "step": 15153 + }, + { + "epoch": 4.022832868711005, + "grad_norm": 0.47078534670571326, + "learning_rate": 1.232905945692092e-06, + "loss": 0.501, + "step": 15154 + }, + { + "epoch": 4.023098367184389, + "grad_norm": 0.4784174344647439, + "learning_rate": 1.2326049952793312e-06, + "loss": 0.5632, + "step": 15155 + }, + { + "epoch": 4.023363865657773, + "grad_norm": 0.4615727309926617, + "learning_rate": 1.232304069583877e-06, + "loss": 0.5103, + "step": 15156 + }, + { + "epoch": 4.023629364131156, + "grad_norm": 0.4862371864053715, + "learning_rate": 1.2320031686115966e-06, + "loss": 0.56, + "step": 15157 + }, + { + "epoch": 4.02389486260454, + "grad_norm": 0.47448233343347956, + "learning_rate": 1.2317022923683596e-06, + "loss": 0.5233, + "step": 15158 + }, + { + "epoch": 4.024160361077924, + "grad_norm": 0.4706810586303399, + "learning_rate": 1.2314014408600325e-06, + "loss": 0.5419, + "step": 15159 + }, + { + "epoch": 4.024425859551307, + "grad_norm": 0.4662326618062684, + "learning_rate": 1.2311006140924842e-06, + "loss": 0.5183, + "step": 15160 + }, + { + "epoch": 4.0246913580246915, + "grad_norm": 0.4875483424826858, + "learning_rate": 1.2307998120715807e-06, + "loss": 0.5652, + "step": 15161 + }, + { + "epoch": 4.0249568564980756, + "grad_norm": 0.46283564302730323, + "learning_rate": 1.2304990348031876e-06, + "loss": 0.5147, + "step": 15162 + }, + { + "epoch": 4.025222354971459, + "grad_norm": 0.47172967620604617, + "learning_rate": 1.2301982822931723e-06, + "loss": 0.5333, + "step": 15163 + }, + { + "epoch": 4.025487853444843, + "grad_norm": 0.4653283732356548, + "learning_rate": 1.229897554547399e-06, + "loss": 0.5439, + "step": 15164 + }, + { + "epoch": 4.025753351918226, + "grad_norm": 0.47772153127868505, + "learning_rate": 1.2295968515717339e-06, + "loss": 0.5441, + "step": 15165 + }, + { + "epoch": 4.02601885039161, + "grad_norm": 0.4552695518886499, + "learning_rate": 1.2292961733720404e-06, + "loss": 0.5113, + "step": 15166 + }, + { + "epoch": 4.026284348864994, + "grad_norm": 0.487452889726155, + "learning_rate": 1.228995519954183e-06, + "loss": 0.5379, + "step": 15167 + }, + { + "epoch": 4.026549847338377, + "grad_norm": 0.4852489657333027, + "learning_rate": 1.228694891324024e-06, + "loss": 0.5366, + "step": 15168 + }, + { + "epoch": 4.0268153458117615, + "grad_norm": 0.4725465835954684, + "learning_rate": 1.228394287487428e-06, + "loss": 0.5572, + "step": 15169 + }, + { + "epoch": 4.027080844285146, + "grad_norm": 0.4799833013088205, + "learning_rate": 1.2280937084502562e-06, + "loss": 0.5192, + "step": 15170 + }, + { + "epoch": 4.027346342758529, + "grad_norm": 0.46122959950651277, + "learning_rate": 1.227793154218372e-06, + "loss": 0.5245, + "step": 15171 + }, + { + "epoch": 4.027611841231913, + "grad_norm": 0.4681215278308655, + "learning_rate": 1.2274926247976363e-06, + "loss": 0.5274, + "step": 15172 + }, + { + "epoch": 4.027877339705297, + "grad_norm": 0.4803107173108429, + "learning_rate": 1.2271921201939088e-06, + "loss": 0.4897, + "step": 15173 + }, + { + "epoch": 4.02814283817868, + "grad_norm": 0.48259089077755596, + "learning_rate": 1.2268916404130527e-06, + "loss": 0.5689, + "step": 15174 + }, + { + "epoch": 4.028408336652064, + "grad_norm": 0.47767060409598194, + "learning_rate": 1.2265911854609256e-06, + "loss": 0.5282, + "step": 15175 + }, + { + "epoch": 4.028673835125448, + "grad_norm": 0.463270902948231, + "learning_rate": 1.226290755343389e-06, + "loss": 0.5192, + "step": 15176 + }, + { + "epoch": 4.0289393335988315, + "grad_norm": 0.47500790849470903, + "learning_rate": 1.2259903500663004e-06, + "loss": 0.5794, + "step": 15177 + }, + { + "epoch": 4.029204832072216, + "grad_norm": 0.473154625041754, + "learning_rate": 1.2256899696355213e-06, + "loss": 0.571, + "step": 15178 + }, + { + "epoch": 4.0294703305456, + "grad_norm": 0.47431211324842987, + "learning_rate": 1.225389614056906e-06, + "loss": 0.5478, + "step": 15179 + }, + { + "epoch": 4.029735829018983, + "grad_norm": 0.47081352177226965, + "learning_rate": 1.2250892833363148e-06, + "loss": 0.5453, + "step": 15180 + }, + { + "epoch": 4.030001327492367, + "grad_norm": 0.4856017007307107, + "learning_rate": 1.2247889774796032e-06, + "loss": 0.5483, + "step": 15181 + }, + { + "epoch": 4.030266825965751, + "grad_norm": 0.45904397838686417, + "learning_rate": 1.2244886964926286e-06, + "loss": 0.533, + "step": 15182 + }, + { + "epoch": 4.030532324439134, + "grad_norm": 0.48175309887582607, + "learning_rate": 1.2241884403812496e-06, + "loss": 0.5575, + "step": 15183 + }, + { + "epoch": 4.030797822912518, + "grad_norm": 0.46019218586563576, + "learning_rate": 1.2238882091513177e-06, + "loss": 0.4939, + "step": 15184 + }, + { + "epoch": 4.031063321385902, + "grad_norm": 0.4772008950455075, + "learning_rate": 1.223588002808691e-06, + "loss": 0.5359, + "step": 15185 + }, + { + "epoch": 4.031328819859286, + "grad_norm": 0.46075610778062087, + "learning_rate": 1.2232878213592228e-06, + "loss": 0.5074, + "step": 15186 + }, + { + "epoch": 4.03159431833267, + "grad_norm": 0.44992203894422067, + "learning_rate": 1.2229876648087688e-06, + "loss": 0.5137, + "step": 15187 + }, + { + "epoch": 4.031859816806054, + "grad_norm": 0.47564738011028035, + "learning_rate": 1.222687533163181e-06, + "loss": 0.5565, + "step": 15188 + }, + { + "epoch": 4.032125315279437, + "grad_norm": 0.4712632496105434, + "learning_rate": 1.2223874264283148e-06, + "loss": 0.5017, + "step": 15189 + }, + { + "epoch": 4.032390813752821, + "grad_norm": 0.4786331251356405, + "learning_rate": 1.2220873446100219e-06, + "loss": 0.5111, + "step": 15190 + }, + { + "epoch": 4.032656312226205, + "grad_norm": 0.44860954709131745, + "learning_rate": 1.2217872877141534e-06, + "loss": 0.498, + "step": 15191 + }, + { + "epoch": 4.032921810699588, + "grad_norm": 0.46175062564963604, + "learning_rate": 1.2214872557465637e-06, + "loss": 0.5144, + "step": 15192 + }, + { + "epoch": 4.033187309172972, + "grad_norm": 0.47049623638302096, + "learning_rate": 1.2211872487131018e-06, + "loss": 0.5324, + "step": 15193 + }, + { + "epoch": 4.033452807646356, + "grad_norm": 0.4812756516586452, + "learning_rate": 1.2208872666196205e-06, + "loss": 0.5196, + "step": 15194 + }, + { + "epoch": 4.03371830611974, + "grad_norm": 0.4635024827044325, + "learning_rate": 1.2205873094719695e-06, + "loss": 0.5659, + "step": 15195 + }, + { + "epoch": 4.033983804593124, + "grad_norm": 0.47225619901555776, + "learning_rate": 1.2202873772759983e-06, + "loss": 0.5476, + "step": 15196 + }, + { + "epoch": 4.034249303066507, + "grad_norm": 0.4682788135016163, + "learning_rate": 1.2199874700375558e-06, + "loss": 0.5342, + "step": 15197 + }, + { + "epoch": 4.034514801539891, + "grad_norm": 0.47748398198053876, + "learning_rate": 1.2196875877624922e-06, + "loss": 0.5418, + "step": 15198 + }, + { + "epoch": 4.034780300013275, + "grad_norm": 0.4800900381401272, + "learning_rate": 1.2193877304566548e-06, + "loss": 0.5446, + "step": 15199 + }, + { + "epoch": 4.035045798486658, + "grad_norm": 0.46530528291875894, + "learning_rate": 1.219087898125893e-06, + "loss": 0.533, + "step": 15200 + }, + { + "epoch": 4.035311296960042, + "grad_norm": 0.4676566056172326, + "learning_rate": 1.2187880907760532e-06, + "loss": 0.5186, + "step": 15201 + }, + { + "epoch": 4.0355767954334265, + "grad_norm": 0.46970752411790573, + "learning_rate": 1.218488308412982e-06, + "loss": 0.5582, + "step": 15202 + }, + { + "epoch": 4.03584229390681, + "grad_norm": 0.4831752456650113, + "learning_rate": 1.218188551042527e-06, + "loss": 0.549, + "step": 15203 + }, + { + "epoch": 4.036107792380194, + "grad_norm": 0.46515161664200344, + "learning_rate": 1.2178888186705328e-06, + "loss": 0.5062, + "step": 15204 + }, + { + "epoch": 4.036373290853578, + "grad_norm": 0.4737898749217837, + "learning_rate": 1.217589111302847e-06, + "loss": 0.5219, + "step": 15205 + }, + { + "epoch": 4.036638789326961, + "grad_norm": 0.4707188988826055, + "learning_rate": 1.2172894289453133e-06, + "loss": 0.5617, + "step": 15206 + }, + { + "epoch": 4.036904287800345, + "grad_norm": 0.4717745918345947, + "learning_rate": 1.2169897716037755e-06, + "loss": 0.5021, + "step": 15207 + }, + { + "epoch": 4.037169786273729, + "grad_norm": 0.47983557361231594, + "learning_rate": 1.216690139284079e-06, + "loss": 0.483, + "step": 15208 + }, + { + "epoch": 4.037435284747112, + "grad_norm": 0.46951526672174815, + "learning_rate": 1.2163905319920666e-06, + "loss": 0.5508, + "step": 15209 + }, + { + "epoch": 4.0377007832204965, + "grad_norm": 0.47225471633764826, + "learning_rate": 1.2160909497335822e-06, + "loss": 0.5614, + "step": 15210 + }, + { + "epoch": 4.037966281693881, + "grad_norm": 0.4677476702573962, + "learning_rate": 1.2157913925144668e-06, + "loss": 0.5165, + "step": 15211 + }, + { + "epoch": 4.038231780167264, + "grad_norm": 0.47124823412181144, + "learning_rate": 1.2154918603405654e-06, + "loss": 0.5284, + "step": 15212 + }, + { + "epoch": 4.038497278640648, + "grad_norm": 0.4683950895334416, + "learning_rate": 1.2151923532177162e-06, + "loss": 0.5627, + "step": 15213 + }, + { + "epoch": 4.038762777114032, + "grad_norm": 0.4810815170116101, + "learning_rate": 1.2148928711517628e-06, + "loss": 0.5454, + "step": 15214 + }, + { + "epoch": 4.039028275587415, + "grad_norm": 0.46453213325088966, + "learning_rate": 1.2145934141485436e-06, + "loss": 0.5199, + "step": 15215 + }, + { + "epoch": 4.039293774060799, + "grad_norm": 0.46572784396581357, + "learning_rate": 1.214293982213901e-06, + "loss": 0.5085, + "step": 15216 + }, + { + "epoch": 4.039559272534183, + "grad_norm": 0.45327669140427373, + "learning_rate": 1.213994575353673e-06, + "loss": 0.5039, + "step": 15217 + }, + { + "epoch": 4.0398247710075665, + "grad_norm": 0.46761248484343776, + "learning_rate": 1.2136951935737002e-06, + "loss": 0.5478, + "step": 15218 + }, + { + "epoch": 4.040090269480951, + "grad_norm": 0.4948793322285906, + "learning_rate": 1.2133958368798208e-06, + "loss": 0.5321, + "step": 15219 + }, + { + "epoch": 4.040355767954335, + "grad_norm": 0.47296897417976336, + "learning_rate": 1.2130965052778715e-06, + "loss": 0.5476, + "step": 15220 + }, + { + "epoch": 4.040621266427718, + "grad_norm": 0.4773097338499646, + "learning_rate": 1.2127971987736922e-06, + "loss": 0.5189, + "step": 15221 + }, + { + "epoch": 4.040886764901102, + "grad_norm": 0.4825265851453496, + "learning_rate": 1.2124979173731186e-06, + "loss": 0.5337, + "step": 15222 + }, + { + "epoch": 4.041152263374485, + "grad_norm": 0.4783762641194874, + "learning_rate": 1.2121986610819885e-06, + "loss": 0.5392, + "step": 15223 + }, + { + "epoch": 4.041417761847869, + "grad_norm": 0.48612510042936663, + "learning_rate": 1.2118994299061376e-06, + "loss": 0.5446, + "step": 15224 + }, + { + "epoch": 4.041683260321253, + "grad_norm": 0.468994732260362, + "learning_rate": 1.2116002238514018e-06, + "loss": 0.5396, + "step": 15225 + }, + { + "epoch": 4.0419487587946366, + "grad_norm": 0.48934230686631847, + "learning_rate": 1.2113010429236152e-06, + "loss": 0.5394, + "step": 15226 + }, + { + "epoch": 4.042214257268021, + "grad_norm": 0.4882510660968333, + "learning_rate": 1.2110018871286136e-06, + "loss": 0.539, + "step": 15227 + }, + { + "epoch": 4.042479755741405, + "grad_norm": 0.47741061682579605, + "learning_rate": 1.210702756472232e-06, + "loss": 0.5056, + "step": 15228 + }, + { + "epoch": 4.042745254214788, + "grad_norm": 0.45013491291370716, + "learning_rate": 1.2104036509603035e-06, + "loss": 0.5027, + "step": 15229 + }, + { + "epoch": 4.043010752688172, + "grad_norm": 0.47532277911891085, + "learning_rate": 1.210104570598661e-06, + "loss": 0.4995, + "step": 15230 + }, + { + "epoch": 4.043276251161556, + "grad_norm": 0.49375967662526105, + "learning_rate": 1.209805515393137e-06, + "loss": 0.4771, + "step": 15231 + }, + { + "epoch": 4.043541749634939, + "grad_norm": 0.4692949273710238, + "learning_rate": 1.2095064853495654e-06, + "loss": 0.4713, + "step": 15232 + }, + { + "epoch": 4.043807248108323, + "grad_norm": 0.4715172167833221, + "learning_rate": 1.2092074804737758e-06, + "loss": 0.5732, + "step": 15233 + }, + { + "epoch": 4.0440727465817075, + "grad_norm": 0.46584287593035656, + "learning_rate": 1.2089085007716018e-06, + "loss": 0.5369, + "step": 15234 + }, + { + "epoch": 4.044338245055091, + "grad_norm": 0.47600923575858844, + "learning_rate": 1.2086095462488732e-06, + "loss": 0.5409, + "step": 15235 + }, + { + "epoch": 4.044603743528475, + "grad_norm": 0.4597264401641963, + "learning_rate": 1.2083106169114192e-06, + "loss": 0.537, + "step": 15236 + }, + { + "epoch": 4.044869242001859, + "grad_norm": 0.46813744482225095, + "learning_rate": 1.2080117127650718e-06, + "loss": 0.5358, + "step": 15237 + }, + { + "epoch": 4.045134740475242, + "grad_norm": 0.4608256052459822, + "learning_rate": 1.2077128338156582e-06, + "loss": 0.5394, + "step": 15238 + }, + { + "epoch": 4.045400238948626, + "grad_norm": 0.4626668247201617, + "learning_rate": 1.2074139800690096e-06, + "loss": 0.5302, + "step": 15239 + }, + { + "epoch": 4.04566573742201, + "grad_norm": 0.46870202786166637, + "learning_rate": 1.2071151515309523e-06, + "loss": 0.534, + "step": 15240 + }, + { + "epoch": 4.045931235895393, + "grad_norm": 0.4541865240216533, + "learning_rate": 1.2068163482073169e-06, + "loss": 0.5177, + "step": 15241 + }, + { + "epoch": 4.0461967343687775, + "grad_norm": 0.49038073174270946, + "learning_rate": 1.206517570103927e-06, + "loss": 0.5575, + "step": 15242 + }, + { + "epoch": 4.0464622328421616, + "grad_norm": 0.4684684004724672, + "learning_rate": 1.2062188172266124e-06, + "loss": 0.5337, + "step": 15243 + }, + { + "epoch": 4.046727731315545, + "grad_norm": 0.4667188547874859, + "learning_rate": 1.205920089581198e-06, + "loss": 0.5475, + "step": 15244 + }, + { + "epoch": 4.046993229788929, + "grad_norm": 0.46382710253369824, + "learning_rate": 1.2056213871735106e-06, + "loss": 0.527, + "step": 15245 + }, + { + "epoch": 4.047258728262313, + "grad_norm": 0.48021942292750724, + "learning_rate": 1.2053227100093749e-06, + "loss": 0.5203, + "step": 15246 + }, + { + "epoch": 4.047524226735696, + "grad_norm": 0.48472603190774366, + "learning_rate": 1.205024058094617e-06, + "loss": 0.5366, + "step": 15247 + }, + { + "epoch": 4.04778972520908, + "grad_norm": 0.46430059336920937, + "learning_rate": 1.2047254314350606e-06, + "loss": 0.5308, + "step": 15248 + }, + { + "epoch": 4.048055223682463, + "grad_norm": 0.46567218920106784, + "learning_rate": 1.204426830036529e-06, + "loss": 0.5113, + "step": 15249 + }, + { + "epoch": 4.0483207221558475, + "grad_norm": 0.4807050830318009, + "learning_rate": 1.204128253904847e-06, + "loss": 0.5574, + "step": 15250 + }, + { + "epoch": 4.048586220629232, + "grad_norm": 0.4743521584489147, + "learning_rate": 1.203829703045836e-06, + "loss": 0.5378, + "step": 15251 + }, + { + "epoch": 4.048851719102615, + "grad_norm": 0.4604919292489176, + "learning_rate": 1.2035311774653202e-06, + "loss": 0.5314, + "step": 15252 + }, + { + "epoch": 4.049117217575999, + "grad_norm": 0.47683426397981876, + "learning_rate": 1.2032326771691212e-06, + "loss": 0.5327, + "step": 15253 + }, + { + "epoch": 4.049382716049383, + "grad_norm": 0.4732988959960685, + "learning_rate": 1.2029342021630586e-06, + "loss": 0.5446, + "step": 15254 + }, + { + "epoch": 4.049648214522766, + "grad_norm": 0.45847991352379697, + "learning_rate": 1.2026357524529558e-06, + "loss": 0.536, + "step": 15255 + }, + { + "epoch": 4.04991371299615, + "grad_norm": 0.49095872625856923, + "learning_rate": 1.2023373280446316e-06, + "loss": 0.56, + "step": 15256 + }, + { + "epoch": 4.050179211469534, + "grad_norm": 0.48524195458709124, + "learning_rate": 1.2020389289439075e-06, + "loss": 0.5649, + "step": 15257 + }, + { + "epoch": 4.0504447099429175, + "grad_norm": 0.4820878099628291, + "learning_rate": 1.2017405551566021e-06, + "loss": 0.5712, + "step": 15258 + }, + { + "epoch": 4.050710208416302, + "grad_norm": 0.46590308238229877, + "learning_rate": 1.2014422066885351e-06, + "loss": 0.5015, + "step": 15259 + }, + { + "epoch": 4.050975706889686, + "grad_norm": 0.4759747425476968, + "learning_rate": 1.201143883545523e-06, + "loss": 0.5319, + "step": 15260 + }, + { + "epoch": 4.051241205363069, + "grad_norm": 0.4613691980873959, + "learning_rate": 1.2008455857333864e-06, + "loss": 0.5009, + "step": 15261 + }, + { + "epoch": 4.051506703836453, + "grad_norm": 0.4858590990687217, + "learning_rate": 1.200547313257941e-06, + "loss": 0.5648, + "step": 15262 + }, + { + "epoch": 4.051772202309837, + "grad_norm": 0.4626307351952384, + "learning_rate": 1.2002490661250054e-06, + "loss": 0.5422, + "step": 15263 + }, + { + "epoch": 4.05203770078322, + "grad_norm": 0.4822199844852934, + "learning_rate": 1.1999508443403954e-06, + "loss": 0.5568, + "step": 15264 + }, + { + "epoch": 4.052303199256604, + "grad_norm": 0.4775558199724062, + "learning_rate": 1.199652647909926e-06, + "loss": 0.5282, + "step": 15265 + }, + { + "epoch": 4.052568697729988, + "grad_norm": 0.4724474403137747, + "learning_rate": 1.1993544768394147e-06, + "loss": 0.5232, + "step": 15266 + }, + { + "epoch": 4.052834196203372, + "grad_norm": 0.47124186596810175, + "learning_rate": 1.1990563311346748e-06, + "loss": 0.5595, + "step": 15267 + }, + { + "epoch": 4.053099694676756, + "grad_norm": 0.4699839949251924, + "learning_rate": 1.1987582108015228e-06, + "loss": 0.5351, + "step": 15268 + }, + { + "epoch": 4.05336519315014, + "grad_norm": 0.4753557568539712, + "learning_rate": 1.1984601158457703e-06, + "loss": 0.5396, + "step": 15269 + }, + { + "epoch": 4.053630691623523, + "grad_norm": 0.4763047255567218, + "learning_rate": 1.1981620462732344e-06, + "loss": 0.5414, + "step": 15270 + }, + { + "epoch": 4.053896190096907, + "grad_norm": 0.4767971089431187, + "learning_rate": 1.1978640020897242e-06, + "loss": 0.5444, + "step": 15271 + }, + { + "epoch": 4.054161688570291, + "grad_norm": 0.4681479133829081, + "learning_rate": 1.1975659833010539e-06, + "loss": 0.5483, + "step": 15272 + }, + { + "epoch": 4.054427187043674, + "grad_norm": 0.47090736788337534, + "learning_rate": 1.1972679899130371e-06, + "loss": 0.5622, + "step": 15273 + }, + { + "epoch": 4.054692685517058, + "grad_norm": 0.46396486852501095, + "learning_rate": 1.1969700219314831e-06, + "loss": 0.5122, + "step": 15274 + }, + { + "epoch": 4.054958183990442, + "grad_norm": 0.4745448359019339, + "learning_rate": 1.1966720793622048e-06, + "loss": 0.5469, + "step": 15275 + }, + { + "epoch": 4.055223682463826, + "grad_norm": 0.4626784809823271, + "learning_rate": 1.1963741622110122e-06, + "loss": 0.5214, + "step": 15276 + }, + { + "epoch": 4.05548918093721, + "grad_norm": 0.46653088299621404, + "learning_rate": 1.196076270483715e-06, + "loss": 0.5549, + "step": 15277 + }, + { + "epoch": 4.055754679410593, + "grad_norm": 0.4707670261389961, + "learning_rate": 1.1957784041861226e-06, + "loss": 0.5596, + "step": 15278 + }, + { + "epoch": 4.056020177883977, + "grad_norm": 0.48157785941842957, + "learning_rate": 1.195480563324045e-06, + "loss": 0.5513, + "step": 15279 + }, + { + "epoch": 4.056285676357361, + "grad_norm": 0.46439429571247676, + "learning_rate": 1.1951827479032901e-06, + "loss": 0.5364, + "step": 15280 + }, + { + "epoch": 4.056551174830744, + "grad_norm": 0.4735868234163533, + "learning_rate": 1.194884957929667e-06, + "loss": 0.5159, + "step": 15281 + }, + { + "epoch": 4.056816673304128, + "grad_norm": 0.48221284524780833, + "learning_rate": 1.1945871934089826e-06, + "loss": 0.566, + "step": 15282 + }, + { + "epoch": 4.0570821717775125, + "grad_norm": 0.4886720479988301, + "learning_rate": 1.1942894543470434e-06, + "loss": 0.505, + "step": 15283 + }, + { + "epoch": 4.057347670250896, + "grad_norm": 0.46084531172381, + "learning_rate": 1.1939917407496576e-06, + "loss": 0.5152, + "step": 15284 + }, + { + "epoch": 4.05761316872428, + "grad_norm": 0.48300248616089253, + "learning_rate": 1.1936940526226295e-06, + "loss": 0.5528, + "step": 15285 + }, + { + "epoch": 4.057878667197664, + "grad_norm": 0.47918006766306404, + "learning_rate": 1.1933963899717668e-06, + "loss": 0.562, + "step": 15286 + }, + { + "epoch": 4.058144165671047, + "grad_norm": 0.46790617448347743, + "learning_rate": 1.1930987528028737e-06, + "loss": 0.5491, + "step": 15287 + }, + { + "epoch": 4.058409664144431, + "grad_norm": 0.4756146151115185, + "learning_rate": 1.1928011411217545e-06, + "loss": 0.5009, + "step": 15288 + }, + { + "epoch": 4.058675162617815, + "grad_norm": 0.4689201213308079, + "learning_rate": 1.192503554934213e-06, + "loss": 0.5393, + "step": 15289 + }, + { + "epoch": 4.058940661091198, + "grad_norm": 0.4767789691691604, + "learning_rate": 1.1922059942460539e-06, + "loss": 0.5716, + "step": 15290 + }, + { + "epoch": 4.0592061595645825, + "grad_norm": 0.4782801756543797, + "learning_rate": 1.1919084590630794e-06, + "loss": 0.5178, + "step": 15291 + }, + { + "epoch": 4.059471658037967, + "grad_norm": 0.46368782840948475, + "learning_rate": 1.1916109493910937e-06, + "loss": 0.482, + "step": 15292 + }, + { + "epoch": 4.05973715651135, + "grad_norm": 0.4631731760114486, + "learning_rate": 1.191313465235898e-06, + "loss": 0.5461, + "step": 15293 + }, + { + "epoch": 4.060002654984734, + "grad_norm": 0.4811916201362852, + "learning_rate": 1.1910160066032928e-06, + "loss": 0.537, + "step": 15294 + }, + { + "epoch": 4.060268153458118, + "grad_norm": 0.4648490441782714, + "learning_rate": 1.1907185734990814e-06, + "loss": 0.5052, + "step": 15295 + }, + { + "epoch": 4.060533651931501, + "grad_norm": 0.4848048504538911, + "learning_rate": 1.1904211659290627e-06, + "loss": 0.5461, + "step": 15296 + }, + { + "epoch": 4.060799150404885, + "grad_norm": 0.4651667451206629, + "learning_rate": 1.1901237838990385e-06, + "loss": 0.5357, + "step": 15297 + }, + { + "epoch": 4.061064648878269, + "grad_norm": 0.477377319462052, + "learning_rate": 1.1898264274148067e-06, + "loss": 0.5214, + "step": 15298 + }, + { + "epoch": 4.0613301473516525, + "grad_norm": 0.4872921338439011, + "learning_rate": 1.1895290964821685e-06, + "loss": 0.5331, + "step": 15299 + }, + { + "epoch": 4.061595645825037, + "grad_norm": 0.46819902409357517, + "learning_rate": 1.1892317911069212e-06, + "loss": 0.5272, + "step": 15300 + }, + { + "epoch": 4.061861144298421, + "grad_norm": 0.4630313282628995, + "learning_rate": 1.1889345112948624e-06, + "loss": 0.546, + "step": 15301 + }, + { + "epoch": 4.062126642771804, + "grad_norm": 0.4821826806296449, + "learning_rate": 1.1886372570517917e-06, + "loss": 0.518, + "step": 15302 + }, + { + "epoch": 4.062392141245188, + "grad_norm": 0.47594826621935443, + "learning_rate": 1.1883400283835044e-06, + "loss": 0.5247, + "step": 15303 + }, + { + "epoch": 4.062657639718571, + "grad_norm": 0.4833855245415668, + "learning_rate": 1.1880428252958001e-06, + "loss": 0.5878, + "step": 15304 + }, + { + "epoch": 4.062923138191955, + "grad_norm": 0.45649196946309656, + "learning_rate": 1.1877456477944707e-06, + "loss": 0.5236, + "step": 15305 + }, + { + "epoch": 4.063188636665339, + "grad_norm": 0.4557553307661307, + "learning_rate": 1.1874484958853153e-06, + "loss": 0.5304, + "step": 15306 + }, + { + "epoch": 4.063454135138723, + "grad_norm": 0.4711309149466802, + "learning_rate": 1.187151369574127e-06, + "loss": 0.5337, + "step": 15307 + }, + { + "epoch": 4.063719633612107, + "grad_norm": 0.47590406561081133, + "learning_rate": 1.186854268866702e-06, + "loss": 0.503, + "step": 15308 + }, + { + "epoch": 4.063985132085491, + "grad_norm": 0.48984722589565893, + "learning_rate": 1.186557193768833e-06, + "loss": 0.54, + "step": 15309 + }, + { + "epoch": 4.064250630558874, + "grad_norm": 0.48796745471143543, + "learning_rate": 1.1862601442863158e-06, + "loss": 0.553, + "step": 15310 + }, + { + "epoch": 4.064516129032258, + "grad_norm": 0.4765259659436347, + "learning_rate": 1.1859631204249422e-06, + "loss": 0.5454, + "step": 15311 + }, + { + "epoch": 4.064781627505642, + "grad_norm": 0.47654842225866384, + "learning_rate": 1.1856661221905041e-06, + "loss": 0.5581, + "step": 15312 + }, + { + "epoch": 4.065047125979025, + "grad_norm": 0.48410421429554673, + "learning_rate": 1.1853691495887958e-06, + "loss": 0.5462, + "step": 15313 + }, + { + "epoch": 4.065312624452409, + "grad_norm": 0.47497710593538495, + "learning_rate": 1.1850722026256067e-06, + "loss": 0.5135, + "step": 15314 + }, + { + "epoch": 4.0655781229257935, + "grad_norm": 0.47271319237251674, + "learning_rate": 1.1847752813067304e-06, + "loss": 0.5448, + "step": 15315 + }, + { + "epoch": 4.065843621399177, + "grad_norm": 0.4861264911121132, + "learning_rate": 1.184478385637956e-06, + "loss": 0.5708, + "step": 15316 + }, + { + "epoch": 4.066109119872561, + "grad_norm": 0.4816718011082714, + "learning_rate": 1.184181515625074e-06, + "loss": 0.5158, + "step": 15317 + }, + { + "epoch": 4.066374618345945, + "grad_norm": 0.4826982993719964, + "learning_rate": 1.1838846712738735e-06, + "loss": 0.5214, + "step": 15318 + }, + { + "epoch": 4.066640116819328, + "grad_norm": 0.46762958161854784, + "learning_rate": 1.1835878525901443e-06, + "loss": 0.5266, + "step": 15319 + }, + { + "epoch": 4.066905615292712, + "grad_norm": 0.4791146882970028, + "learning_rate": 1.1832910595796757e-06, + "loss": 0.5227, + "step": 15320 + }, + { + "epoch": 4.067171113766096, + "grad_norm": 0.46856974905413734, + "learning_rate": 1.1829942922482545e-06, + "loss": 0.5296, + "step": 15321 + }, + { + "epoch": 4.067436612239479, + "grad_norm": 0.4640277367142785, + "learning_rate": 1.1826975506016714e-06, + "loss": 0.5401, + "step": 15322 + }, + { + "epoch": 4.0677021107128635, + "grad_norm": 0.4743643068830104, + "learning_rate": 1.182400834645709e-06, + "loss": 0.581, + "step": 15323 + }, + { + "epoch": 4.067967609186248, + "grad_norm": 0.4632200314572056, + "learning_rate": 1.1821041443861575e-06, + "loss": 0.5132, + "step": 15324 + }, + { + "epoch": 4.068233107659631, + "grad_norm": 0.46129276437946515, + "learning_rate": 1.1818074798288012e-06, + "loss": 0.5575, + "step": 15325 + }, + { + "epoch": 4.068498606133015, + "grad_norm": 0.4613027380576767, + "learning_rate": 1.1815108409794273e-06, + "loss": 0.5064, + "step": 15326 + }, + { + "epoch": 4.068764104606399, + "grad_norm": 0.4753145277392267, + "learning_rate": 1.1812142278438202e-06, + "loss": 0.5625, + "step": 15327 + }, + { + "epoch": 4.069029603079782, + "grad_norm": 0.4805050311831935, + "learning_rate": 1.1809176404277637e-06, + "loss": 0.5367, + "step": 15328 + }, + { + "epoch": 4.069295101553166, + "grad_norm": 0.47317748272457183, + "learning_rate": 1.1806210787370436e-06, + "loss": 0.5285, + "step": 15329 + }, + { + "epoch": 4.06956060002655, + "grad_norm": 0.46825559705124414, + "learning_rate": 1.1803245427774424e-06, + "loss": 0.5169, + "step": 15330 + }, + { + "epoch": 4.0698260984999335, + "grad_norm": 0.476465886374575, + "learning_rate": 1.1800280325547445e-06, + "loss": 0.5323, + "step": 15331 + }, + { + "epoch": 4.070091596973318, + "grad_norm": 0.4532279973895195, + "learning_rate": 1.1797315480747307e-06, + "loss": 0.4969, + "step": 15332 + }, + { + "epoch": 4.070357095446701, + "grad_norm": 0.4756226834657503, + "learning_rate": 1.179435089343186e-06, + "loss": 0.5649, + "step": 15333 + }, + { + "epoch": 4.070622593920085, + "grad_norm": 0.4495892052217501, + "learning_rate": 1.179138656365889e-06, + "loss": 0.5048, + "step": 15334 + }, + { + "epoch": 4.070888092393469, + "grad_norm": 0.4783500987244886, + "learning_rate": 1.1788422491486228e-06, + "loss": 0.4976, + "step": 15335 + }, + { + "epoch": 4.071153590866852, + "grad_norm": 0.4814531287656348, + "learning_rate": 1.1785458676971666e-06, + "loss": 0.5444, + "step": 15336 + }, + { + "epoch": 4.071419089340236, + "grad_norm": 0.4726424582963027, + "learning_rate": 1.1782495120173024e-06, + "loss": 0.57, + "step": 15337 + }, + { + "epoch": 4.07168458781362, + "grad_norm": 0.47045057779885807, + "learning_rate": 1.1779531821148083e-06, + "loss": 0.5419, + "step": 15338 + }, + { + "epoch": 4.0719500862870035, + "grad_norm": 0.4995612227957978, + "learning_rate": 1.1776568779954644e-06, + "loss": 0.579, + "step": 15339 + }, + { + "epoch": 4.072215584760388, + "grad_norm": 0.47865369747694886, + "learning_rate": 1.1773605996650495e-06, + "loss": 0.5494, + "step": 15340 + }, + { + "epoch": 4.072481083233772, + "grad_norm": 0.4756500319571732, + "learning_rate": 1.1770643471293403e-06, + "loss": 0.551, + "step": 15341 + }, + { + "epoch": 4.072746581707155, + "grad_norm": 0.47400874728396725, + "learning_rate": 1.1767681203941165e-06, + "loss": 0.5416, + "step": 15342 + }, + { + "epoch": 4.073012080180539, + "grad_norm": 0.45946739185190155, + "learning_rate": 1.1764719194651528e-06, + "loss": 0.5346, + "step": 15343 + }, + { + "epoch": 4.073277578653923, + "grad_norm": 0.47532188945572756, + "learning_rate": 1.1761757443482285e-06, + "loss": 0.5285, + "step": 15344 + }, + { + "epoch": 4.073543077127306, + "grad_norm": 0.4797388114666441, + "learning_rate": 1.1758795950491184e-06, + "loss": 0.5798, + "step": 15345 + }, + { + "epoch": 4.07380857560069, + "grad_norm": 0.485271505679119, + "learning_rate": 1.1755834715735973e-06, + "loss": 0.528, + "step": 15346 + }, + { + "epoch": 4.074074074074074, + "grad_norm": 0.47970389572301153, + "learning_rate": 1.175287373927442e-06, + "loss": 0.5382, + "step": 15347 + }, + { + "epoch": 4.074339572547458, + "grad_norm": 0.46593528882108654, + "learning_rate": 1.1749913021164255e-06, + "loss": 0.5276, + "step": 15348 + }, + { + "epoch": 4.074605071020842, + "grad_norm": 0.48487399286365107, + "learning_rate": 1.1746952561463238e-06, + "loss": 0.4979, + "step": 15349 + }, + { + "epoch": 4.074870569494226, + "grad_norm": 0.45727755486644306, + "learning_rate": 1.1743992360229096e-06, + "loss": 0.5512, + "step": 15350 + }, + { + "epoch": 4.075136067967609, + "grad_norm": 0.48028764181032724, + "learning_rate": 1.1741032417519557e-06, + "loss": 0.5581, + "step": 15351 + }, + { + "epoch": 4.075401566440993, + "grad_norm": 0.47768840036637783, + "learning_rate": 1.1738072733392343e-06, + "loss": 0.5156, + "step": 15352 + }, + { + "epoch": 4.075667064914377, + "grad_norm": 0.4601302356588837, + "learning_rate": 1.1735113307905186e-06, + "loss": 0.5285, + "step": 15353 + }, + { + "epoch": 4.07593256338776, + "grad_norm": 0.46846694396161453, + "learning_rate": 1.1732154141115793e-06, + "loss": 0.5157, + "step": 15354 + }, + { + "epoch": 4.076198061861144, + "grad_norm": 0.4812337278251216, + "learning_rate": 1.1729195233081885e-06, + "loss": 0.5711, + "step": 15355 + }, + { + "epoch": 4.0764635603345285, + "grad_norm": 0.4692316621318943, + "learning_rate": 1.1726236583861164e-06, + "loss": 0.5394, + "step": 15356 + }, + { + "epoch": 4.076729058807912, + "grad_norm": 0.49621358990959286, + "learning_rate": 1.1723278193511322e-06, + "loss": 0.5581, + "step": 15357 + }, + { + "epoch": 4.076994557281296, + "grad_norm": 0.46629862975972525, + "learning_rate": 1.172032006209007e-06, + "loss": 0.5088, + "step": 15358 + }, + { + "epoch": 4.077260055754679, + "grad_norm": 0.4678915136677538, + "learning_rate": 1.1717362189655081e-06, + "loss": 0.513, + "step": 15359 + }, + { + "epoch": 4.077525554228063, + "grad_norm": 0.48223867474177495, + "learning_rate": 1.1714404576264063e-06, + "loss": 0.5577, + "step": 15360 + }, + { + "epoch": 4.077791052701447, + "grad_norm": 0.48019797143928405, + "learning_rate": 1.1711447221974672e-06, + "loss": 0.5247, + "step": 15361 + }, + { + "epoch": 4.07805655117483, + "grad_norm": 0.4772406195961187, + "learning_rate": 1.1708490126844615e-06, + "loss": 0.5424, + "step": 15362 + }, + { + "epoch": 4.078322049648214, + "grad_norm": 0.4769514324409852, + "learning_rate": 1.170553329093153e-06, + "loss": 0.5464, + "step": 15363 + }, + { + "epoch": 4.0785875481215985, + "grad_norm": 0.49150032153037826, + "learning_rate": 1.170257671429309e-06, + "loss": 0.5736, + "step": 15364 + }, + { + "epoch": 4.078853046594982, + "grad_norm": 0.4828193422297218, + "learning_rate": 1.1699620396986974e-06, + "loss": 0.545, + "step": 15365 + }, + { + "epoch": 4.079118545068366, + "grad_norm": 0.4845799788006039, + "learning_rate": 1.1696664339070815e-06, + "loss": 0.5521, + "step": 15366 + }, + { + "epoch": 4.07938404354175, + "grad_norm": 0.4830359249173579, + "learning_rate": 1.1693708540602284e-06, + "loss": 0.5666, + "step": 15367 + }, + { + "epoch": 4.079649542015133, + "grad_norm": 0.4733848034091678, + "learning_rate": 1.1690753001639012e-06, + "loss": 0.5385, + "step": 15368 + }, + { + "epoch": 4.079915040488517, + "grad_norm": 0.4853737838345869, + "learning_rate": 1.1687797722238647e-06, + "loss": 0.5176, + "step": 15369 + }, + { + "epoch": 4.080180538961901, + "grad_norm": 0.4721494892442286, + "learning_rate": 1.1684842702458812e-06, + "loss": 0.5039, + "step": 15370 + }, + { + "epoch": 4.0804460374352844, + "grad_norm": 0.46281745321023837, + "learning_rate": 1.1681887942357154e-06, + "loss": 0.5414, + "step": 15371 + }, + { + "epoch": 4.0807115359086685, + "grad_norm": 0.46314902099768956, + "learning_rate": 1.1678933441991278e-06, + "loss": 0.5339, + "step": 15372 + }, + { + "epoch": 4.080977034382053, + "grad_norm": 0.47762926617506124, + "learning_rate": 1.1675979201418824e-06, + "loss": 0.5588, + "step": 15373 + }, + { + "epoch": 4.081242532855436, + "grad_norm": 0.4804263269469661, + "learning_rate": 1.1673025220697401e-06, + "loss": 0.5445, + "step": 15374 + }, + { + "epoch": 4.08150803132882, + "grad_norm": 0.4587883281413736, + "learning_rate": 1.1670071499884608e-06, + "loss": 0.4721, + "step": 15375 + }, + { + "epoch": 4.081773529802204, + "grad_norm": 0.4885394960529969, + "learning_rate": 1.1667118039038063e-06, + "loss": 0.5525, + "step": 15376 + }, + { + "epoch": 4.082039028275587, + "grad_norm": 0.4819447851660282, + "learning_rate": 1.1664164838215355e-06, + "loss": 0.5116, + "step": 15377 + }, + { + "epoch": 4.082304526748971, + "grad_norm": 0.46866352339357137, + "learning_rate": 1.1661211897474092e-06, + "loss": 0.5282, + "step": 15378 + }, + { + "epoch": 4.082570025222355, + "grad_norm": 0.4686368275561569, + "learning_rate": 1.1658259216871858e-06, + "loss": 0.5611, + "step": 15379 + }, + { + "epoch": 4.0828355236957385, + "grad_norm": 0.46414397678079544, + "learning_rate": 1.1655306796466234e-06, + "loss": 0.507, + "step": 15380 + }, + { + "epoch": 4.083101022169123, + "grad_norm": 0.4731892407955562, + "learning_rate": 1.1652354636314792e-06, + "loss": 0.5453, + "step": 15381 + }, + { + "epoch": 4.083366520642507, + "grad_norm": 0.4929337264327907, + "learning_rate": 1.1649402736475124e-06, + "loss": 0.5067, + "step": 15382 + }, + { + "epoch": 4.08363201911589, + "grad_norm": 0.4660932981669949, + "learning_rate": 1.1646451097004781e-06, + "loss": 0.5056, + "step": 15383 + }, + { + "epoch": 4.083897517589274, + "grad_norm": 0.4707144092539894, + "learning_rate": 1.164349971796134e-06, + "loss": 0.5582, + "step": 15384 + }, + { + "epoch": 4.084163016062657, + "grad_norm": 0.48288355973386526, + "learning_rate": 1.1640548599402371e-06, + "loss": 0.5329, + "step": 15385 + }, + { + "epoch": 4.084428514536041, + "grad_norm": 0.4935139811592437, + "learning_rate": 1.1637597741385398e-06, + "loss": 0.5176, + "step": 15386 + }, + { + "epoch": 4.084694013009425, + "grad_norm": 0.47923897780261815, + "learning_rate": 1.1634647143967994e-06, + "loss": 0.5724, + "step": 15387 + }, + { + "epoch": 4.084959511482809, + "grad_norm": 0.4790410497393249, + "learning_rate": 1.1631696807207688e-06, + "loss": 0.4873, + "step": 15388 + }, + { + "epoch": 4.085225009956193, + "grad_norm": 0.49323182560277307, + "learning_rate": 1.162874673116203e-06, + "loss": 0.5239, + "step": 15389 + }, + { + "epoch": 4.085490508429577, + "grad_norm": 0.47724023629664386, + "learning_rate": 1.1625796915888544e-06, + "loss": 0.5806, + "step": 15390 + }, + { + "epoch": 4.08575600690296, + "grad_norm": 0.47681147842768906, + "learning_rate": 1.1622847361444773e-06, + "loss": 0.5379, + "step": 15391 + }, + { + "epoch": 4.086021505376344, + "grad_norm": 0.47231315409210184, + "learning_rate": 1.1619898067888226e-06, + "loss": 0.5381, + "step": 15392 + }, + { + "epoch": 4.086287003849728, + "grad_norm": 0.4695759483539134, + "learning_rate": 1.1616949035276423e-06, + "loss": 0.5401, + "step": 15393 + }, + { + "epoch": 4.086552502323111, + "grad_norm": 0.4789459177210307, + "learning_rate": 1.161400026366689e-06, + "loss": 0.5251, + "step": 15394 + }, + { + "epoch": 4.086818000796495, + "grad_norm": 0.4634008031719571, + "learning_rate": 1.1611051753117117e-06, + "loss": 0.5399, + "step": 15395 + }, + { + "epoch": 4.0870834992698795, + "grad_norm": 0.46131747788778044, + "learning_rate": 1.1608103503684623e-06, + "loss": 0.5137, + "step": 15396 + }, + { + "epoch": 4.087348997743263, + "grad_norm": 0.45453070236140375, + "learning_rate": 1.1605155515426903e-06, + "loss": 0.5037, + "step": 15397 + }, + { + "epoch": 4.087614496216647, + "grad_norm": 0.46897052288012575, + "learning_rate": 1.1602207788401446e-06, + "loss": 0.5551, + "step": 15398 + }, + { + "epoch": 4.087879994690031, + "grad_norm": 0.4785642935837407, + "learning_rate": 1.1599260322665728e-06, + "loss": 0.548, + "step": 15399 + }, + { + "epoch": 4.088145493163414, + "grad_norm": 0.4698628269754639, + "learning_rate": 1.1596313118277257e-06, + "loss": 0.5273, + "step": 15400 + }, + { + "epoch": 4.088410991636798, + "grad_norm": 0.47769471394607665, + "learning_rate": 1.1593366175293488e-06, + "loss": 0.5739, + "step": 15401 + }, + { + "epoch": 4.088676490110182, + "grad_norm": 0.4815844365115442, + "learning_rate": 1.1590419493771915e-06, + "loss": 0.5472, + "step": 15402 + }, + { + "epoch": 4.088941988583565, + "grad_norm": 0.4744064643503215, + "learning_rate": 1.1587473073769991e-06, + "loss": 0.5597, + "step": 15403 + }, + { + "epoch": 4.0892074870569495, + "grad_norm": 0.4709593160176362, + "learning_rate": 1.1584526915345174e-06, + "loss": 0.5319, + "step": 15404 + }, + { + "epoch": 4.089472985530334, + "grad_norm": 0.47509094804297247, + "learning_rate": 1.1581581018554939e-06, + "loss": 0.5517, + "step": 15405 + }, + { + "epoch": 4.089738484003717, + "grad_norm": 0.46634770021070465, + "learning_rate": 1.1578635383456716e-06, + "loss": 0.5461, + "step": 15406 + }, + { + "epoch": 4.090003982477101, + "grad_norm": 0.46550451517465524, + "learning_rate": 1.1575690010107976e-06, + "loss": 0.5517, + "step": 15407 + }, + { + "epoch": 4.090269480950485, + "grad_norm": 0.489951664934729, + "learning_rate": 1.1572744898566146e-06, + "loss": 0.5859, + "step": 15408 + }, + { + "epoch": 4.090534979423868, + "grad_norm": 0.48053420500710897, + "learning_rate": 1.156980004888866e-06, + "loss": 0.5589, + "step": 15409 + }, + { + "epoch": 4.090800477897252, + "grad_norm": 0.46893462199055064, + "learning_rate": 1.1566855461132963e-06, + "loss": 0.5376, + "step": 15410 + }, + { + "epoch": 4.091065976370636, + "grad_norm": 0.4588324628884706, + "learning_rate": 1.1563911135356468e-06, + "loss": 0.5344, + "step": 15411 + }, + { + "epoch": 4.0913314748440195, + "grad_norm": 0.4822858221906309, + "learning_rate": 1.1560967071616612e-06, + "loss": 0.5305, + "step": 15412 + }, + { + "epoch": 4.091596973317404, + "grad_norm": 0.4788976470478269, + "learning_rate": 1.155802326997079e-06, + "loss": 0.5284, + "step": 15413 + }, + { + "epoch": 4.091862471790787, + "grad_norm": 0.48463632455514644, + "learning_rate": 1.1555079730476448e-06, + "loss": 0.5681, + "step": 15414 + }, + { + "epoch": 4.092127970264171, + "grad_norm": 0.47671625881163676, + "learning_rate": 1.155213645319095e-06, + "loss": 0.5436, + "step": 15415 + }, + { + "epoch": 4.092393468737555, + "grad_norm": 0.47351882971634107, + "learning_rate": 1.1549193438171724e-06, + "loss": 0.532, + "step": 15416 + }, + { + "epoch": 4.092658967210938, + "grad_norm": 0.4833109004186374, + "learning_rate": 1.1546250685476155e-06, + "loss": 0.5336, + "step": 15417 + }, + { + "epoch": 4.092924465684322, + "grad_norm": 0.47087944813485966, + "learning_rate": 1.1543308195161643e-06, + "loss": 0.5163, + "step": 15418 + }, + { + "epoch": 4.093189964157706, + "grad_norm": 0.4647098002558098, + "learning_rate": 1.154036596728556e-06, + "loss": 0.5154, + "step": 15419 + }, + { + "epoch": 4.0934554626310895, + "grad_norm": 0.4640195698522609, + "learning_rate": 1.1537424001905309e-06, + "loss": 0.5534, + "step": 15420 + }, + { + "epoch": 4.093720961104474, + "grad_norm": 0.4722462043037923, + "learning_rate": 1.1534482299078247e-06, + "loss": 0.4885, + "step": 15421 + }, + { + "epoch": 4.093986459577858, + "grad_norm": 0.47256089414829705, + "learning_rate": 1.1531540858861741e-06, + "loss": 0.5402, + "step": 15422 + }, + { + "epoch": 4.094251958051241, + "grad_norm": 0.47321317654208594, + "learning_rate": 1.1528599681313177e-06, + "loss": 0.5385, + "step": 15423 + }, + { + "epoch": 4.094517456524625, + "grad_norm": 0.4844058743497153, + "learning_rate": 1.152565876648989e-06, + "loss": 0.5435, + "step": 15424 + }, + { + "epoch": 4.094782954998009, + "grad_norm": 0.46486715437271514, + "learning_rate": 1.1522718114449272e-06, + "loss": 0.518, + "step": 15425 + }, + { + "epoch": 4.095048453471392, + "grad_norm": 0.4624993544497032, + "learning_rate": 1.1519777725248627e-06, + "loss": 0.5102, + "step": 15426 + }, + { + "epoch": 4.095313951944776, + "grad_norm": 0.4732863416922178, + "learning_rate": 1.151683759894533e-06, + "loss": 0.5273, + "step": 15427 + }, + { + "epoch": 4.09557945041816, + "grad_norm": 0.46464623280533895, + "learning_rate": 1.1513897735596702e-06, + "loss": 0.5183, + "step": 15428 + }, + { + "epoch": 4.095844948891544, + "grad_norm": 0.4663027851831423, + "learning_rate": 1.15109581352601e-06, + "loss": 0.5569, + "step": 15429 + }, + { + "epoch": 4.096110447364928, + "grad_norm": 0.4902728855169866, + "learning_rate": 1.1508018797992832e-06, + "loss": 0.5314, + "step": 15430 + }, + { + "epoch": 4.096375945838312, + "grad_norm": 0.4821755261176922, + "learning_rate": 1.150507972385224e-06, + "loss": 0.5778, + "step": 15431 + }, + { + "epoch": 4.096641444311695, + "grad_norm": 0.4573758376930445, + "learning_rate": 1.1502140912895633e-06, + "loss": 0.5022, + "step": 15432 + }, + { + "epoch": 4.096906942785079, + "grad_norm": 0.4871239544339889, + "learning_rate": 1.1499202365180317e-06, + "loss": 0.5216, + "step": 15433 + }, + { + "epoch": 4.097172441258463, + "grad_norm": 0.47534498859128965, + "learning_rate": 1.1496264080763622e-06, + "loss": 0.5349, + "step": 15434 + }, + { + "epoch": 4.097437939731846, + "grad_norm": 0.48286790749003305, + "learning_rate": 1.149332605970283e-06, + "loss": 0.5383, + "step": 15435 + }, + { + "epoch": 4.09770343820523, + "grad_norm": 0.4751447745762735, + "learning_rate": 1.1490388302055257e-06, + "loss": 0.5044, + "step": 15436 + }, + { + "epoch": 4.0979689366786145, + "grad_norm": 0.4627625048492895, + "learning_rate": 1.1487450807878196e-06, + "loss": 0.5248, + "step": 15437 + }, + { + "epoch": 4.098234435151998, + "grad_norm": 0.47613938667242905, + "learning_rate": 1.148451357722891e-06, + "loss": 0.5557, + "step": 15438 + }, + { + "epoch": 4.098499933625382, + "grad_norm": 0.48247944211328836, + "learning_rate": 1.148157661016472e-06, + "loss": 0.5588, + "step": 15439 + }, + { + "epoch": 4.098765432098766, + "grad_norm": 0.4799147552815529, + "learning_rate": 1.147863990674287e-06, + "loss": 0.541, + "step": 15440 + }, + { + "epoch": 4.099030930572149, + "grad_norm": 0.4694703630859369, + "learning_rate": 1.1475703467020658e-06, + "loss": 0.5573, + "step": 15441 + }, + { + "epoch": 4.099296429045533, + "grad_norm": 0.48546992650724763, + "learning_rate": 1.147276729105533e-06, + "loss": 0.5859, + "step": 15442 + }, + { + "epoch": 4.099561927518916, + "grad_norm": 0.47419531800443093, + "learning_rate": 1.1469831378904183e-06, + "loss": 0.4804, + "step": 15443 + }, + { + "epoch": 4.0998274259923, + "grad_norm": 0.48278851249749727, + "learning_rate": 1.146689573062443e-06, + "loss": 0.5685, + "step": 15444 + }, + { + "epoch": 4.1000929244656845, + "grad_norm": 0.4649842514134834, + "learning_rate": 1.1463960346273356e-06, + "loss": 0.5503, + "step": 15445 + }, + { + "epoch": 4.100358422939068, + "grad_norm": 0.4728270775971377, + "learning_rate": 1.1461025225908188e-06, + "loss": 0.5665, + "step": 15446 + }, + { + "epoch": 4.100623921412452, + "grad_norm": 0.47134001836755457, + "learning_rate": 1.1458090369586186e-06, + "loss": 0.5334, + "step": 15447 + }, + { + "epoch": 4.100889419885836, + "grad_norm": 0.4735592696378864, + "learning_rate": 1.1455155777364576e-06, + "loss": 0.5448, + "step": 15448 + }, + { + "epoch": 4.101154918359219, + "grad_norm": 0.4833817732500202, + "learning_rate": 1.1452221449300588e-06, + "loss": 0.561, + "step": 15449 + }, + { + "epoch": 4.101420416832603, + "grad_norm": 0.4774546010657383, + "learning_rate": 1.1449287385451458e-06, + "loss": 0.5265, + "step": 15450 + }, + { + "epoch": 4.101685915305987, + "grad_norm": 0.4756031557966933, + "learning_rate": 1.1446353585874393e-06, + "loss": 0.5602, + "step": 15451 + }, + { + "epoch": 4.1019514137793704, + "grad_norm": 0.4735285706441791, + "learning_rate": 1.1443420050626624e-06, + "loss": 0.5511, + "step": 15452 + }, + { + "epoch": 4.1022169122527545, + "grad_norm": 0.4823422993637103, + "learning_rate": 1.1440486779765352e-06, + "loss": 0.5355, + "step": 15453 + }, + { + "epoch": 4.102482410726139, + "grad_norm": 0.47948960013614494, + "learning_rate": 1.1437553773347804e-06, + "loss": 0.547, + "step": 15454 + }, + { + "epoch": 4.102747909199522, + "grad_norm": 0.48302260518832413, + "learning_rate": 1.1434621031431145e-06, + "loss": 0.561, + "step": 15455 + }, + { + "epoch": 4.103013407672906, + "grad_norm": 0.480244665964944, + "learning_rate": 1.143168855407259e-06, + "loss": 0.5575, + "step": 15456 + }, + { + "epoch": 4.10327890614629, + "grad_norm": 0.47675986978699214, + "learning_rate": 1.1428756341329338e-06, + "loss": 0.5334, + "step": 15457 + }, + { + "epoch": 4.103544404619673, + "grad_norm": 0.4821504789013667, + "learning_rate": 1.142582439325856e-06, + "loss": 0.5332, + "step": 15458 + }, + { + "epoch": 4.103809903093057, + "grad_norm": 0.47611330885118996, + "learning_rate": 1.1422892709917446e-06, + "loss": 0.5315, + "step": 15459 + }, + { + "epoch": 4.104075401566441, + "grad_norm": 0.48452878116476633, + "learning_rate": 1.141996129136317e-06, + "loss": 0.5499, + "step": 15460 + }, + { + "epoch": 4.1043409000398245, + "grad_norm": 0.48164522330123033, + "learning_rate": 1.1417030137652896e-06, + "loss": 0.5777, + "step": 15461 + }, + { + "epoch": 4.104606398513209, + "grad_norm": 0.48619550810479, + "learning_rate": 1.1414099248843782e-06, + "loss": 0.5501, + "step": 15462 + }, + { + "epoch": 4.104871896986593, + "grad_norm": 0.4848212799772456, + "learning_rate": 1.1411168624993004e-06, + "loss": 0.5339, + "step": 15463 + }, + { + "epoch": 4.105137395459976, + "grad_norm": 0.4887231954192791, + "learning_rate": 1.1408238266157703e-06, + "loss": 0.5489, + "step": 15464 + }, + { + "epoch": 4.10540289393336, + "grad_norm": 0.49186598726095804, + "learning_rate": 1.1405308172395043e-06, + "loss": 0.5421, + "step": 15465 + }, + { + "epoch": 4.105668392406744, + "grad_norm": 0.4833677954385405, + "learning_rate": 1.140237834376216e-06, + "loss": 0.5036, + "step": 15466 + }, + { + "epoch": 4.105933890880127, + "grad_norm": 0.4787778057518926, + "learning_rate": 1.139944878031618e-06, + "loss": 0.5486, + "step": 15467 + }, + { + "epoch": 4.106199389353511, + "grad_norm": 0.4835492936274251, + "learning_rate": 1.139651948211426e-06, + "loss": 0.5459, + "step": 15468 + }, + { + "epoch": 4.106464887826895, + "grad_norm": 0.47353895102262933, + "learning_rate": 1.139359044921351e-06, + "loss": 0.5282, + "step": 15469 + }, + { + "epoch": 4.106730386300279, + "grad_norm": 0.46089384567683056, + "learning_rate": 1.1390661681671066e-06, + "loss": 0.5328, + "step": 15470 + }, + { + "epoch": 4.106995884773663, + "grad_norm": 0.48443599195542786, + "learning_rate": 1.1387733179544042e-06, + "loss": 0.5242, + "step": 15471 + }, + { + "epoch": 4.107261383247046, + "grad_norm": 0.49848100963334163, + "learning_rate": 1.1384804942889549e-06, + "loss": 0.5665, + "step": 15472 + }, + { + "epoch": 4.10752688172043, + "grad_norm": 0.4722537934171106, + "learning_rate": 1.1381876971764688e-06, + "loss": 0.558, + "step": 15473 + }, + { + "epoch": 4.107792380193814, + "grad_norm": 0.45797572626807087, + "learning_rate": 1.1378949266226577e-06, + "loss": 0.5132, + "step": 15474 + }, + { + "epoch": 4.108057878667197, + "grad_norm": 0.4733250590764185, + "learning_rate": 1.1376021826332295e-06, + "loss": 0.5505, + "step": 15475 + }, + { + "epoch": 4.108323377140581, + "grad_norm": 0.5381160805824715, + "learning_rate": 1.1373094652138945e-06, + "loss": 0.5351, + "step": 15476 + }, + { + "epoch": 4.1085888756139655, + "grad_norm": 0.48283512177860494, + "learning_rate": 1.1370167743703634e-06, + "loss": 0.5365, + "step": 15477 + }, + { + "epoch": 4.108854374087349, + "grad_norm": 0.49282344315052784, + "learning_rate": 1.1367241101083404e-06, + "loss": 0.555, + "step": 15478 + }, + { + "epoch": 4.109119872560733, + "grad_norm": 0.478607039300628, + "learning_rate": 1.1364314724335359e-06, + "loss": 0.5047, + "step": 15479 + }, + { + "epoch": 4.109385371034117, + "grad_norm": 0.4850599282933851, + "learning_rate": 1.1361388613516555e-06, + "loss": 0.5764, + "step": 15480 + }, + { + "epoch": 4.1096508695075, + "grad_norm": 0.4824368961899814, + "learning_rate": 1.1358462768684075e-06, + "loss": 0.5073, + "step": 15481 + }, + { + "epoch": 4.109916367980884, + "grad_norm": 0.4950245034499109, + "learning_rate": 1.1355537189894963e-06, + "loss": 0.5355, + "step": 15482 + }, + { + "epoch": 4.110181866454268, + "grad_norm": 0.47658368926283523, + "learning_rate": 1.135261187720629e-06, + "loss": 0.5754, + "step": 15483 + }, + { + "epoch": 4.110447364927651, + "grad_norm": 0.46280756800415745, + "learning_rate": 1.1349686830675102e-06, + "loss": 0.5353, + "step": 15484 + }, + { + "epoch": 4.1107128634010355, + "grad_norm": 0.47251089894170517, + "learning_rate": 1.1346762050358434e-06, + "loss": 0.5322, + "step": 15485 + }, + { + "epoch": 4.11097836187442, + "grad_norm": 0.48628681443600474, + "learning_rate": 1.1343837536313342e-06, + "loss": 0.5064, + "step": 15486 + }, + { + "epoch": 4.111243860347803, + "grad_norm": 0.47553115784868455, + "learning_rate": 1.1340913288596846e-06, + "loss": 0.5134, + "step": 15487 + }, + { + "epoch": 4.111509358821187, + "grad_norm": 0.48641577675168407, + "learning_rate": 1.1337989307265995e-06, + "loss": 0.5705, + "step": 15488 + }, + { + "epoch": 4.111774857294571, + "grad_norm": 0.47735331959611227, + "learning_rate": 1.13350655923778e-06, + "loss": 0.5261, + "step": 15489 + }, + { + "epoch": 4.112040355767954, + "grad_norm": 0.4660302700507674, + "learning_rate": 1.1332142143989285e-06, + "loss": 0.4889, + "step": 15490 + }, + { + "epoch": 4.112305854241338, + "grad_norm": 0.47030674494640967, + "learning_rate": 1.1329218962157454e-06, + "loss": 0.5025, + "step": 15491 + }, + { + "epoch": 4.112571352714722, + "grad_norm": 0.46511406413352496, + "learning_rate": 1.1326296046939334e-06, + "loss": 0.5302, + "step": 15492 + }, + { + "epoch": 4.1128368511881055, + "grad_norm": 0.4734691688143414, + "learning_rate": 1.1323373398391909e-06, + "loss": 0.5662, + "step": 15493 + }, + { + "epoch": 4.11310234966149, + "grad_norm": 0.46993128516122984, + "learning_rate": 1.13204510165722e-06, + "loss": 0.5855, + "step": 15494 + }, + { + "epoch": 4.113367848134874, + "grad_norm": 0.46978789182989583, + "learning_rate": 1.131752890153719e-06, + "loss": 0.5553, + "step": 15495 + }, + { + "epoch": 4.113633346608257, + "grad_norm": 0.46397060441926696, + "learning_rate": 1.1314607053343857e-06, + "loss": 0.5083, + "step": 15496 + }, + { + "epoch": 4.113898845081641, + "grad_norm": 0.4836015882237248, + "learning_rate": 1.1311685472049204e-06, + "loss": 0.5447, + "step": 15497 + }, + { + "epoch": 4.114164343555024, + "grad_norm": 0.473901865326143, + "learning_rate": 1.1308764157710187e-06, + "loss": 0.5489, + "step": 15498 + }, + { + "epoch": 4.114429842028408, + "grad_norm": 0.4794470876563034, + "learning_rate": 1.13058431103838e-06, + "loss": 0.5278, + "step": 15499 + }, + { + "epoch": 4.114695340501792, + "grad_norm": 0.49734911428279194, + "learning_rate": 1.1302922330127005e-06, + "loss": 0.5242, + "step": 15500 + }, + { + "epoch": 4.1149608389751755, + "grad_norm": 0.4805064052929783, + "learning_rate": 1.130000181699675e-06, + "loss": 0.4972, + "step": 15501 + }, + { + "epoch": 4.11522633744856, + "grad_norm": 0.4627186404709467, + "learning_rate": 1.1297081571050014e-06, + "loss": 0.5646, + "step": 15502 + }, + { + "epoch": 4.115491835921944, + "grad_norm": 0.47185857201879167, + "learning_rate": 1.1294161592343725e-06, + "loss": 0.5281, + "step": 15503 + }, + { + "epoch": 4.115757334395327, + "grad_norm": 0.47484701618902164, + "learning_rate": 1.1291241880934855e-06, + "loss": 0.5537, + "step": 15504 + }, + { + "epoch": 4.116022832868711, + "grad_norm": 0.5006905644164343, + "learning_rate": 1.1288322436880322e-06, + "loss": 0.5576, + "step": 15505 + }, + { + "epoch": 4.116288331342095, + "grad_norm": 0.4940666638969997, + "learning_rate": 1.1285403260237093e-06, + "loss": 0.5328, + "step": 15506 + }, + { + "epoch": 4.116553829815478, + "grad_norm": 0.4760851919295786, + "learning_rate": 1.128248435106206e-06, + "loss": 0.5304, + "step": 15507 + }, + { + "epoch": 4.116819328288862, + "grad_norm": 0.47790399710346, + "learning_rate": 1.127956570941218e-06, + "loss": 0.5468, + "step": 15508 + }, + { + "epoch": 4.117084826762246, + "grad_norm": 0.477314963493014, + "learning_rate": 1.127664733534435e-06, + "loss": 0.5259, + "step": 15509 + }, + { + "epoch": 4.11735032523563, + "grad_norm": 0.4794506393572603, + "learning_rate": 1.127372922891551e-06, + "loss": 0.5374, + "step": 15510 + }, + { + "epoch": 4.117615823709014, + "grad_norm": 0.4655015605684314, + "learning_rate": 1.1270811390182543e-06, + "loss": 0.5493, + "step": 15511 + }, + { + "epoch": 4.117881322182398, + "grad_norm": 0.4795395110755202, + "learning_rate": 1.1267893819202383e-06, + "loss": 0.5553, + "step": 15512 + }, + { + "epoch": 4.118146820655781, + "grad_norm": 0.4745873644949828, + "learning_rate": 1.1264976516031914e-06, + "loss": 0.5142, + "step": 15513 + }, + { + "epoch": 4.118412319129165, + "grad_norm": 0.4829250288103117, + "learning_rate": 1.1262059480728022e-06, + "loss": 0.5453, + "step": 15514 + }, + { + "epoch": 4.118677817602549, + "grad_norm": 0.48749227015319374, + "learning_rate": 1.1259142713347615e-06, + "loss": 0.5745, + "step": 15515 + }, + { + "epoch": 4.118943316075932, + "grad_norm": 0.4949231274059334, + "learning_rate": 1.125622621394756e-06, + "loss": 0.5656, + "step": 15516 + }, + { + "epoch": 4.119208814549316, + "grad_norm": 0.459439133885228, + "learning_rate": 1.1253309982584755e-06, + "loss": 0.5203, + "step": 15517 + }, + { + "epoch": 4.1194743130227005, + "grad_norm": 0.48280030971257576, + "learning_rate": 1.1250394019316063e-06, + "loss": 0.5287, + "step": 15518 + }, + { + "epoch": 4.119739811496084, + "grad_norm": 0.47449002357217707, + "learning_rate": 1.1247478324198351e-06, + "loss": 0.5292, + "step": 15519 + }, + { + "epoch": 4.120005309969468, + "grad_norm": 0.4992125424553765, + "learning_rate": 1.1244562897288474e-06, + "loss": 0.5392, + "step": 15520 + }, + { + "epoch": 4.120270808442852, + "grad_norm": 0.4899011082425855, + "learning_rate": 1.1241647738643302e-06, + "loss": 0.5622, + "step": 15521 + }, + { + "epoch": 4.120536306916235, + "grad_norm": 0.4669537961698866, + "learning_rate": 1.1238732848319695e-06, + "loss": 0.5127, + "step": 15522 + }, + { + "epoch": 4.120801805389619, + "grad_norm": 0.46826171538247113, + "learning_rate": 1.1235818226374489e-06, + "loss": 0.5527, + "step": 15523 + }, + { + "epoch": 4.121067303863002, + "grad_norm": 0.4835485196809336, + "learning_rate": 1.1232903872864529e-06, + "loss": 0.5356, + "step": 15524 + }, + { + "epoch": 4.121332802336386, + "grad_norm": 0.4672756468223092, + "learning_rate": 1.122998978784664e-06, + "loss": 0.4988, + "step": 15525 + }, + { + "epoch": 4.1215983008097705, + "grad_norm": 0.4684412976558153, + "learning_rate": 1.1227075971377676e-06, + "loss": 0.5164, + "step": 15526 + }, + { + "epoch": 4.121863799283154, + "grad_norm": 0.4796260289739082, + "learning_rate": 1.1224162423514442e-06, + "loss": 0.5812, + "step": 15527 + }, + { + "epoch": 4.122129297756538, + "grad_norm": 0.46535898132102727, + "learning_rate": 1.1221249144313778e-06, + "loss": 0.5258, + "step": 15528 + }, + { + "epoch": 4.122394796229922, + "grad_norm": 0.47595586486401287, + "learning_rate": 1.1218336133832494e-06, + "loss": 0.5144, + "step": 15529 + }, + { + "epoch": 4.122660294703305, + "grad_norm": 0.47088727439423417, + "learning_rate": 1.1215423392127388e-06, + "loss": 0.5357, + "step": 15530 + }, + { + "epoch": 4.122925793176689, + "grad_norm": 0.46291951163652123, + "learning_rate": 1.121251091925529e-06, + "loss": 0.5143, + "step": 15531 + }, + { + "epoch": 4.123191291650073, + "grad_norm": 0.484197933142213, + "learning_rate": 1.1209598715272974e-06, + "loss": 0.5142, + "step": 15532 + }, + { + "epoch": 4.1234567901234565, + "grad_norm": 0.48458272870296576, + "learning_rate": 1.1206686780237256e-06, + "loss": 0.531, + "step": 15533 + }, + { + "epoch": 4.1237222885968405, + "grad_norm": 0.4882887384324555, + "learning_rate": 1.120377511420491e-06, + "loss": 0.5236, + "step": 15534 + }, + { + "epoch": 4.123987787070225, + "grad_norm": 0.477147298130285, + "learning_rate": 1.1200863717232745e-06, + "loss": 0.5489, + "step": 15535 + }, + { + "epoch": 4.124253285543608, + "grad_norm": 0.47210302805973253, + "learning_rate": 1.1197952589377506e-06, + "loss": 0.5258, + "step": 15536 + }, + { + "epoch": 4.124518784016992, + "grad_norm": 0.47528758191717485, + "learning_rate": 1.1195041730695997e-06, + "loss": 0.5454, + "step": 15537 + }, + { + "epoch": 4.124784282490376, + "grad_norm": 0.5012021287384323, + "learning_rate": 1.119213114124496e-06, + "loss": 0.53, + "step": 15538 + }, + { + "epoch": 4.125049780963759, + "grad_norm": 0.4653418544208738, + "learning_rate": 1.1189220821081187e-06, + "loss": 0.5306, + "step": 15539 + }, + { + "epoch": 4.125315279437143, + "grad_norm": 0.47880937642808624, + "learning_rate": 1.118631077026141e-06, + "loss": 0.5477, + "step": 15540 + }, + { + "epoch": 4.125580777910527, + "grad_norm": 0.49837154386921517, + "learning_rate": 1.1183400988842405e-06, + "loss": 0.5784, + "step": 15541 + }, + { + "epoch": 4.1258462763839105, + "grad_norm": 0.46201191978439693, + "learning_rate": 1.118049147688091e-06, + "loss": 0.5208, + "step": 15542 + }, + { + "epoch": 4.126111774857295, + "grad_norm": 0.4766994454286879, + "learning_rate": 1.117758223443366e-06, + "loss": 0.5472, + "step": 15543 + }, + { + "epoch": 4.126377273330679, + "grad_norm": 0.4833781782210179, + "learning_rate": 1.1174673261557405e-06, + "loss": 0.5325, + "step": 15544 + }, + { + "epoch": 4.126642771804062, + "grad_norm": 0.49259926823461087, + "learning_rate": 1.1171764558308865e-06, + "loss": 0.5383, + "step": 15545 + }, + { + "epoch": 4.126908270277446, + "grad_norm": 0.45673830618872674, + "learning_rate": 1.1168856124744782e-06, + "loss": 0.5226, + "step": 15546 + }, + { + "epoch": 4.12717376875083, + "grad_norm": 0.46550724569612123, + "learning_rate": 1.1165947960921868e-06, + "loss": 0.5354, + "step": 15547 + }, + { + "epoch": 4.127439267224213, + "grad_norm": 0.480245422161727, + "learning_rate": 1.1163040066896832e-06, + "loss": 0.5631, + "step": 15548 + }, + { + "epoch": 4.127704765697597, + "grad_norm": 0.4834019673595126, + "learning_rate": 1.1160132442726404e-06, + "loss": 0.5221, + "step": 15549 + }, + { + "epoch": 4.1279702641709815, + "grad_norm": 0.4924064340013863, + "learning_rate": 1.1157225088467268e-06, + "loss": 0.5264, + "step": 15550 + }, + { + "epoch": 4.128235762644365, + "grad_norm": 0.4868811244069276, + "learning_rate": 1.1154318004176147e-06, + "loss": 0.5383, + "step": 15551 + }, + { + "epoch": 4.128501261117749, + "grad_norm": 0.4694884383800759, + "learning_rate": 1.1151411189909724e-06, + "loss": 0.5262, + "step": 15552 + }, + { + "epoch": 4.128766759591132, + "grad_norm": 0.48940522990978125, + "learning_rate": 1.114850464572469e-06, + "loss": 0.5779, + "step": 15553 + }, + { + "epoch": 4.129032258064516, + "grad_norm": 0.4781934060803823, + "learning_rate": 1.1145598371677718e-06, + "loss": 0.5369, + "step": 15554 + }, + { + "epoch": 4.1292977565379, + "grad_norm": 0.48811123907939447, + "learning_rate": 1.1142692367825512e-06, + "loss": 0.5169, + "step": 15555 + }, + { + "epoch": 4.129563255011283, + "grad_norm": 0.4582092128417319, + "learning_rate": 1.1139786634224723e-06, + "loss": 0.5323, + "step": 15556 + }, + { + "epoch": 4.129828753484667, + "grad_norm": 0.4805331425137868, + "learning_rate": 1.113688117093204e-06, + "loss": 0.5354, + "step": 15557 + }, + { + "epoch": 4.1300942519580515, + "grad_norm": 0.4653630522683209, + "learning_rate": 1.1133975978004117e-06, + "loss": 0.5451, + "step": 15558 + }, + { + "epoch": 4.130359750431435, + "grad_norm": 0.4814286033916301, + "learning_rate": 1.11310710554976e-06, + "loss": 0.5495, + "step": 15559 + }, + { + "epoch": 4.130625248904819, + "grad_norm": 0.4780070513290123, + "learning_rate": 1.1128166403469165e-06, + "loss": 0.5303, + "step": 15560 + }, + { + "epoch": 4.130890747378203, + "grad_norm": 0.48993472337058136, + "learning_rate": 1.112526202197544e-06, + "loss": 0.582, + "step": 15561 + }, + { + "epoch": 4.131156245851586, + "grad_norm": 0.4633367386425925, + "learning_rate": 1.112235791107309e-06, + "loss": 0.5472, + "step": 15562 + }, + { + "epoch": 4.13142174432497, + "grad_norm": 0.4897531974201303, + "learning_rate": 1.1119454070818725e-06, + "loss": 0.5481, + "step": 15563 + }, + { + "epoch": 4.131687242798354, + "grad_norm": 0.4658986328199154, + "learning_rate": 1.1116550501269013e-06, + "loss": 0.5316, + "step": 15564 + }, + { + "epoch": 4.131952741271737, + "grad_norm": 0.47410322638262953, + "learning_rate": 1.1113647202480538e-06, + "loss": 0.5229, + "step": 15565 + }, + { + "epoch": 4.1322182397451215, + "grad_norm": 0.48771309885865977, + "learning_rate": 1.1110744174509952e-06, + "loss": 0.5463, + "step": 15566 + }, + { + "epoch": 4.132483738218506, + "grad_norm": 0.47224637346761367, + "learning_rate": 1.110784141741385e-06, + "loss": 0.5318, + "step": 15567 + }, + { + "epoch": 4.132749236691889, + "grad_norm": 0.4707038842073985, + "learning_rate": 1.1104938931248857e-06, + "loss": 0.5524, + "step": 15568 + }, + { + "epoch": 4.133014735165273, + "grad_norm": 0.4607022075219156, + "learning_rate": 1.1102036716071596e-06, + "loss": 0.4783, + "step": 15569 + }, + { + "epoch": 4.133280233638657, + "grad_norm": 0.47113615676178994, + "learning_rate": 1.1099134771938624e-06, + "loss": 0.5294, + "step": 15570 + }, + { + "epoch": 4.13354573211204, + "grad_norm": 0.4902044192166875, + "learning_rate": 1.1096233098906571e-06, + "loss": 0.5146, + "step": 15571 + }, + { + "epoch": 4.133811230585424, + "grad_norm": 0.4859085061662544, + "learning_rate": 1.1093331697032007e-06, + "loss": 0.5225, + "step": 15572 + }, + { + "epoch": 4.134076729058808, + "grad_norm": 0.46643061471805203, + "learning_rate": 1.109043056637153e-06, + "loss": 0.5204, + "step": 15573 + }, + { + "epoch": 4.1343422275321915, + "grad_norm": 0.4897898641260012, + "learning_rate": 1.1087529706981707e-06, + "loss": 0.5528, + "step": 15574 + }, + { + "epoch": 4.134607726005576, + "grad_norm": 0.4797901952282531, + "learning_rate": 1.1084629118919127e-06, + "loss": 0.5352, + "step": 15575 + }, + { + "epoch": 4.13487322447896, + "grad_norm": 0.46364227762620613, + "learning_rate": 1.1081728802240348e-06, + "loss": 0.5197, + "step": 15576 + }, + { + "epoch": 4.135138722952343, + "grad_norm": 0.47871785662735733, + "learning_rate": 1.1078828757001928e-06, + "loss": 0.5537, + "step": 15577 + }, + { + "epoch": 4.135404221425727, + "grad_norm": 0.4731982000786994, + "learning_rate": 1.1075928983260442e-06, + "loss": 0.5459, + "step": 15578 + }, + { + "epoch": 4.135669719899111, + "grad_norm": 0.49052294271584956, + "learning_rate": 1.1073029481072423e-06, + "loss": 0.5589, + "step": 15579 + }, + { + "epoch": 4.135935218372494, + "grad_norm": 0.47539561775462214, + "learning_rate": 1.1070130250494435e-06, + "loss": 0.5182, + "step": 15580 + }, + { + "epoch": 4.136200716845878, + "grad_norm": 0.48104889508605886, + "learning_rate": 1.1067231291583016e-06, + "loss": 0.5189, + "step": 15581 + }, + { + "epoch": 4.1364662153192615, + "grad_norm": 0.47762855462445847, + "learning_rate": 1.1064332604394698e-06, + "loss": 0.5432, + "step": 15582 + }, + { + "epoch": 4.136731713792646, + "grad_norm": 0.4813338715069982, + "learning_rate": 1.106143418898601e-06, + "loss": 0.5443, + "step": 15583 + }, + { + "epoch": 4.13699721226603, + "grad_norm": 0.48179542270873427, + "learning_rate": 1.1058536045413486e-06, + "loss": 0.5599, + "step": 15584 + }, + { + "epoch": 4.137262710739413, + "grad_norm": 0.4682756838872559, + "learning_rate": 1.1055638173733638e-06, + "loss": 0.4961, + "step": 15585 + }, + { + "epoch": 4.137528209212797, + "grad_norm": 0.4685254421693967, + "learning_rate": 1.1052740574002996e-06, + "loss": 0.5201, + "step": 15586 + }, + { + "epoch": 4.137793707686181, + "grad_norm": 0.4668272981890748, + "learning_rate": 1.104984324627806e-06, + "loss": 0.5485, + "step": 15587 + }, + { + "epoch": 4.138059206159564, + "grad_norm": 0.4843672061429562, + "learning_rate": 1.104694619061533e-06, + "loss": 0.5365, + "step": 15588 + }, + { + "epoch": 4.138324704632948, + "grad_norm": 0.4754774843218086, + "learning_rate": 1.104404940707132e-06, + "loss": 0.5689, + "step": 15589 + }, + { + "epoch": 4.138590203106332, + "grad_norm": 0.4648329191717306, + "learning_rate": 1.1041152895702506e-06, + "loss": 0.5304, + "step": 15590 + }, + { + "epoch": 4.138855701579716, + "grad_norm": 0.4762410385576523, + "learning_rate": 1.1038256656565398e-06, + "loss": 0.5321, + "step": 15591 + }, + { + "epoch": 4.1391212000531, + "grad_norm": 0.47261792211761755, + "learning_rate": 1.103536068971647e-06, + "loss": 0.5535, + "step": 15592 + }, + { + "epoch": 4.139386698526484, + "grad_norm": 0.4659213927497224, + "learning_rate": 1.103246499521219e-06, + "loss": 0.5441, + "step": 15593 + }, + { + "epoch": 4.139652196999867, + "grad_norm": 0.475025068989063, + "learning_rate": 1.1029569573109052e-06, + "loss": 0.5567, + "step": 15594 + }, + { + "epoch": 4.139917695473251, + "grad_norm": 0.46770203524811976, + "learning_rate": 1.1026674423463502e-06, + "loss": 0.5518, + "step": 15595 + }, + { + "epoch": 4.140183193946635, + "grad_norm": 0.48339099623767884, + "learning_rate": 1.1023779546332023e-06, + "loss": 0.5722, + "step": 15596 + }, + { + "epoch": 4.140448692420018, + "grad_norm": 0.46940204914652384, + "learning_rate": 1.1020884941771055e-06, + "loss": 0.5508, + "step": 15597 + }, + { + "epoch": 4.140714190893402, + "grad_norm": 0.4700794376705288, + "learning_rate": 1.1017990609837074e-06, + "loss": 0.4877, + "step": 15598 + }, + { + "epoch": 4.1409796893667865, + "grad_norm": 0.4771422175118768, + "learning_rate": 1.1015096550586491e-06, + "loss": 0.5329, + "step": 15599 + }, + { + "epoch": 4.14124518784017, + "grad_norm": 0.48015149765604603, + "learning_rate": 1.1012202764075774e-06, + "loss": 0.5223, + "step": 15600 + }, + { + "epoch": 4.141510686313554, + "grad_norm": 0.45896049722337495, + "learning_rate": 1.1009309250361344e-06, + "loss": 0.5307, + "step": 15601 + }, + { + "epoch": 4.141776184786938, + "grad_norm": 0.5018457356281955, + "learning_rate": 1.1006416009499648e-06, + "loss": 0.5299, + "step": 15602 + }, + { + "epoch": 4.142041683260321, + "grad_norm": 0.47232860626027773, + "learning_rate": 1.1003523041547094e-06, + "loss": 0.5501, + "step": 15603 + }, + { + "epoch": 4.142307181733705, + "grad_norm": 0.47563475155287643, + "learning_rate": 1.1000630346560118e-06, + "loss": 0.5312, + "step": 15604 + }, + { + "epoch": 4.142572680207088, + "grad_norm": 0.47294516911440865, + "learning_rate": 1.0997737924595125e-06, + "loss": 0.552, + "step": 15605 + }, + { + "epoch": 4.142838178680472, + "grad_norm": 0.4670639174234273, + "learning_rate": 1.0994845775708518e-06, + "loss": 0.5366, + "step": 15606 + }, + { + "epoch": 4.1431036771538565, + "grad_norm": 0.4896518973428236, + "learning_rate": 1.099195389995672e-06, + "loss": 0.5614, + "step": 15607 + }, + { + "epoch": 4.14336917562724, + "grad_norm": 0.4704579177338021, + "learning_rate": 1.0989062297396108e-06, + "loss": 0.5654, + "step": 15608 + }, + { + "epoch": 4.143634674100624, + "grad_norm": 0.47688782471548824, + "learning_rate": 1.0986170968083093e-06, + "loss": 0.5553, + "step": 15609 + }, + { + "epoch": 4.143900172574008, + "grad_norm": 0.4867397031128611, + "learning_rate": 1.0983279912074061e-06, + "loss": 0.5716, + "step": 15610 + }, + { + "epoch": 4.144165671047391, + "grad_norm": 0.4708918646391312, + "learning_rate": 1.0980389129425385e-06, + "loss": 0.5278, + "step": 15611 + }, + { + "epoch": 4.144431169520775, + "grad_norm": 0.5014593503860955, + "learning_rate": 1.0977498620193438e-06, + "loss": 0.5496, + "step": 15612 + }, + { + "epoch": 4.144696667994159, + "grad_norm": 0.4793916859520966, + "learning_rate": 1.0974608384434602e-06, + "loss": 0.5044, + "step": 15613 + }, + { + "epoch": 4.1449621664675425, + "grad_norm": 0.4751833558243376, + "learning_rate": 1.0971718422205252e-06, + "loss": 0.5303, + "step": 15614 + }, + { + "epoch": 4.1452276649409265, + "grad_norm": 0.4917791479577007, + "learning_rate": 1.096882873356174e-06, + "loss": 0.5471, + "step": 15615 + }, + { + "epoch": 4.145493163414311, + "grad_norm": 0.4801873104571178, + "learning_rate": 1.0965939318560421e-06, + "loss": 0.5841, + "step": 15616 + }, + { + "epoch": 4.145758661887694, + "grad_norm": 0.4535522819737755, + "learning_rate": 1.0963050177257637e-06, + "loss": 0.5517, + "step": 15617 + }, + { + "epoch": 4.146024160361078, + "grad_norm": 0.48636694625259147, + "learning_rate": 1.0960161309709753e-06, + "loss": 0.5527, + "step": 15618 + }, + { + "epoch": 4.146289658834462, + "grad_norm": 0.49205196097159415, + "learning_rate": 1.095727271597309e-06, + "loss": 0.5587, + "step": 15619 + }, + { + "epoch": 4.146555157307845, + "grad_norm": 0.46930758742504225, + "learning_rate": 1.0954384396104e-06, + "loss": 0.5339, + "step": 15620 + }, + { + "epoch": 4.146820655781229, + "grad_norm": 0.47073698252177665, + "learning_rate": 1.0951496350158804e-06, + "loss": 0.5539, + "step": 15621 + }, + { + "epoch": 4.147086154254613, + "grad_norm": 0.47749835500701826, + "learning_rate": 1.0948608578193816e-06, + "loss": 0.5145, + "step": 15622 + }, + { + "epoch": 4.1473516527279966, + "grad_norm": 0.4863400294798167, + "learning_rate": 1.0945721080265377e-06, + "loss": 0.584, + "step": 15623 + }, + { + "epoch": 4.147617151201381, + "grad_norm": 0.4505560926588953, + "learning_rate": 1.0942833856429777e-06, + "loss": 0.5021, + "step": 15624 + }, + { + "epoch": 4.147882649674765, + "grad_norm": 0.4783220294329117, + "learning_rate": 1.0939946906743342e-06, + "loss": 0.5612, + "step": 15625 + }, + { + "epoch": 4.148148148148148, + "grad_norm": 0.4760714907730766, + "learning_rate": 1.0937060231262358e-06, + "loss": 0.5268, + "step": 15626 + }, + { + "epoch": 4.148413646621532, + "grad_norm": 0.47409249678504867, + "learning_rate": 1.0934173830043153e-06, + "loss": 0.5107, + "step": 15627 + }, + { + "epoch": 4.148679145094916, + "grad_norm": 0.46554352700927415, + "learning_rate": 1.0931287703141977e-06, + "loss": 0.5291, + "step": 15628 + }, + { + "epoch": 4.148944643568299, + "grad_norm": 0.48432925912608943, + "learning_rate": 1.0928401850615144e-06, + "loss": 0.5564, + "step": 15629 + }, + { + "epoch": 4.149210142041683, + "grad_norm": 0.49315448153297736, + "learning_rate": 1.092551627251892e-06, + "loss": 0.5746, + "step": 15630 + }, + { + "epoch": 4.1494756405150675, + "grad_norm": 0.496241098270634, + "learning_rate": 1.0922630968909602e-06, + "loss": 0.5292, + "step": 15631 + }, + { + "epoch": 4.149741138988451, + "grad_norm": 0.4756442440758925, + "learning_rate": 1.0919745939843435e-06, + "loss": 0.4961, + "step": 15632 + }, + { + "epoch": 4.150006637461835, + "grad_norm": 0.48058388657686724, + "learning_rate": 1.0916861185376706e-06, + "loss": 0.5408, + "step": 15633 + }, + { + "epoch": 4.150272135935218, + "grad_norm": 0.47799591834016036, + "learning_rate": 1.0913976705565666e-06, + "loss": 0.5549, + "step": 15634 + }, + { + "epoch": 4.150537634408602, + "grad_norm": 0.4895221315472336, + "learning_rate": 1.0911092500466558e-06, + "loss": 0.5553, + "step": 15635 + }, + { + "epoch": 4.150803132881986, + "grad_norm": 0.4709301438419086, + "learning_rate": 1.0908208570135654e-06, + "loss": 0.5532, + "step": 15636 + }, + { + "epoch": 4.151068631355369, + "grad_norm": 0.48130540949252976, + "learning_rate": 1.0905324914629173e-06, + "loss": 0.5636, + "step": 15637 + }, + { + "epoch": 4.151334129828753, + "grad_norm": 0.47873598813665563, + "learning_rate": 1.0902441534003378e-06, + "loss": 0.5285, + "step": 15638 + }, + { + "epoch": 4.1515996283021375, + "grad_norm": 0.4834338954471441, + "learning_rate": 1.0899558428314489e-06, + "loss": 0.5205, + "step": 15639 + }, + { + "epoch": 4.151865126775521, + "grad_norm": 0.46395662207142596, + "learning_rate": 1.0896675597618725e-06, + "loss": 0.5026, + "step": 15640 + }, + { + "epoch": 4.152130625248905, + "grad_norm": 0.5040384996878225, + "learning_rate": 1.0893793041972325e-06, + "loss": 0.5773, + "step": 15641 + }, + { + "epoch": 4.152396123722289, + "grad_norm": 0.4701412011660318, + "learning_rate": 1.089091076143149e-06, + "loss": 0.5309, + "step": 15642 + }, + { + "epoch": 4.152661622195672, + "grad_norm": 0.49043270850002174, + "learning_rate": 1.088802875605245e-06, + "loss": 0.5106, + "step": 15643 + }, + { + "epoch": 4.152927120669056, + "grad_norm": 0.4913715841290042, + "learning_rate": 1.0885147025891404e-06, + "loss": 0.5447, + "step": 15644 + }, + { + "epoch": 4.15319261914244, + "grad_norm": 0.4734505134804812, + "learning_rate": 1.088226557100455e-06, + "loss": 0.5556, + "step": 15645 + }, + { + "epoch": 4.153458117615823, + "grad_norm": 0.4798403915641649, + "learning_rate": 1.087938439144807e-06, + "loss": 0.5185, + "step": 15646 + }, + { + "epoch": 4.1537236160892075, + "grad_norm": 0.48184063571663016, + "learning_rate": 1.0876503487278178e-06, + "loss": 0.5673, + "step": 15647 + }, + { + "epoch": 4.153989114562592, + "grad_norm": 0.47539130489602216, + "learning_rate": 1.0873622858551042e-06, + "loss": 0.5385, + "step": 15648 + }, + { + "epoch": 4.154254613035975, + "grad_norm": 0.4882432993942981, + "learning_rate": 1.0870742505322854e-06, + "loss": 0.5741, + "step": 15649 + }, + { + "epoch": 4.154520111509359, + "grad_norm": 0.4733264755012237, + "learning_rate": 1.0867862427649783e-06, + "loss": 0.5457, + "step": 15650 + }, + { + "epoch": 4.154785609982743, + "grad_norm": 0.49683287541177046, + "learning_rate": 1.0864982625587986e-06, + "loss": 0.5629, + "step": 15651 + }, + { + "epoch": 4.155051108456126, + "grad_norm": 0.47911576978441145, + "learning_rate": 1.0862103099193646e-06, + "loss": 0.5491, + "step": 15652 + }, + { + "epoch": 4.15531660692951, + "grad_norm": 0.4833308084409787, + "learning_rate": 1.0859223848522902e-06, + "loss": 0.5118, + "step": 15653 + }, + { + "epoch": 4.155582105402894, + "grad_norm": 0.4784609363065905, + "learning_rate": 1.0856344873631924e-06, + "loss": 0.5098, + "step": 15654 + }, + { + "epoch": 4.1558476038762775, + "grad_norm": 0.4668208354231387, + "learning_rate": 1.0853466174576843e-06, + "loss": 0.522, + "step": 15655 + }, + { + "epoch": 4.156113102349662, + "grad_norm": 0.46346170465603853, + "learning_rate": 1.0850587751413824e-06, + "loss": 0.5001, + "step": 15656 + }, + { + "epoch": 4.156378600823046, + "grad_norm": 0.47095824474909026, + "learning_rate": 1.0847709604198972e-06, + "loss": 0.5668, + "step": 15657 + }, + { + "epoch": 4.156644099296429, + "grad_norm": 0.4763162224508801, + "learning_rate": 1.0844831732988434e-06, + "loss": 0.5577, + "step": 15658 + }, + { + "epoch": 4.156909597769813, + "grad_norm": 0.4759551912536526, + "learning_rate": 1.0841954137838342e-06, + "loss": 0.5198, + "step": 15659 + }, + { + "epoch": 4.157175096243197, + "grad_norm": 0.47322081924139175, + "learning_rate": 1.08390768188048e-06, + "loss": 0.543, + "step": 15660 + }, + { + "epoch": 4.15744059471658, + "grad_norm": 0.4709030150311447, + "learning_rate": 1.0836199775943942e-06, + "loss": 0.5118, + "step": 15661 + }, + { + "epoch": 4.157706093189964, + "grad_norm": 0.47927412791656393, + "learning_rate": 1.0833323009311867e-06, + "loss": 0.5384, + "step": 15662 + }, + { + "epoch": 4.1579715916633475, + "grad_norm": 0.47047171960373263, + "learning_rate": 1.083044651896468e-06, + "loss": 0.5348, + "step": 15663 + }, + { + "epoch": 4.158237090136732, + "grad_norm": 0.47454374726311693, + "learning_rate": 1.0827570304958467e-06, + "loss": 0.5494, + "step": 15664 + }, + { + "epoch": 4.158502588610116, + "grad_norm": 0.47845325854692045, + "learning_rate": 1.0824694367349346e-06, + "loss": 0.5094, + "step": 15665 + }, + { + "epoch": 4.158768087083499, + "grad_norm": 0.4692408311694858, + "learning_rate": 1.0821818706193379e-06, + "loss": 0.5521, + "step": 15666 + }, + { + "epoch": 4.159033585556883, + "grad_norm": 0.4910113783130177, + "learning_rate": 1.0818943321546673e-06, + "loss": 0.5675, + "step": 15667 + }, + { + "epoch": 4.159299084030267, + "grad_norm": 0.48358433429505404, + "learning_rate": 1.0816068213465295e-06, + "loss": 0.5642, + "step": 15668 + }, + { + "epoch": 4.15956458250365, + "grad_norm": 0.46768123465324485, + "learning_rate": 1.0813193382005306e-06, + "loss": 0.5214, + "step": 15669 + }, + { + "epoch": 4.159830080977034, + "grad_norm": 0.47353513836109445, + "learning_rate": 1.0810318827222785e-06, + "loss": 0.5631, + "step": 15670 + }, + { + "epoch": 4.160095579450418, + "grad_norm": 0.4892440492161654, + "learning_rate": 1.0807444549173784e-06, + "loss": 0.531, + "step": 15671 + }, + { + "epoch": 4.160361077923802, + "grad_norm": 0.4776180910061011, + "learning_rate": 1.0804570547914372e-06, + "loss": 0.5186, + "step": 15672 + }, + { + "epoch": 4.160626576397186, + "grad_norm": 0.4827916115411352, + "learning_rate": 1.0801696823500592e-06, + "loss": 0.5726, + "step": 15673 + }, + { + "epoch": 4.16089207487057, + "grad_norm": 0.48963619137701425, + "learning_rate": 1.0798823375988487e-06, + "loss": 0.538, + "step": 15674 + }, + { + "epoch": 4.161157573343953, + "grad_norm": 0.47304734721066966, + "learning_rate": 1.0795950205434088e-06, + "loss": 0.4914, + "step": 15675 + }, + { + "epoch": 4.161423071817337, + "grad_norm": 0.4752259889217294, + "learning_rate": 1.0793077311893447e-06, + "loss": 0.5196, + "step": 15676 + }, + { + "epoch": 4.161688570290721, + "grad_norm": 0.4869680743126387, + "learning_rate": 1.0790204695422571e-06, + "loss": 0.5811, + "step": 15677 + }, + { + "epoch": 4.161954068764104, + "grad_norm": 0.4625635226420737, + "learning_rate": 1.0787332356077496e-06, + "loss": 0.5709, + "step": 15678 + }, + { + "epoch": 4.162219567237488, + "grad_norm": 0.4762615401200068, + "learning_rate": 1.078446029391426e-06, + "loss": 0.5143, + "step": 15679 + }, + { + "epoch": 4.1624850657108725, + "grad_norm": 0.4788627478488273, + "learning_rate": 1.078158850898883e-06, + "loss": 0.5638, + "step": 15680 + }, + { + "epoch": 4.162750564184256, + "grad_norm": 0.47660238093927376, + "learning_rate": 1.077871700135725e-06, + "loss": 0.5461, + "step": 15681 + }, + { + "epoch": 4.16301606265764, + "grad_norm": 0.4855898410808992, + "learning_rate": 1.07758457710755e-06, + "loss": 0.5553, + "step": 15682 + }, + { + "epoch": 4.163281561131024, + "grad_norm": 0.46956226094612036, + "learning_rate": 1.077297481819959e-06, + "loss": 0.5247, + "step": 15683 + }, + { + "epoch": 4.163547059604407, + "grad_norm": 0.45767437513522463, + "learning_rate": 1.0770104142785499e-06, + "loss": 0.5088, + "step": 15684 + }, + { + "epoch": 4.163812558077791, + "grad_norm": 0.48756026668722924, + "learning_rate": 1.0767233744889221e-06, + "loss": 0.6005, + "step": 15685 + }, + { + "epoch": 4.164078056551175, + "grad_norm": 0.48423336578759374, + "learning_rate": 1.076436362456674e-06, + "loss": 0.5685, + "step": 15686 + }, + { + "epoch": 4.164343555024558, + "grad_norm": 0.48627331479466046, + "learning_rate": 1.076149378187401e-06, + "loss": 0.5421, + "step": 15687 + }, + { + "epoch": 4.1646090534979425, + "grad_norm": 0.48211017665897593, + "learning_rate": 1.0758624216867022e-06, + "loss": 0.5436, + "step": 15688 + }, + { + "epoch": 4.164874551971327, + "grad_norm": 0.480244637142, + "learning_rate": 1.0755754929601726e-06, + "loss": 0.5339, + "step": 15689 + }, + { + "epoch": 4.16514005044471, + "grad_norm": 0.4781502935866322, + "learning_rate": 1.0752885920134098e-06, + "loss": 0.5803, + "step": 15690 + }, + { + "epoch": 4.165405548918094, + "grad_norm": 0.4743428694996764, + "learning_rate": 1.0750017188520063e-06, + "loss": 0.5548, + "step": 15691 + }, + { + "epoch": 4.165671047391477, + "grad_norm": 0.47841753093213657, + "learning_rate": 1.0747148734815587e-06, + "loss": 0.54, + "step": 15692 + }, + { + "epoch": 4.165936545864861, + "grad_norm": 0.46059801769971387, + "learning_rate": 1.0744280559076602e-06, + "loss": 0.5043, + "step": 15693 + }, + { + "epoch": 4.166202044338245, + "grad_norm": 0.4619070769801546, + "learning_rate": 1.074141266135906e-06, + "loss": 0.5172, + "step": 15694 + }, + { + "epoch": 4.1664675428116285, + "grad_norm": 0.48613543849746815, + "learning_rate": 1.0738545041718873e-06, + "loss": 0.5706, + "step": 15695 + }, + { + "epoch": 4.1667330412850125, + "grad_norm": 0.4814975462576854, + "learning_rate": 1.0735677700211982e-06, + "loss": 0.5604, + "step": 15696 + }, + { + "epoch": 4.166998539758397, + "grad_norm": 0.45904673752790504, + "learning_rate": 1.0732810636894304e-06, + "loss": 0.5168, + "step": 15697 + }, + { + "epoch": 4.16726403823178, + "grad_norm": 0.4657578420663952, + "learning_rate": 1.072994385182174e-06, + "loss": 0.5272, + "step": 15698 + }, + { + "epoch": 4.167529536705164, + "grad_norm": 0.4706186582753637, + "learning_rate": 1.0727077345050218e-06, + "loss": 0.5439, + "step": 15699 + }, + { + "epoch": 4.167795035178548, + "grad_norm": 0.47812731891053234, + "learning_rate": 1.0724211116635628e-06, + "loss": 0.5661, + "step": 15700 + }, + { + "epoch": 4.168060533651931, + "grad_norm": 0.4773364919210583, + "learning_rate": 1.072134516663388e-06, + "loss": 0.558, + "step": 15701 + }, + { + "epoch": 4.168326032125315, + "grad_norm": 0.48999880360261217, + "learning_rate": 1.0718479495100864e-06, + "loss": 0.5459, + "step": 15702 + }, + { + "epoch": 4.168591530598699, + "grad_norm": 0.48169021501988535, + "learning_rate": 1.0715614102092455e-06, + "loss": 0.5333, + "step": 15703 + }, + { + "epoch": 4.1688570290720826, + "grad_norm": 0.4782256139721503, + "learning_rate": 1.0712748987664554e-06, + "loss": 0.5581, + "step": 15704 + }, + { + "epoch": 4.169122527545467, + "grad_norm": 0.4835018960535134, + "learning_rate": 1.070988415187302e-06, + "loss": 0.5354, + "step": 15705 + }, + { + "epoch": 4.169388026018851, + "grad_norm": 0.48371136889518485, + "learning_rate": 1.0707019594773744e-06, + "loss": 0.5516, + "step": 15706 + }, + { + "epoch": 4.169653524492234, + "grad_norm": 0.4835419597659858, + "learning_rate": 1.070415531642257e-06, + "loss": 0.5417, + "step": 15707 + }, + { + "epoch": 4.169919022965618, + "grad_norm": 0.48807210873478907, + "learning_rate": 1.070129131687539e-06, + "loss": 0.5417, + "step": 15708 + }, + { + "epoch": 4.170184521439002, + "grad_norm": 0.4581948774842411, + "learning_rate": 1.069842759618802e-06, + "loss": 0.5309, + "step": 15709 + }, + { + "epoch": 4.170450019912385, + "grad_norm": 0.47472127779898404, + "learning_rate": 1.0695564154416333e-06, + "loss": 0.5038, + "step": 15710 + }, + { + "epoch": 4.170715518385769, + "grad_norm": 0.47985532794409486, + "learning_rate": 1.0692700991616161e-06, + "loss": 0.5542, + "step": 15711 + }, + { + "epoch": 4.1709810168591535, + "grad_norm": 0.47393132832617674, + "learning_rate": 1.0689838107843362e-06, + "loss": 0.5151, + "step": 15712 + }, + { + "epoch": 4.171246515332537, + "grad_norm": 0.46779888435621175, + "learning_rate": 1.0686975503153756e-06, + "loss": 0.4988, + "step": 15713 + }, + { + "epoch": 4.171512013805921, + "grad_norm": 0.4575252250191547, + "learning_rate": 1.0684113177603161e-06, + "loss": 0.5465, + "step": 15714 + }, + { + "epoch": 4.171777512279305, + "grad_norm": 0.4705538585063664, + "learning_rate": 1.0681251131247423e-06, + "loss": 0.5779, + "step": 15715 + }, + { + "epoch": 4.172043010752688, + "grad_norm": 0.481342298589126, + "learning_rate": 1.0678389364142336e-06, + "loss": 0.5302, + "step": 15716 + }, + { + "epoch": 4.172308509226072, + "grad_norm": 0.48902391370159254, + "learning_rate": 1.0675527876343727e-06, + "loss": 0.5563, + "step": 15717 + }, + { + "epoch": 4.172574007699455, + "grad_norm": 0.47558219684957925, + "learning_rate": 1.067266666790739e-06, + "loss": 0.4971, + "step": 15718 + }, + { + "epoch": 4.172839506172839, + "grad_norm": 0.4784512161688012, + "learning_rate": 1.0669805738889152e-06, + "loss": 0.5537, + "step": 15719 + }, + { + "epoch": 4.1731050046462235, + "grad_norm": 0.489921125433436, + "learning_rate": 1.0666945089344771e-06, + "loss": 0.5427, + "step": 15720 + }, + { + "epoch": 4.173370503119607, + "grad_norm": 0.47990864313583836, + "learning_rate": 1.0664084719330061e-06, + "loss": 0.5633, + "step": 15721 + }, + { + "epoch": 4.173636001592991, + "grad_norm": 0.47401647350908505, + "learning_rate": 1.066122462890079e-06, + "loss": 0.5538, + "step": 15722 + }, + { + "epoch": 4.173901500066375, + "grad_norm": 0.47581908227713804, + "learning_rate": 1.0658364818112758e-06, + "loss": 0.5312, + "step": 15723 + }, + { + "epoch": 4.174166998539758, + "grad_norm": 0.4855315803699059, + "learning_rate": 1.0655505287021714e-06, + "loss": 0.5612, + "step": 15724 + }, + { + "epoch": 4.174432497013142, + "grad_norm": 0.4826275883005352, + "learning_rate": 1.0652646035683452e-06, + "loss": 0.5263, + "step": 15725 + }, + { + "epoch": 4.174697995486526, + "grad_norm": 0.4734590465662751, + "learning_rate": 1.0649787064153716e-06, + "loss": 0.5258, + "step": 15726 + }, + { + "epoch": 4.174963493959909, + "grad_norm": 0.4760709919088463, + "learning_rate": 1.064692837248826e-06, + "loss": 0.5475, + "step": 15727 + }, + { + "epoch": 4.1752289924332935, + "grad_norm": 0.48253965061722287, + "learning_rate": 1.0644069960742853e-06, + "loss": 0.5265, + "step": 15728 + }, + { + "epoch": 4.175494490906678, + "grad_norm": 0.4810148237581286, + "learning_rate": 1.064121182897322e-06, + "loss": 0.5261, + "step": 15729 + }, + { + "epoch": 4.175759989380061, + "grad_norm": 0.47672167900209766, + "learning_rate": 1.0638353977235125e-06, + "loss": 0.5465, + "step": 15730 + }, + { + "epoch": 4.176025487853445, + "grad_norm": 0.4833559058161596, + "learning_rate": 1.0635496405584286e-06, + "loss": 0.5644, + "step": 15731 + }, + { + "epoch": 4.176290986326829, + "grad_norm": 0.47290001327260356, + "learning_rate": 1.0632639114076432e-06, + "loss": 0.5409, + "step": 15732 + }, + { + "epoch": 4.176556484800212, + "grad_norm": 0.4757374099398108, + "learning_rate": 1.0629782102767303e-06, + "loss": 0.572, + "step": 15733 + }, + { + "epoch": 4.176821983273596, + "grad_norm": 0.4865106661101705, + "learning_rate": 1.0626925371712596e-06, + "loss": 0.5657, + "step": 15734 + }, + { + "epoch": 4.17708748174698, + "grad_norm": 0.47053084093472325, + "learning_rate": 1.0624068920968044e-06, + "loss": 0.5395, + "step": 15735 + }, + { + "epoch": 4.1773529802203635, + "grad_norm": 0.46430694205337447, + "learning_rate": 1.0621212750589346e-06, + "loss": 0.52, + "step": 15736 + }, + { + "epoch": 4.177618478693748, + "grad_norm": 0.4747787480069449, + "learning_rate": 1.0618356860632209e-06, + "loss": 0.5115, + "step": 15737 + }, + { + "epoch": 4.177883977167132, + "grad_norm": 0.471792817491463, + "learning_rate": 1.0615501251152314e-06, + "loss": 0.5148, + "step": 15738 + }, + { + "epoch": 4.178149475640515, + "grad_norm": 0.4767625350760104, + "learning_rate": 1.0612645922205373e-06, + "loss": 0.5184, + "step": 15739 + }, + { + "epoch": 4.178414974113899, + "grad_norm": 0.4779025911445821, + "learning_rate": 1.0609790873847051e-06, + "loss": 0.5479, + "step": 15740 + }, + { + "epoch": 4.178680472587283, + "grad_norm": 0.47172344489076556, + "learning_rate": 1.0606936106133054e-06, + "loss": 0.539, + "step": 15741 + }, + { + "epoch": 4.178945971060666, + "grad_norm": 0.46882636633974095, + "learning_rate": 1.060408161911904e-06, + "loss": 0.5214, + "step": 15742 + }, + { + "epoch": 4.17921146953405, + "grad_norm": 0.47229362713666023, + "learning_rate": 1.0601227412860674e-06, + "loss": 0.5509, + "step": 15743 + }, + { + "epoch": 4.1794769680074335, + "grad_norm": 0.474879630717398, + "learning_rate": 1.0598373487413634e-06, + "loss": 0.5154, + "step": 15744 + }, + { + "epoch": 4.179742466480818, + "grad_norm": 0.45010427264510255, + "learning_rate": 1.0595519842833565e-06, + "loss": 0.4937, + "step": 15745 + }, + { + "epoch": 4.180007964954202, + "grad_norm": 0.48301703582448147, + "learning_rate": 1.0592666479176139e-06, + "loss": 0.521, + "step": 15746 + }, + { + "epoch": 4.180273463427585, + "grad_norm": 0.4785468044719761, + "learning_rate": 1.058981339649698e-06, + "loss": 0.5472, + "step": 15747 + }, + { + "epoch": 4.180538961900969, + "grad_norm": 0.47447337999250233, + "learning_rate": 1.0586960594851762e-06, + "loss": 0.5204, + "step": 15748 + }, + { + "epoch": 4.180804460374353, + "grad_norm": 0.4778942286194906, + "learning_rate": 1.0584108074296082e-06, + "loss": 0.5366, + "step": 15749 + }, + { + "epoch": 4.181069958847736, + "grad_norm": 0.4945115402202853, + "learning_rate": 1.0581255834885589e-06, + "loss": 0.5668, + "step": 15750 + }, + { + "epoch": 4.18133545732112, + "grad_norm": 0.4694389901768765, + "learning_rate": 1.0578403876675921e-06, + "loss": 0.5296, + "step": 15751 + }, + { + "epoch": 4.181600955794504, + "grad_norm": 0.46976216932579046, + "learning_rate": 1.057555219972268e-06, + "loss": 0.5557, + "step": 15752 + }, + { + "epoch": 4.181866454267888, + "grad_norm": 0.4794819469668981, + "learning_rate": 1.0572700804081497e-06, + "loss": 0.5259, + "step": 15753 + }, + { + "epoch": 4.182131952741272, + "grad_norm": 0.4603142029328358, + "learning_rate": 1.0569849689807973e-06, + "loss": 0.5087, + "step": 15754 + }, + { + "epoch": 4.182397451214656, + "grad_norm": 0.4912490196602543, + "learning_rate": 1.0566998856957707e-06, + "loss": 0.5443, + "step": 15755 + }, + { + "epoch": 4.182662949688039, + "grad_norm": 0.4925570321937711, + "learning_rate": 1.0564148305586296e-06, + "loss": 0.5252, + "step": 15756 + }, + { + "epoch": 4.182928448161423, + "grad_norm": 0.4795003420522764, + "learning_rate": 1.0561298035749348e-06, + "loss": 0.518, + "step": 15757 + }, + { + "epoch": 4.183193946634807, + "grad_norm": 0.4668182639786524, + "learning_rate": 1.055844804750243e-06, + "loss": 0.5411, + "step": 15758 + }, + { + "epoch": 4.18345944510819, + "grad_norm": 0.47684510322568757, + "learning_rate": 1.055559834090114e-06, + "loss": 0.5435, + "step": 15759 + }, + { + "epoch": 4.183724943581574, + "grad_norm": 0.4684372087409417, + "learning_rate": 1.0552748916001054e-06, + "loss": 0.5509, + "step": 15760 + }, + { + "epoch": 4.1839904420549585, + "grad_norm": 0.46994319082365404, + "learning_rate": 1.0549899772857724e-06, + "loss": 0.5363, + "step": 15761 + }, + { + "epoch": 4.184255940528342, + "grad_norm": 0.46786250486815784, + "learning_rate": 1.054705091152674e-06, + "loss": 0.5478, + "step": 15762 + }, + { + "epoch": 4.184521439001726, + "grad_norm": 0.4779272570984364, + "learning_rate": 1.0544202332063637e-06, + "loss": 0.5051, + "step": 15763 + }, + { + "epoch": 4.18478693747511, + "grad_norm": 0.4652851216827191, + "learning_rate": 1.0541354034523997e-06, + "loss": 0.5172, + "step": 15764 + }, + { + "epoch": 4.185052435948493, + "grad_norm": 0.4785416639510209, + "learning_rate": 1.0538506018963352e-06, + "loss": 0.5518, + "step": 15765 + }, + { + "epoch": 4.185317934421877, + "grad_norm": 0.4694615231733207, + "learning_rate": 1.0535658285437245e-06, + "loss": 0.5258, + "step": 15766 + }, + { + "epoch": 4.185583432895261, + "grad_norm": 0.47578635822774545, + "learning_rate": 1.0532810834001208e-06, + "loss": 0.5335, + "step": 15767 + }, + { + "epoch": 4.185848931368644, + "grad_norm": 0.4698792494983958, + "learning_rate": 1.0529963664710794e-06, + "loss": 0.5464, + "step": 15768 + }, + { + "epoch": 4.1861144298420285, + "grad_norm": 0.4748498706339072, + "learning_rate": 1.0527116777621505e-06, + "loss": 0.5495, + "step": 15769 + }, + { + "epoch": 4.186379928315413, + "grad_norm": 0.46611474543374687, + "learning_rate": 1.0524270172788874e-06, + "loss": 0.541, + "step": 15770 + }, + { + "epoch": 4.186645426788796, + "grad_norm": 0.4886192620314173, + "learning_rate": 1.052142385026844e-06, + "loss": 0.5931, + "step": 15771 + }, + { + "epoch": 4.18691092526218, + "grad_norm": 0.47441555552000675, + "learning_rate": 1.0518577810115668e-06, + "loss": 0.5759, + "step": 15772 + }, + { + "epoch": 4.187176423735563, + "grad_norm": 0.46505979551740484, + "learning_rate": 1.0515732052386095e-06, + "loss": 0.5104, + "step": 15773 + }, + { + "epoch": 4.187441922208947, + "grad_norm": 0.4556154464288007, + "learning_rate": 1.0512886577135203e-06, + "loss": 0.5502, + "step": 15774 + }, + { + "epoch": 4.187707420682331, + "grad_norm": 0.4809501149952131, + "learning_rate": 1.05100413844185e-06, + "loss": 0.56, + "step": 15775 + }, + { + "epoch": 4.1879729191557145, + "grad_norm": 0.4781242533682232, + "learning_rate": 1.0507196474291462e-06, + "loss": 0.5522, + "step": 15776 + }, + { + "epoch": 4.1882384176290985, + "grad_norm": 0.46930550074800953, + "learning_rate": 1.0504351846809585e-06, + "loss": 0.536, + "step": 15777 + }, + { + "epoch": 4.188503916102483, + "grad_norm": 0.4791523379790535, + "learning_rate": 1.0501507502028339e-06, + "loss": 0.5464, + "step": 15778 + }, + { + "epoch": 4.188769414575866, + "grad_norm": 0.4781422789645589, + "learning_rate": 1.0498663440003185e-06, + "loss": 0.5413, + "step": 15779 + }, + { + "epoch": 4.18903491304925, + "grad_norm": 0.46019783224004224, + "learning_rate": 1.0495819660789607e-06, + "loss": 0.5148, + "step": 15780 + }, + { + "epoch": 4.189300411522634, + "grad_norm": 0.4773654089268839, + "learning_rate": 1.049297616444305e-06, + "loss": 0.543, + "step": 15781 + }, + { + "epoch": 4.189565909996017, + "grad_norm": 0.4825802363934642, + "learning_rate": 1.0490132951018986e-06, + "loss": 0.5119, + "step": 15782 + }, + { + "epoch": 4.189831408469401, + "grad_norm": 0.48507517733597844, + "learning_rate": 1.0487290020572857e-06, + "loss": 0.5426, + "step": 15783 + }, + { + "epoch": 4.190096906942785, + "grad_norm": 0.4897126798205443, + "learning_rate": 1.0484447373160106e-06, + "loss": 0.5554, + "step": 15784 + }, + { + "epoch": 4.190362405416169, + "grad_norm": 0.49361586397445456, + "learning_rate": 1.048160500883616e-06, + "loss": 0.5396, + "step": 15785 + }, + { + "epoch": 4.190627903889553, + "grad_norm": 0.4632751690889525, + "learning_rate": 1.0478762927656474e-06, + "loss": 0.5256, + "step": 15786 + }, + { + "epoch": 4.190893402362937, + "grad_norm": 0.4715623999435698, + "learning_rate": 1.0475921129676455e-06, + "loss": 0.5667, + "step": 15787 + }, + { + "epoch": 4.19115890083632, + "grad_norm": 0.46614743397444725, + "learning_rate": 1.0473079614951546e-06, + "loss": 0.5153, + "step": 15788 + }, + { + "epoch": 4.191424399309704, + "grad_norm": 0.5017620379302506, + "learning_rate": 1.0470238383537152e-06, + "loss": 0.5663, + "step": 15789 + }, + { + "epoch": 4.191689897783088, + "grad_norm": 0.48458098953727874, + "learning_rate": 1.0467397435488673e-06, + "loss": 0.5731, + "step": 15790 + }, + { + "epoch": 4.191955396256471, + "grad_norm": 0.4768514341780122, + "learning_rate": 1.046455677086154e-06, + "loss": 0.5482, + "step": 15791 + }, + { + "epoch": 4.192220894729855, + "grad_norm": 0.4841196781653079, + "learning_rate": 1.0461716389711128e-06, + "loss": 0.5392, + "step": 15792 + }, + { + "epoch": 4.1924863932032395, + "grad_norm": 0.4684832148464701, + "learning_rate": 1.0458876292092848e-06, + "loss": 0.5369, + "step": 15793 + }, + { + "epoch": 4.192751891676623, + "grad_norm": 0.47899904644737745, + "learning_rate": 1.0456036478062085e-06, + "loss": 0.5414, + "step": 15794 + }, + { + "epoch": 4.193017390150007, + "grad_norm": 0.4766981606065527, + "learning_rate": 1.0453196947674212e-06, + "loss": 0.5394, + "step": 15795 + }, + { + "epoch": 4.193282888623391, + "grad_norm": 0.4794784140554601, + "learning_rate": 1.0450357700984624e-06, + "loss": 0.5286, + "step": 15796 + }, + { + "epoch": 4.193548387096774, + "grad_norm": 0.4696173030359798, + "learning_rate": 1.0447518738048676e-06, + "loss": 0.5337, + "step": 15797 + }, + { + "epoch": 4.193813885570158, + "grad_norm": 0.4754889404917988, + "learning_rate": 1.044468005892175e-06, + "loss": 0.5589, + "step": 15798 + }, + { + "epoch": 4.194079384043542, + "grad_norm": 0.48365008091434597, + "learning_rate": 1.0441841663659193e-06, + "loss": 0.5751, + "step": 15799 + }, + { + "epoch": 4.194344882516925, + "grad_norm": 0.4739409824701588, + "learning_rate": 1.0439003552316387e-06, + "loss": 0.5473, + "step": 15800 + }, + { + "epoch": 4.1946103809903095, + "grad_norm": 0.4740853854824402, + "learning_rate": 1.0436165724948644e-06, + "loss": 0.5344, + "step": 15801 + }, + { + "epoch": 4.194875879463693, + "grad_norm": 0.48335579784914723, + "learning_rate": 1.043332818161134e-06, + "loss": 0.587, + "step": 15802 + }, + { + "epoch": 4.195141377937077, + "grad_norm": 0.4628608439969883, + "learning_rate": 1.0430490922359787e-06, + "loss": 0.5156, + "step": 15803 + }, + { + "epoch": 4.195406876410461, + "grad_norm": 0.46762070763234215, + "learning_rate": 1.0427653947249345e-06, + "loss": 0.5044, + "step": 15804 + }, + { + "epoch": 4.195672374883844, + "grad_norm": 0.4660179700702556, + "learning_rate": 1.0424817256335324e-06, + "loss": 0.5377, + "step": 15805 + }, + { + "epoch": 4.195937873357228, + "grad_norm": 0.47089826122905587, + "learning_rate": 1.042198084967306e-06, + "loss": 0.5221, + "step": 15806 + }, + { + "epoch": 4.196203371830612, + "grad_norm": 0.47687851371220574, + "learning_rate": 1.041914472731786e-06, + "loss": 0.5234, + "step": 15807 + }, + { + "epoch": 4.196468870303995, + "grad_norm": 0.48087517686309145, + "learning_rate": 1.041630888932503e-06, + "loss": 0.5373, + "step": 15808 + }, + { + "epoch": 4.1967343687773795, + "grad_norm": 0.48255578804077875, + "learning_rate": 1.0413473335749894e-06, + "loss": 0.5427, + "step": 15809 + }, + { + "epoch": 4.196999867250764, + "grad_norm": 0.4791727927412968, + "learning_rate": 1.0410638066647733e-06, + "loss": 0.517, + "step": 15810 + }, + { + "epoch": 4.197265365724147, + "grad_norm": 0.4876404795385016, + "learning_rate": 1.0407803082073867e-06, + "loss": 0.5307, + "step": 15811 + }, + { + "epoch": 4.197530864197531, + "grad_norm": 0.48055114090260526, + "learning_rate": 1.0404968382083553e-06, + "loss": 0.5251, + "step": 15812 + }, + { + "epoch": 4.197796362670915, + "grad_norm": 0.4796792245686959, + "learning_rate": 1.0402133966732098e-06, + "loss": 0.559, + "step": 15813 + }, + { + "epoch": 4.198061861144298, + "grad_norm": 0.4833996014113798, + "learning_rate": 1.0399299836074762e-06, + "loss": 0.5615, + "step": 15814 + }, + { + "epoch": 4.198327359617682, + "grad_norm": 0.4633363803989168, + "learning_rate": 1.0396465990166829e-06, + "loss": 0.5267, + "step": 15815 + }, + { + "epoch": 4.198592858091066, + "grad_norm": 0.4777288993868714, + "learning_rate": 1.0393632429063575e-06, + "loss": 0.5735, + "step": 15816 + }, + { + "epoch": 4.1988583565644495, + "grad_norm": 0.47420527543503405, + "learning_rate": 1.039079915282025e-06, + "loss": 0.5141, + "step": 15817 + }, + { + "epoch": 4.199123855037834, + "grad_norm": 0.47716209695746187, + "learning_rate": 1.0387966161492111e-06, + "loss": 0.5505, + "step": 15818 + }, + { + "epoch": 4.199389353511218, + "grad_norm": 0.4755603780171045, + "learning_rate": 1.03851334551344e-06, + "loss": 0.5675, + "step": 15819 + }, + { + "epoch": 4.199654851984601, + "grad_norm": 0.47678007924957305, + "learning_rate": 1.0382301033802379e-06, + "loss": 0.5327, + "step": 15820 + }, + { + "epoch": 4.199920350457985, + "grad_norm": 0.48171762282732833, + "learning_rate": 1.0379468897551267e-06, + "loss": 0.5704, + "step": 15821 + }, + { + "epoch": 4.200185848931369, + "grad_norm": 0.490028259144393, + "learning_rate": 1.0376637046436323e-06, + "loss": 0.4882, + "step": 15822 + }, + { + "epoch": 4.200451347404752, + "grad_norm": 0.476688594985123, + "learning_rate": 1.0373805480512755e-06, + "loss": 0.559, + "step": 15823 + }, + { + "epoch": 4.200716845878136, + "grad_norm": 0.48776632700402484, + "learning_rate": 1.0370974199835785e-06, + "loss": 0.5645, + "step": 15824 + }, + { + "epoch": 4.20098234435152, + "grad_norm": 0.48072199869614013, + "learning_rate": 1.0368143204460643e-06, + "loss": 0.5433, + "step": 15825 + }, + { + "epoch": 4.201247842824904, + "grad_norm": 0.4720978912349653, + "learning_rate": 1.0365312494442528e-06, + "loss": 0.5315, + "step": 15826 + }, + { + "epoch": 4.201513341298288, + "grad_norm": 0.4503657636212197, + "learning_rate": 1.0362482069836658e-06, + "loss": 0.5061, + "step": 15827 + }, + { + "epoch": 4.201778839771672, + "grad_norm": 0.4818982649339037, + "learning_rate": 1.0359651930698217e-06, + "loss": 0.5542, + "step": 15828 + }, + { + "epoch": 4.202044338245055, + "grad_norm": 0.4762736540411629, + "learning_rate": 1.0356822077082428e-06, + "loss": 0.5412, + "step": 15829 + }, + { + "epoch": 4.202309836718439, + "grad_norm": 0.47059643406848206, + "learning_rate": 1.0353992509044445e-06, + "loss": 0.5229, + "step": 15830 + }, + { + "epoch": 4.202575335191822, + "grad_norm": 0.46810946096065087, + "learning_rate": 1.0351163226639474e-06, + "loss": 0.5461, + "step": 15831 + }, + { + "epoch": 4.202840833665206, + "grad_norm": 0.47334207261048367, + "learning_rate": 1.0348334229922677e-06, + "loss": 0.5565, + "step": 15832 + }, + { + "epoch": 4.20310633213859, + "grad_norm": 0.4577515337816793, + "learning_rate": 1.0345505518949247e-06, + "loss": 0.5332, + "step": 15833 + }, + { + "epoch": 4.203371830611974, + "grad_norm": 0.4644843714654492, + "learning_rate": 1.034267709377434e-06, + "loss": 0.5295, + "step": 15834 + }, + { + "epoch": 4.203637329085358, + "grad_norm": 0.4786982362174368, + "learning_rate": 1.0339848954453105e-06, + "loss": 0.538, + "step": 15835 + }, + { + "epoch": 4.203902827558742, + "grad_norm": 0.4913151811164922, + "learning_rate": 1.0337021101040724e-06, + "loss": 0.5287, + "step": 15836 + }, + { + "epoch": 4.204168326032125, + "grad_norm": 0.4954858732638189, + "learning_rate": 1.033419353359232e-06, + "loss": 0.5735, + "step": 15837 + }, + { + "epoch": 4.204433824505509, + "grad_norm": 0.4817342773849108, + "learning_rate": 1.0331366252163058e-06, + "loss": 0.5358, + "step": 15838 + }, + { + "epoch": 4.204699322978893, + "grad_norm": 0.47222194218079133, + "learning_rate": 1.0328539256808063e-06, + "loss": 0.5672, + "step": 15839 + }, + { + "epoch": 4.204964821452276, + "grad_norm": 0.4759953838767612, + "learning_rate": 1.032571254758248e-06, + "loss": 0.5564, + "step": 15840 + }, + { + "epoch": 4.20523031992566, + "grad_norm": 0.48462847967122563, + "learning_rate": 1.0322886124541433e-06, + "loss": 0.5759, + "step": 15841 + }, + { + "epoch": 4.2054958183990445, + "grad_norm": 0.47342175752153015, + "learning_rate": 1.0320059987740036e-06, + "loss": 0.5712, + "step": 15842 + }, + { + "epoch": 4.205761316872428, + "grad_norm": 0.48556260003642543, + "learning_rate": 1.0317234137233417e-06, + "loss": 0.5791, + "step": 15843 + }, + { + "epoch": 4.206026815345812, + "grad_norm": 0.4800324194060265, + "learning_rate": 1.0314408573076676e-06, + "loss": 0.5102, + "step": 15844 + }, + { + "epoch": 4.206292313819196, + "grad_norm": 0.47934191640752666, + "learning_rate": 1.031158329532493e-06, + "loss": 0.5344, + "step": 15845 + }, + { + "epoch": 4.206557812292579, + "grad_norm": 0.46728851077621186, + "learning_rate": 1.0308758304033276e-06, + "loss": 0.5172, + "step": 15846 + }, + { + "epoch": 4.206823310765963, + "grad_norm": 0.46783268879461065, + "learning_rate": 1.0305933599256805e-06, + "loss": 0.5473, + "step": 15847 + }, + { + "epoch": 4.207088809239347, + "grad_norm": 0.469213312191495, + "learning_rate": 1.0303109181050597e-06, + "loss": 0.5156, + "step": 15848 + }, + { + "epoch": 4.2073543077127304, + "grad_norm": 0.47580852028287096, + "learning_rate": 1.0300285049469752e-06, + "loss": 0.5303, + "step": 15849 + }, + { + "epoch": 4.2076198061861145, + "grad_norm": 0.4649832905144798, + "learning_rate": 1.029746120456933e-06, + "loss": 0.5637, + "step": 15850 + }, + { + "epoch": 4.207885304659499, + "grad_norm": 0.4639440059770558, + "learning_rate": 1.0294637646404424e-06, + "loss": 0.5388, + "step": 15851 + }, + { + "epoch": 4.208150803132882, + "grad_norm": 0.4743992927242743, + "learning_rate": 1.0291814375030087e-06, + "loss": 0.5252, + "step": 15852 + }, + { + "epoch": 4.208416301606266, + "grad_norm": 0.4753260657613669, + "learning_rate": 1.0288991390501373e-06, + "loss": 0.5135, + "step": 15853 + }, + { + "epoch": 4.208681800079649, + "grad_norm": 0.47742979299859034, + "learning_rate": 1.0286168692873355e-06, + "loss": 0.5805, + "step": 15854 + }, + { + "epoch": 4.208947298553033, + "grad_norm": 0.4778318080148841, + "learning_rate": 1.0283346282201064e-06, + "loss": 0.5714, + "step": 15855 + }, + { + "epoch": 4.209212797026417, + "grad_norm": 0.4827808863498797, + "learning_rate": 1.0280524158539562e-06, + "loss": 0.5412, + "step": 15856 + }, + { + "epoch": 4.2094782954998005, + "grad_norm": 0.46258722103823363, + "learning_rate": 1.027770232194388e-06, + "loss": 0.5117, + "step": 15857 + }, + { + "epoch": 4.2097437939731845, + "grad_norm": 0.47981473072896585, + "learning_rate": 1.0274880772469048e-06, + "loss": 0.5073, + "step": 15858 + }, + { + "epoch": 4.210009292446569, + "grad_norm": 0.4778023426602347, + "learning_rate": 1.0272059510170084e-06, + "loss": 0.5305, + "step": 15859 + }, + { + "epoch": 4.210274790919952, + "grad_norm": 0.47170898785160725, + "learning_rate": 1.026923853510203e-06, + "loss": 0.5163, + "step": 15860 + }, + { + "epoch": 4.210540289393336, + "grad_norm": 0.4723815154284675, + "learning_rate": 1.026641784731988e-06, + "loss": 0.5339, + "step": 15861 + }, + { + "epoch": 4.21080578786672, + "grad_norm": 0.4776143822266705, + "learning_rate": 1.0263597446878663e-06, + "loss": 0.5325, + "step": 15862 + }, + { + "epoch": 4.211071286340103, + "grad_norm": 0.48134166642271997, + "learning_rate": 1.0260777333833388e-06, + "loss": 0.5405, + "step": 15863 + }, + { + "epoch": 4.211336784813487, + "grad_norm": 0.4821551303003591, + "learning_rate": 1.0257957508239028e-06, + "loss": 0.5126, + "step": 15864 + }, + { + "epoch": 4.211602283286871, + "grad_norm": 0.4823589925371034, + "learning_rate": 1.0255137970150598e-06, + "loss": 0.5052, + "step": 15865 + }, + { + "epoch": 4.211867781760255, + "grad_norm": 0.47531023868110134, + "learning_rate": 1.0252318719623072e-06, + "loss": 0.5448, + "step": 15866 + }, + { + "epoch": 4.212133280233639, + "grad_norm": 0.47731256335508954, + "learning_rate": 1.0249499756711447e-06, + "loss": 0.5587, + "step": 15867 + }, + { + "epoch": 4.212398778707023, + "grad_norm": 0.47722080867640504, + "learning_rate": 1.0246681081470685e-06, + "loss": 0.5212, + "step": 15868 + }, + { + "epoch": 4.212664277180406, + "grad_norm": 0.47918871013826214, + "learning_rate": 1.0243862693955772e-06, + "loss": 0.5489, + "step": 15869 + }, + { + "epoch": 4.21292977565379, + "grad_norm": 0.4783584564069484, + "learning_rate": 1.0241044594221666e-06, + "loss": 0.5728, + "step": 15870 + }, + { + "epoch": 4.213195274127174, + "grad_norm": 0.4932118852095281, + "learning_rate": 1.0238226782323318e-06, + "loss": 0.5336, + "step": 15871 + }, + { + "epoch": 4.213460772600557, + "grad_norm": 0.4645038030882401, + "learning_rate": 1.02354092583157e-06, + "loss": 0.524, + "step": 15872 + }, + { + "epoch": 4.213726271073941, + "grad_norm": 0.4704331305131458, + "learning_rate": 1.0232592022253743e-06, + "loss": 0.542, + "step": 15873 + }, + { + "epoch": 4.2139917695473255, + "grad_norm": 0.4748354462780193, + "learning_rate": 1.0229775074192405e-06, + "loss": 0.5219, + "step": 15874 + }, + { + "epoch": 4.214257268020709, + "grad_norm": 0.4808335730089421, + "learning_rate": 1.022695841418662e-06, + "loss": 0.5567, + "step": 15875 + }, + { + "epoch": 4.214522766494093, + "grad_norm": 0.48397537639404015, + "learning_rate": 1.0224142042291313e-06, + "loss": 0.531, + "step": 15876 + }, + { + "epoch": 4.214788264967477, + "grad_norm": 0.4913913781668344, + "learning_rate": 1.0221325958561406e-06, + "loss": 0.5878, + "step": 15877 + }, + { + "epoch": 4.21505376344086, + "grad_norm": 0.4871073066019187, + "learning_rate": 1.0218510163051837e-06, + "loss": 0.5359, + "step": 15878 + }, + { + "epoch": 4.215319261914244, + "grad_norm": 0.46763669227248683, + "learning_rate": 1.02156946558175e-06, + "loss": 0.5034, + "step": 15879 + }, + { + "epoch": 4.215584760387628, + "grad_norm": 0.4879447462348556, + "learning_rate": 1.021287943691333e-06, + "loss": 0.5373, + "step": 15880 + }, + { + "epoch": 4.215850258861011, + "grad_norm": 0.47483084478082516, + "learning_rate": 1.021006450639421e-06, + "loss": 0.5077, + "step": 15881 + }, + { + "epoch": 4.2161157573343955, + "grad_norm": 0.493133603760487, + "learning_rate": 1.0207249864315038e-06, + "loss": 0.5385, + "step": 15882 + }, + { + "epoch": 4.216381255807779, + "grad_norm": 0.4748920336912919, + "learning_rate": 1.020443551073072e-06, + "loss": 0.5767, + "step": 15883 + }, + { + "epoch": 4.216646754281163, + "grad_norm": 0.4707026778562516, + "learning_rate": 1.0201621445696128e-06, + "loss": 0.5492, + "step": 15884 + }, + { + "epoch": 4.216912252754547, + "grad_norm": 0.48389348468213805, + "learning_rate": 1.0198807669266158e-06, + "loss": 0.544, + "step": 15885 + }, + { + "epoch": 4.21717775122793, + "grad_norm": 0.4750360799136636, + "learning_rate": 1.019599418149568e-06, + "loss": 0.5662, + "step": 15886 + }, + { + "epoch": 4.217443249701314, + "grad_norm": 0.4705706520969915, + "learning_rate": 1.0193180982439552e-06, + "loss": 0.5387, + "step": 15887 + }, + { + "epoch": 4.217708748174698, + "grad_norm": 0.47822659720091076, + "learning_rate": 1.0190368072152657e-06, + "loss": 0.5658, + "step": 15888 + }, + { + "epoch": 4.217974246648081, + "grad_norm": 0.4675187228909948, + "learning_rate": 1.0187555450689837e-06, + "loss": 0.5441, + "step": 15889 + }, + { + "epoch": 4.2182397451214655, + "grad_norm": 0.4697110696289429, + "learning_rate": 1.018474311810596e-06, + "loss": 0.5252, + "step": 15890 + }, + { + "epoch": 4.21850524359485, + "grad_norm": 0.48491602632538877, + "learning_rate": 1.0181931074455858e-06, + "loss": 0.5532, + "step": 15891 + }, + { + "epoch": 4.218770742068233, + "grad_norm": 0.4644217257885918, + "learning_rate": 1.0179119319794399e-06, + "loss": 0.5143, + "step": 15892 + }, + { + "epoch": 4.219036240541617, + "grad_norm": 0.4675934969381847, + "learning_rate": 1.0176307854176383e-06, + "loss": 0.5121, + "step": 15893 + }, + { + "epoch": 4.219301739015001, + "grad_norm": 0.4746763685315101, + "learning_rate": 1.0173496677656666e-06, + "loss": 0.5564, + "step": 15894 + }, + { + "epoch": 4.219567237488384, + "grad_norm": 0.47304827154072404, + "learning_rate": 1.0170685790290058e-06, + "loss": 0.5236, + "step": 15895 + }, + { + "epoch": 4.219832735961768, + "grad_norm": 0.4653343826433345, + "learning_rate": 1.0167875192131394e-06, + "loss": 0.5064, + "step": 15896 + }, + { + "epoch": 4.220098234435152, + "grad_norm": 0.4740909098740407, + "learning_rate": 1.016506488323547e-06, + "loss": 0.5213, + "step": 15897 + }, + { + "epoch": 4.2203637329085355, + "grad_norm": 0.47913438400532027, + "learning_rate": 1.0162254863657112e-06, + "loss": 0.5668, + "step": 15898 + }, + { + "epoch": 4.22062923138192, + "grad_norm": 0.48515452996387065, + "learning_rate": 1.0159445133451116e-06, + "loss": 0.5291, + "step": 15899 + }, + { + "epoch": 4.220894729855304, + "grad_norm": 0.4814378623759041, + "learning_rate": 1.0156635692672261e-06, + "loss": 0.5503, + "step": 15900 + }, + { + "epoch": 4.221160228328687, + "grad_norm": 0.4644732072739585, + "learning_rate": 1.0153826541375366e-06, + "loss": 0.5109, + "step": 15901 + }, + { + "epoch": 4.221425726802071, + "grad_norm": 0.4739078262498862, + "learning_rate": 1.0151017679615194e-06, + "loss": 0.5133, + "step": 15902 + }, + { + "epoch": 4.221691225275455, + "grad_norm": 0.4775277760577075, + "learning_rate": 1.014820910744654e-06, + "loss": 0.5482, + "step": 15903 + }, + { + "epoch": 4.221956723748838, + "grad_norm": 0.4771427804362491, + "learning_rate": 1.0145400824924176e-06, + "loss": 0.5437, + "step": 15904 + }, + { + "epoch": 4.222222222222222, + "grad_norm": 0.47618627437789246, + "learning_rate": 1.0142592832102863e-06, + "loss": 0.512, + "step": 15905 + }, + { + "epoch": 4.222487720695606, + "grad_norm": 0.47847148916228677, + "learning_rate": 1.013978512903736e-06, + "loss": 0.5426, + "step": 15906 + }, + { + "epoch": 4.22275321916899, + "grad_norm": 0.4852771486454056, + "learning_rate": 1.0136977715782431e-06, + "loss": 0.5464, + "step": 15907 + }, + { + "epoch": 4.223018717642374, + "grad_norm": 0.4692580065780568, + "learning_rate": 1.0134170592392837e-06, + "loss": 0.5396, + "step": 15908 + }, + { + "epoch": 4.223284216115758, + "grad_norm": 0.4688993432918405, + "learning_rate": 1.013136375892331e-06, + "loss": 0.5182, + "step": 15909 + }, + { + "epoch": 4.223549714589141, + "grad_norm": 0.4800633816061315, + "learning_rate": 1.0128557215428597e-06, + "loss": 0.5347, + "step": 15910 + }, + { + "epoch": 4.223815213062525, + "grad_norm": 0.4645576836122541, + "learning_rate": 1.0125750961963424e-06, + "loss": 0.5181, + "step": 15911 + }, + { + "epoch": 4.224080711535908, + "grad_norm": 0.4769913040956293, + "learning_rate": 1.0122944998582528e-06, + "loss": 0.5413, + "step": 15912 + }, + { + "epoch": 4.224346210009292, + "grad_norm": 0.4757370444146832, + "learning_rate": 1.0120139325340627e-06, + "loss": 0.5454, + "step": 15913 + }, + { + "epoch": 4.224611708482676, + "grad_norm": 0.47129215624269116, + "learning_rate": 1.0117333942292448e-06, + "loss": 0.5925, + "step": 15914 + }, + { + "epoch": 4.22487720695606, + "grad_norm": 0.4818781799857873, + "learning_rate": 1.0114528849492698e-06, + "loss": 0.5233, + "step": 15915 + }, + { + "epoch": 4.225142705429444, + "grad_norm": 0.48251590852443565, + "learning_rate": 1.0111724046996069e-06, + "loss": 0.5283, + "step": 15916 + }, + { + "epoch": 4.225408203902828, + "grad_norm": 0.4816599107045372, + "learning_rate": 1.0108919534857284e-06, + "loss": 0.5568, + "step": 15917 + }, + { + "epoch": 4.225673702376211, + "grad_norm": 0.5001858710165548, + "learning_rate": 1.010611531313102e-06, + "loss": 0.5933, + "step": 15918 + }, + { + "epoch": 4.225939200849595, + "grad_norm": 0.4863384905923578, + "learning_rate": 1.0103311381871983e-06, + "loss": 0.5811, + "step": 15919 + }, + { + "epoch": 4.226204699322979, + "grad_norm": 0.49967567532830104, + "learning_rate": 1.0100507741134836e-06, + "loss": 0.562, + "step": 15920 + }, + { + "epoch": 4.226470197796362, + "grad_norm": 0.48503391419084313, + "learning_rate": 1.009770439097429e-06, + "loss": 0.527, + "step": 15921 + }, + { + "epoch": 4.226735696269746, + "grad_norm": 0.47518587412633456, + "learning_rate": 1.0094901331444974e-06, + "loss": 0.5169, + "step": 15922 + }, + { + "epoch": 4.2270011947431305, + "grad_norm": 0.48262071059950545, + "learning_rate": 1.0092098562601585e-06, + "loss": 0.535, + "step": 15923 + }, + { + "epoch": 4.227266693216514, + "grad_norm": 0.47111494732321507, + "learning_rate": 1.0089296084498768e-06, + "loss": 0.5162, + "step": 15924 + }, + { + "epoch": 4.227532191689898, + "grad_norm": 0.4630830746118903, + "learning_rate": 1.0086493897191194e-06, + "loss": 0.5276, + "step": 15925 + }, + { + "epoch": 4.227797690163282, + "grad_norm": 0.4808478967248429, + "learning_rate": 1.0083692000733495e-06, + "loss": 0.5257, + "step": 15926 + }, + { + "epoch": 4.228063188636665, + "grad_norm": 0.46938887313113176, + "learning_rate": 1.008089039518033e-06, + "loss": 0.5148, + "step": 15927 + }, + { + "epoch": 4.228328687110049, + "grad_norm": 0.4708743551013019, + "learning_rate": 1.0078089080586331e-06, + "loss": 0.5197, + "step": 15928 + }, + { + "epoch": 4.228594185583433, + "grad_norm": 0.47646650794977047, + "learning_rate": 1.0075288057006122e-06, + "loss": 0.5469, + "step": 15929 + }, + { + "epoch": 4.2288596840568164, + "grad_norm": 0.4741249961849285, + "learning_rate": 1.0072487324494347e-06, + "loss": 0.5355, + "step": 15930 + }, + { + "epoch": 4.2291251825302005, + "grad_norm": 0.4804513957740347, + "learning_rate": 1.0069686883105608e-06, + "loss": 0.5556, + "step": 15931 + }, + { + "epoch": 4.229390681003585, + "grad_norm": 0.48234061686951074, + "learning_rate": 1.006688673289454e-06, + "loss": 0.5022, + "step": 15932 + }, + { + "epoch": 4.229656179476968, + "grad_norm": 0.47688139674142555, + "learning_rate": 1.0064086873915743e-06, + "loss": 0.5431, + "step": 15933 + }, + { + "epoch": 4.229921677950352, + "grad_norm": 0.47863507090959995, + "learning_rate": 1.0061287306223814e-06, + "loss": 0.5518, + "step": 15934 + }, + { + "epoch": 4.230187176423736, + "grad_norm": 0.4910866207723701, + "learning_rate": 1.0058488029873367e-06, + "loss": 0.5474, + "step": 15935 + }, + { + "epoch": 4.230452674897119, + "grad_norm": 0.474248423892993, + "learning_rate": 1.0055689044918979e-06, + "loss": 0.5491, + "step": 15936 + }, + { + "epoch": 4.230718173370503, + "grad_norm": 0.48336703789497126, + "learning_rate": 1.005289035141525e-06, + "loss": 0.502, + "step": 15937 + }, + { + "epoch": 4.230983671843887, + "grad_norm": 0.4838438719219541, + "learning_rate": 1.0050091949416758e-06, + "loss": 0.5465, + "step": 15938 + }, + { + "epoch": 4.2312491703172705, + "grad_norm": 0.48221486111858924, + "learning_rate": 1.0047293838978076e-06, + "loss": 0.5725, + "step": 15939 + }, + { + "epoch": 4.231514668790655, + "grad_norm": 0.4783826217859718, + "learning_rate": 1.0044496020153766e-06, + "loss": 0.5718, + "step": 15940 + }, + { + "epoch": 4.231780167264038, + "grad_norm": 0.48866707437572793, + "learning_rate": 1.004169849299841e-06, + "loss": 0.5707, + "step": 15941 + }, + { + "epoch": 4.232045665737422, + "grad_norm": 0.4886087644318049, + "learning_rate": 1.0038901257566549e-06, + "loss": 0.5392, + "step": 15942 + }, + { + "epoch": 4.232311164210806, + "grad_norm": 0.46473920359812265, + "learning_rate": 1.0036104313912753e-06, + "loss": 0.5145, + "step": 15943 + }, + { + "epoch": 4.232576662684189, + "grad_norm": 0.47626016489057527, + "learning_rate": 1.0033307662091564e-06, + "loss": 0.5312, + "step": 15944 + }, + { + "epoch": 4.232842161157573, + "grad_norm": 0.47641897969300506, + "learning_rate": 1.0030511302157507e-06, + "loss": 0.5343, + "step": 15945 + }, + { + "epoch": 4.233107659630957, + "grad_norm": 0.4804188349795335, + "learning_rate": 1.0027715234165141e-06, + "loss": 0.5571, + "step": 15946 + }, + { + "epoch": 4.233373158104341, + "grad_norm": 0.4809974956275609, + "learning_rate": 1.002491945816898e-06, + "loss": 0.5447, + "step": 15947 + }, + { + "epoch": 4.233638656577725, + "grad_norm": 0.48272081441051834, + "learning_rate": 1.0022123974223561e-06, + "loss": 0.5083, + "step": 15948 + }, + { + "epoch": 4.233904155051109, + "grad_norm": 0.4799969637914395, + "learning_rate": 1.001932878238339e-06, + "loss": 0.5793, + "step": 15949 + }, + { + "epoch": 4.234169653524492, + "grad_norm": 0.48157006782917455, + "learning_rate": 1.0016533882703003e-06, + "loss": 0.5167, + "step": 15950 + }, + { + "epoch": 4.234435151997876, + "grad_norm": 0.4745047614306491, + "learning_rate": 1.0013739275236873e-06, + "loss": 0.5536, + "step": 15951 + }, + { + "epoch": 4.23470065047126, + "grad_norm": 0.47684744179159017, + "learning_rate": 1.001094496003952e-06, + "loss": 0.5862, + "step": 15952 + }, + { + "epoch": 4.234966148944643, + "grad_norm": 0.4743926638992742, + "learning_rate": 1.000815093716545e-06, + "loss": 0.5328, + "step": 15953 + }, + { + "epoch": 4.235231647418027, + "grad_norm": 0.4726109594210413, + "learning_rate": 1.0005357206669133e-06, + "loss": 0.558, + "step": 15954 + }, + { + "epoch": 4.2354971458914115, + "grad_norm": 0.4884984905044093, + "learning_rate": 1.000256376860508e-06, + "loss": 0.5364, + "step": 15955 + }, + { + "epoch": 4.235762644364795, + "grad_norm": 0.4719704050174625, + "learning_rate": 9.99977062302774e-07, + "loss": 0.5442, + "step": 15956 + }, + { + "epoch": 4.236028142838179, + "grad_norm": 0.46870701889438227, + "learning_rate": 9.996977769991606e-07, + "loss": 0.5382, + "step": 15957 + }, + { + "epoch": 4.236293641311563, + "grad_norm": 0.46666157490804416, + "learning_rate": 9.99418520955113e-07, + "loss": 0.54, + "step": 15958 + }, + { + "epoch": 4.236559139784946, + "grad_norm": 0.46596040893020063, + "learning_rate": 9.991392941760794e-07, + "loss": 0.5205, + "step": 15959 + }, + { + "epoch": 4.23682463825833, + "grad_norm": 0.47799301608879863, + "learning_rate": 9.988600966675032e-07, + "loss": 0.5393, + "step": 15960 + }, + { + "epoch": 4.237090136731714, + "grad_norm": 0.46859255423847684, + "learning_rate": 9.985809284348317e-07, + "loss": 0.5061, + "step": 15961 + }, + { + "epoch": 4.237355635205097, + "grad_norm": 0.46207119164410576, + "learning_rate": 9.983017894835079e-07, + "loss": 0.54, + "step": 15962 + }, + { + "epoch": 4.2376211336784815, + "grad_norm": 0.47510397793530307, + "learning_rate": 9.980226798189752e-07, + "loss": 0.539, + "step": 15963 + }, + { + "epoch": 4.237886632151865, + "grad_norm": 0.49287419808481087, + "learning_rate": 9.977435994466787e-07, + "loss": 0.5415, + "step": 15964 + }, + { + "epoch": 4.238152130625249, + "grad_norm": 0.47409514978860545, + "learning_rate": 9.974645483720591e-07, + "loss": 0.5116, + "step": 15965 + }, + { + "epoch": 4.238417629098633, + "grad_norm": 0.47710044379054645, + "learning_rate": 9.971855266005605e-07, + "loss": 0.5231, + "step": 15966 + }, + { + "epoch": 4.238683127572016, + "grad_norm": 0.47780619371812816, + "learning_rate": 9.96906534137624e-07, + "loss": 0.5509, + "step": 15967 + }, + { + "epoch": 4.2389486260454, + "grad_norm": 0.4779700258715163, + "learning_rate": 9.966275709886899e-07, + "loss": 0.5296, + "step": 15968 + }, + { + "epoch": 4.239214124518784, + "grad_norm": 0.4875739553432791, + "learning_rate": 9.963486371591984e-07, + "loss": 0.5316, + "step": 15969 + }, + { + "epoch": 4.239479622992167, + "grad_norm": 0.47674675590545296, + "learning_rate": 9.960697326545909e-07, + "loss": 0.5646, + "step": 15970 + }, + { + "epoch": 4.2397451214655515, + "grad_norm": 0.47229727053842213, + "learning_rate": 9.95790857480305e-07, + "loss": 0.517, + "step": 15971 + }, + { + "epoch": 4.240010619938936, + "grad_norm": 0.47586131703811907, + "learning_rate": 9.95512011641781e-07, + "loss": 0.5261, + "step": 15972 + }, + { + "epoch": 4.240276118412319, + "grad_norm": 0.47888792352102183, + "learning_rate": 9.952331951444566e-07, + "loss": 0.5759, + "step": 15973 + }, + { + "epoch": 4.240541616885703, + "grad_norm": 0.46001179587519053, + "learning_rate": 9.949544079937684e-07, + "loss": 0.5085, + "step": 15974 + }, + { + "epoch": 4.240807115359087, + "grad_norm": 0.4757442630119836, + "learning_rate": 9.946756501951549e-07, + "loss": 0.5233, + "step": 15975 + }, + { + "epoch": 4.24107261383247, + "grad_norm": 0.4845871760270505, + "learning_rate": 9.94396921754051e-07, + "loss": 0.5206, + "step": 15976 + }, + { + "epoch": 4.241338112305854, + "grad_norm": 0.47762528619974337, + "learning_rate": 9.941182226758948e-07, + "loss": 0.5697, + "step": 15977 + }, + { + "epoch": 4.241603610779238, + "grad_norm": 0.4761888890384406, + "learning_rate": 9.938395529661188e-07, + "loss": 0.5209, + "step": 15978 + }, + { + "epoch": 4.2418691092526215, + "grad_norm": 0.4635347399061296, + "learning_rate": 9.935609126301603e-07, + "loss": 0.5024, + "step": 15979 + }, + { + "epoch": 4.242134607726006, + "grad_norm": 0.4744707139161358, + "learning_rate": 9.932823016734528e-07, + "loss": 0.5531, + "step": 15980 + }, + { + "epoch": 4.24240010619939, + "grad_norm": 0.48150205727966855, + "learning_rate": 9.930037201014282e-07, + "loss": 0.5269, + "step": 15981 + }, + { + "epoch": 4.242665604672773, + "grad_norm": 0.482923144486339, + "learning_rate": 9.927251679195218e-07, + "loss": 0.5324, + "step": 15982 + }, + { + "epoch": 4.242931103146157, + "grad_norm": 0.47463125858726435, + "learning_rate": 9.92446645133164e-07, + "loss": 0.5342, + "step": 15983 + }, + { + "epoch": 4.243196601619541, + "grad_norm": 0.476528507446879, + "learning_rate": 9.9216815174779e-07, + "loss": 0.5265, + "step": 15984 + }, + { + "epoch": 4.243462100092924, + "grad_norm": 0.4830035630828717, + "learning_rate": 9.918896877688266e-07, + "loss": 0.5248, + "step": 15985 + }, + { + "epoch": 4.243727598566308, + "grad_norm": 0.4869457089488714, + "learning_rate": 9.91611253201708e-07, + "loss": 0.5499, + "step": 15986 + }, + { + "epoch": 4.243993097039692, + "grad_norm": 0.47789855810616927, + "learning_rate": 9.91332848051862e-07, + "loss": 0.5478, + "step": 15987 + }, + { + "epoch": 4.244258595513076, + "grad_norm": 0.48049778461741777, + "learning_rate": 9.910544723247204e-07, + "loss": 0.5003, + "step": 15988 + }, + { + "epoch": 4.24452409398646, + "grad_norm": 0.48802707897228115, + "learning_rate": 9.907761260257101e-07, + "loss": 0.5394, + "step": 15989 + }, + { + "epoch": 4.244789592459844, + "grad_norm": 0.4784880012741722, + "learning_rate": 9.904978091602615e-07, + "loss": 0.5395, + "step": 15990 + }, + { + "epoch": 4.245055090933227, + "grad_norm": 0.46522718711992217, + "learning_rate": 9.902195217338014e-07, + "loss": 0.5172, + "step": 15991 + }, + { + "epoch": 4.245320589406611, + "grad_norm": 0.4783737405844053, + "learning_rate": 9.899412637517567e-07, + "loss": 0.561, + "step": 15992 + }, + { + "epoch": 4.245586087879994, + "grad_norm": 0.47611464593511765, + "learning_rate": 9.896630352195553e-07, + "loss": 0.5338, + "step": 15993 + }, + { + "epoch": 4.245851586353378, + "grad_norm": 0.47763745446218864, + "learning_rate": 9.893848361426213e-07, + "loss": 0.5112, + "step": 15994 + }, + { + "epoch": 4.246117084826762, + "grad_norm": 0.4690369275846122, + "learning_rate": 9.891066665263831e-07, + "loss": 0.5231, + "step": 15995 + }, + { + "epoch": 4.246382583300146, + "grad_norm": 0.47004676018678654, + "learning_rate": 9.88828526376264e-07, + "loss": 0.527, + "step": 15996 + }, + { + "epoch": 4.24664808177353, + "grad_norm": 0.47394702047689996, + "learning_rate": 9.88550415697689e-07, + "loss": 0.5603, + "step": 15997 + }, + { + "epoch": 4.246913580246914, + "grad_norm": 0.4695180362922194, + "learning_rate": 9.8827233449608e-07, + "loss": 0.5024, + "step": 15998 + }, + { + "epoch": 4.247179078720297, + "grad_norm": 0.48834802311827824, + "learning_rate": 9.879942827768621e-07, + "loss": 0.4921, + "step": 15999 + }, + { + "epoch": 4.247444577193681, + "grad_norm": 0.49290687926842247, + "learning_rate": 9.877162605454587e-07, + "loss": 0.5302, + "step": 16000 + }, + { + "epoch": 4.247710075667065, + "grad_norm": 0.4769398085062818, + "learning_rate": 9.8743826780729e-07, + "loss": 0.5455, + "step": 16001 + }, + { + "epoch": 4.247975574140448, + "grad_norm": 0.49579909742191913, + "learning_rate": 9.871603045677802e-07, + "loss": 0.5499, + "step": 16002 + }, + { + "epoch": 4.248241072613832, + "grad_norm": 0.47489923116737565, + "learning_rate": 9.86882370832347e-07, + "loss": 0.5246, + "step": 16003 + }, + { + "epoch": 4.2485065710872165, + "grad_norm": 0.46381284918068677, + "learning_rate": 9.866044666064128e-07, + "loss": 0.5444, + "step": 16004 + }, + { + "epoch": 4.2487720695606, + "grad_norm": 0.4795434403804182, + "learning_rate": 9.863265918953962e-07, + "loss": 0.5286, + "step": 16005 + }, + { + "epoch": 4.249037568033984, + "grad_norm": 0.48327450098696323, + "learning_rate": 9.86048746704718e-07, + "loss": 0.5807, + "step": 16006 + }, + { + "epoch": 4.249303066507368, + "grad_norm": 0.47626729010875246, + "learning_rate": 9.857709310397962e-07, + "loss": 0.5285, + "step": 16007 + }, + { + "epoch": 4.249568564980751, + "grad_norm": 0.4843302235506611, + "learning_rate": 9.854931449060479e-07, + "loss": 0.5509, + "step": 16008 + }, + { + "epoch": 4.249834063454135, + "grad_norm": 0.48323191514862807, + "learning_rate": 9.852153883088922e-07, + "loss": 0.5635, + "step": 16009 + }, + { + "epoch": 4.250099561927519, + "grad_norm": 0.46662611076328203, + "learning_rate": 9.849376612537442e-07, + "loss": 0.5314, + "step": 16010 + }, + { + "epoch": 4.2503650604009025, + "grad_norm": 0.47660802677753505, + "learning_rate": 9.846599637460224e-07, + "loss": 0.5433, + "step": 16011 + }, + { + "epoch": 4.2506305588742865, + "grad_norm": 0.4935621095105905, + "learning_rate": 9.843822957911405e-07, + "loss": 0.5442, + "step": 16012 + }, + { + "epoch": 4.250896057347671, + "grad_norm": 0.48849525065551946, + "learning_rate": 9.841046573945165e-07, + "loss": 0.5341, + "step": 16013 + }, + { + "epoch": 4.251161555821054, + "grad_norm": 0.4777556110114682, + "learning_rate": 9.838270485615613e-07, + "loss": 0.5513, + "step": 16014 + }, + { + "epoch": 4.251427054294438, + "grad_norm": 0.4736586790615974, + "learning_rate": 9.83549469297692e-07, + "loss": 0.5341, + "step": 16015 + }, + { + "epoch": 4.251692552767822, + "grad_norm": 0.4783070597224486, + "learning_rate": 9.8327191960832e-07, + "loss": 0.5588, + "step": 16016 + }, + { + "epoch": 4.251958051241205, + "grad_norm": 0.4835368521055435, + "learning_rate": 9.829943994988603e-07, + "loss": 0.5481, + "step": 16017 + }, + { + "epoch": 4.252223549714589, + "grad_norm": 0.4870136685130338, + "learning_rate": 9.82716908974723e-07, + "loss": 0.5815, + "step": 16018 + }, + { + "epoch": 4.252489048187973, + "grad_norm": 0.46232329302794406, + "learning_rate": 9.82439448041322e-07, + "loss": 0.5575, + "step": 16019 + }, + { + "epoch": 4.2527545466613565, + "grad_norm": 0.46938662065875303, + "learning_rate": 9.821620167040669e-07, + "loss": 0.5462, + "step": 16020 + }, + { + "epoch": 4.253020045134741, + "grad_norm": 0.48660290109037174, + "learning_rate": 9.818846149683683e-07, + "loss": 0.5134, + "step": 16021 + }, + { + "epoch": 4.253285543608124, + "grad_norm": 0.47521846723545547, + "learning_rate": 9.816072428396376e-07, + "loss": 0.5336, + "step": 16022 + }, + { + "epoch": 4.253551042081508, + "grad_norm": 0.47149676881478686, + "learning_rate": 9.81329900323282e-07, + "loss": 0.5338, + "step": 16023 + }, + { + "epoch": 4.253816540554892, + "grad_norm": 0.4763948392473039, + "learning_rate": 9.810525874247128e-07, + "loss": 0.5734, + "step": 16024 + }, + { + "epoch": 4.254082039028275, + "grad_norm": 0.472106145233036, + "learning_rate": 9.807753041493374e-07, + "loss": 0.5572, + "step": 16025 + }, + { + "epoch": 4.254347537501659, + "grad_norm": 0.48380889344319267, + "learning_rate": 9.804980505025621e-07, + "loss": 0.5487, + "step": 16026 + }, + { + "epoch": 4.254613035975043, + "grad_norm": 0.4725136356423027, + "learning_rate": 9.802208264897965e-07, + "loss": 0.5697, + "step": 16027 + }, + { + "epoch": 4.254878534448427, + "grad_norm": 0.4738780429007621, + "learning_rate": 9.799436321164446e-07, + "loss": 0.5272, + "step": 16028 + }, + { + "epoch": 4.255144032921811, + "grad_norm": 0.4728705996349, + "learning_rate": 9.796664673879143e-07, + "loss": 0.524, + "step": 16029 + }, + { + "epoch": 4.255409531395195, + "grad_norm": 0.4842737139152287, + "learning_rate": 9.793893323096106e-07, + "loss": 0.5855, + "step": 16030 + }, + { + "epoch": 4.255675029868578, + "grad_norm": 0.4880141373677577, + "learning_rate": 9.791122268869383e-07, + "loss": 0.5411, + "step": 16031 + }, + { + "epoch": 4.255940528341962, + "grad_norm": 0.4805442288193504, + "learning_rate": 9.788351511252999e-07, + "loss": 0.5225, + "step": 16032 + }, + { + "epoch": 4.256206026815346, + "grad_norm": 0.4814089510712732, + "learning_rate": 9.785581050301019e-07, + "loss": 0.5534, + "step": 16033 + }, + { + "epoch": 4.256471525288729, + "grad_norm": 0.4794097831717837, + "learning_rate": 9.782810886067448e-07, + "loss": 0.569, + "step": 16034 + }, + { + "epoch": 4.256737023762113, + "grad_norm": 0.482781456838931, + "learning_rate": 9.78004101860633e-07, + "loss": 0.5611, + "step": 16035 + }, + { + "epoch": 4.2570025222354975, + "grad_norm": 0.4715717009632437, + "learning_rate": 9.77727144797168e-07, + "loss": 0.5392, + "step": 16036 + }, + { + "epoch": 4.257268020708881, + "grad_norm": 0.49242737961084665, + "learning_rate": 9.7745021742175e-07, + "loss": 0.5435, + "step": 16037 + }, + { + "epoch": 4.257533519182265, + "grad_norm": 0.4857115041572971, + "learning_rate": 9.771733197397815e-07, + "loss": 0.5821, + "step": 16038 + }, + { + "epoch": 4.257799017655649, + "grad_norm": 0.47833497809488323, + "learning_rate": 9.768964517566607e-07, + "loss": 0.5398, + "step": 16039 + }, + { + "epoch": 4.258064516129032, + "grad_norm": 0.48023255944996, + "learning_rate": 9.766196134777893e-07, + "loss": 0.5264, + "step": 16040 + }, + { + "epoch": 4.258330014602416, + "grad_norm": 0.4724451195861857, + "learning_rate": 9.763428049085643e-07, + "loss": 0.5303, + "step": 16041 + }, + { + "epoch": 4.2585955130758, + "grad_norm": 0.47375952723417264, + "learning_rate": 9.76066026054387e-07, + "loss": 0.5233, + "step": 16042 + }, + { + "epoch": 4.258861011549183, + "grad_norm": 0.4686268518570557, + "learning_rate": 9.757892769206517e-07, + "loss": 0.5441, + "step": 16043 + }, + { + "epoch": 4.2591265100225675, + "grad_norm": 0.4880967754837364, + "learning_rate": 9.755125575127575e-07, + "loss": 0.5569, + "step": 16044 + }, + { + "epoch": 4.259392008495951, + "grad_norm": 0.48539376577705884, + "learning_rate": 9.752358678361015e-07, + "loss": 0.5407, + "step": 16045 + }, + { + "epoch": 4.259657506969335, + "grad_norm": 0.48876764730175115, + "learning_rate": 9.74959207896079e-07, + "loss": 0.5375, + "step": 16046 + }, + { + "epoch": 4.259923005442719, + "grad_norm": 0.48391938008695684, + "learning_rate": 9.746825776980864e-07, + "loss": 0.5329, + "step": 16047 + }, + { + "epoch": 4.260188503916103, + "grad_norm": 0.4970687844112541, + "learning_rate": 9.744059772475182e-07, + "loss": 0.5384, + "step": 16048 + }, + { + "epoch": 4.260454002389486, + "grad_norm": 0.4779618739741265, + "learning_rate": 9.74129406549769e-07, + "loss": 0.5572, + "step": 16049 + }, + { + "epoch": 4.26071950086287, + "grad_norm": 0.48276551268483087, + "learning_rate": 9.73852865610231e-07, + "loss": 0.5465, + "step": 16050 + }, + { + "epoch": 4.260984999336253, + "grad_norm": 0.46703669058627545, + "learning_rate": 9.735763544343e-07, + "loss": 0.514, + "step": 16051 + }, + { + "epoch": 4.2612504978096375, + "grad_norm": 0.488214021625802, + "learning_rate": 9.732998730273665e-07, + "loss": 0.5528, + "step": 16052 + }, + { + "epoch": 4.261515996283022, + "grad_norm": 0.4751823210750789, + "learning_rate": 9.730234213948245e-07, + "loss": 0.5636, + "step": 16053 + }, + { + "epoch": 4.261781494756405, + "grad_norm": 0.4834320507378837, + "learning_rate": 9.727469995420642e-07, + "loss": 0.5209, + "step": 16054 + }, + { + "epoch": 4.262046993229789, + "grad_norm": 0.48519488910479647, + "learning_rate": 9.72470607474476e-07, + "loss": 0.5236, + "step": 16055 + }, + { + "epoch": 4.262312491703173, + "grad_norm": 0.48297816464771204, + "learning_rate": 9.721942451974516e-07, + "loss": 0.5618, + "step": 16056 + }, + { + "epoch": 4.262577990176556, + "grad_norm": 0.48494023212641846, + "learning_rate": 9.719179127163797e-07, + "loss": 0.5463, + "step": 16057 + }, + { + "epoch": 4.26284348864994, + "grad_norm": 0.4561690057664008, + "learning_rate": 9.716416100366504e-07, + "loss": 0.4978, + "step": 16058 + }, + { + "epoch": 4.263108987123324, + "grad_norm": 0.478148824142442, + "learning_rate": 9.713653371636521e-07, + "loss": 0.5297, + "step": 16059 + }, + { + "epoch": 4.2633744855967075, + "grad_norm": 0.493625423472556, + "learning_rate": 9.710890941027724e-07, + "loss": 0.564, + "step": 16060 + }, + { + "epoch": 4.263639984070092, + "grad_norm": 0.4789588176883131, + "learning_rate": 9.708128808593977e-07, + "loss": 0.55, + "step": 16061 + }, + { + "epoch": 4.263905482543476, + "grad_norm": 0.4743936301301481, + "learning_rate": 9.70536697438917e-07, + "loss": 0.5339, + "step": 16062 + }, + { + "epoch": 4.264170981016859, + "grad_norm": 0.4759998978829584, + "learning_rate": 9.702605438467144e-07, + "loss": 0.5452, + "step": 16063 + }, + { + "epoch": 4.264436479490243, + "grad_norm": 0.4792951823034538, + "learning_rate": 9.69984420088177e-07, + "loss": 0.5686, + "step": 16064 + }, + { + "epoch": 4.264701977963627, + "grad_norm": 0.48056652033675523, + "learning_rate": 9.69708326168691e-07, + "loss": 0.5648, + "step": 16065 + }, + { + "epoch": 4.26496747643701, + "grad_norm": 0.46039414951950647, + "learning_rate": 9.694322620936377e-07, + "loss": 0.5079, + "step": 16066 + }, + { + "epoch": 4.265232974910394, + "grad_norm": 0.4840736501554974, + "learning_rate": 9.691562278684035e-07, + "loss": 0.5642, + "step": 16067 + }, + { + "epoch": 4.265498473383778, + "grad_norm": 0.4819643709749258, + "learning_rate": 9.688802234983706e-07, + "loss": 0.5777, + "step": 16068 + }, + { + "epoch": 4.265763971857162, + "grad_norm": 0.4860825727021681, + "learning_rate": 9.686042489889225e-07, + "loss": 0.5386, + "step": 16069 + }, + { + "epoch": 4.266029470330546, + "grad_norm": 0.4744512794430315, + "learning_rate": 9.683283043454407e-07, + "loss": 0.5569, + "step": 16070 + }, + { + "epoch": 4.26629496880393, + "grad_norm": 0.4885789344864933, + "learning_rate": 9.680523895733078e-07, + "loss": 0.546, + "step": 16071 + }, + { + "epoch": 4.266560467277313, + "grad_norm": 0.48135840635621785, + "learning_rate": 9.67776504677904e-07, + "loss": 0.5148, + "step": 16072 + }, + { + "epoch": 4.266825965750697, + "grad_norm": 0.4920760143132962, + "learning_rate": 9.67500649664609e-07, + "loss": 0.5483, + "step": 16073 + }, + { + "epoch": 4.26709146422408, + "grad_norm": 0.48226587383837854, + "learning_rate": 9.672248245388043e-07, + "loss": 0.5712, + "step": 16074 + }, + { + "epoch": 4.267356962697464, + "grad_norm": 0.48995922230162264, + "learning_rate": 9.669490293058681e-07, + "loss": 0.538, + "step": 16075 + }, + { + "epoch": 4.267622461170848, + "grad_norm": 0.5042151762882503, + "learning_rate": 9.666732639711808e-07, + "loss": 0.5459, + "step": 16076 + }, + { + "epoch": 4.2678879596442325, + "grad_norm": 0.4655224328954768, + "learning_rate": 9.663975285401172e-07, + "loss": 0.5518, + "step": 16077 + }, + { + "epoch": 4.268153458117616, + "grad_norm": 0.4664643310146895, + "learning_rate": 9.661218230180577e-07, + "loss": 0.5099, + "step": 16078 + }, + { + "epoch": 4.268418956591, + "grad_norm": 0.4913369467815235, + "learning_rate": 9.658461474103772e-07, + "loss": 0.5336, + "step": 16079 + }, + { + "epoch": 4.268684455064383, + "grad_norm": 0.4894777850480446, + "learning_rate": 9.655705017224543e-07, + "loss": 0.5361, + "step": 16080 + }, + { + "epoch": 4.268949953537767, + "grad_norm": 0.4619893316056146, + "learning_rate": 9.652948859596626e-07, + "loss": 0.5077, + "step": 16081 + }, + { + "epoch": 4.269215452011151, + "grad_norm": 0.4850913789551262, + "learning_rate": 9.65019300127379e-07, + "loss": 0.5317, + "step": 16082 + }, + { + "epoch": 4.269480950484534, + "grad_norm": 0.4764435888125086, + "learning_rate": 9.647437442309771e-07, + "loss": 0.5237, + "step": 16083 + }, + { + "epoch": 4.269746448957918, + "grad_norm": 0.48223306966055324, + "learning_rate": 9.644682182758305e-07, + "loss": 0.5139, + "step": 16084 + }, + { + "epoch": 4.2700119474313025, + "grad_norm": 0.473251847050034, + "learning_rate": 9.641927222673142e-07, + "loss": 0.5125, + "step": 16085 + }, + { + "epoch": 4.270277445904686, + "grad_norm": 0.46133355324013003, + "learning_rate": 9.63917256210799e-07, + "loss": 0.5168, + "step": 16086 + }, + { + "epoch": 4.27054294437807, + "grad_norm": 0.47156208699791874, + "learning_rate": 9.636418201116595e-07, + "loss": 0.4969, + "step": 16087 + }, + { + "epoch": 4.270808442851454, + "grad_norm": 0.4847854205290721, + "learning_rate": 9.63366413975266e-07, + "loss": 0.5243, + "step": 16088 + }, + { + "epoch": 4.271073941324837, + "grad_norm": 0.47570561452831767, + "learning_rate": 9.630910378069888e-07, + "loss": 0.5529, + "step": 16089 + }, + { + "epoch": 4.271339439798221, + "grad_norm": 0.4757927710267138, + "learning_rate": 9.628156916122002e-07, + "loss": 0.5448, + "step": 16090 + }, + { + "epoch": 4.271604938271605, + "grad_norm": 0.4858879579251946, + "learning_rate": 9.625403753962686e-07, + "loss": 0.545, + "step": 16091 + }, + { + "epoch": 4.2718704367449885, + "grad_norm": 0.4777561412323649, + "learning_rate": 9.62265089164565e-07, + "loss": 0.542, + "step": 16092 + }, + { + "epoch": 4.2721359352183725, + "grad_norm": 0.47979147112681003, + "learning_rate": 9.619898329224562e-07, + "loss": 0.5824, + "step": 16093 + }, + { + "epoch": 4.272401433691757, + "grad_norm": 0.48484741406015935, + "learning_rate": 9.61714606675313e-07, + "loss": 0.5017, + "step": 16094 + }, + { + "epoch": 4.27266693216514, + "grad_norm": 0.4818843720111582, + "learning_rate": 9.614394104284996e-07, + "loss": 0.5388, + "step": 16095 + }, + { + "epoch": 4.272932430638524, + "grad_norm": 0.4742567143723967, + "learning_rate": 9.611642441873856e-07, + "loss": 0.573, + "step": 16096 + }, + { + "epoch": 4.273197929111908, + "grad_norm": 0.4929283846730227, + "learning_rate": 9.608891079573357e-07, + "loss": 0.5519, + "step": 16097 + }, + { + "epoch": 4.273463427585291, + "grad_norm": 0.4824255007576174, + "learning_rate": 9.606140017437176e-07, + "loss": 0.5496, + "step": 16098 + }, + { + "epoch": 4.273728926058675, + "grad_norm": 0.4706925454237134, + "learning_rate": 9.603389255518947e-07, + "loss": 0.5233, + "step": 16099 + }, + { + "epoch": 4.273994424532059, + "grad_norm": 0.4788694614300321, + "learning_rate": 9.60063879387233e-07, + "loss": 0.5714, + "step": 16100 + }, + { + "epoch": 4.2742599230054426, + "grad_norm": 0.46395638356149377, + "learning_rate": 9.597888632550963e-07, + "loss": 0.5447, + "step": 16101 + }, + { + "epoch": 4.274525421478827, + "grad_norm": 0.4649147910818229, + "learning_rate": 9.59513877160847e-07, + "loss": 0.4857, + "step": 16102 + }, + { + "epoch": 4.27479091995221, + "grad_norm": 0.4752046417424867, + "learning_rate": 9.5923892110985e-07, + "loss": 0.5016, + "step": 16103 + }, + { + "epoch": 4.275056418425594, + "grad_norm": 0.4816590262517745, + "learning_rate": 9.58963995107465e-07, + "loss": 0.5823, + "step": 16104 + }, + { + "epoch": 4.275321916898978, + "grad_norm": 0.48660476014912474, + "learning_rate": 9.586890991590575e-07, + "loss": 0.5667, + "step": 16105 + }, + { + "epoch": 4.275587415372361, + "grad_norm": 0.47604121262834925, + "learning_rate": 9.584142332699842e-07, + "loss": 0.4962, + "step": 16106 + }, + { + "epoch": 4.275852913845745, + "grad_norm": 0.4849036248325664, + "learning_rate": 9.581393974456089e-07, + "loss": 0.5424, + "step": 16107 + }, + { + "epoch": 4.276118412319129, + "grad_norm": 0.4847917749237604, + "learning_rate": 9.578645916912896e-07, + "loss": 0.5605, + "step": 16108 + }, + { + "epoch": 4.276383910792513, + "grad_norm": 0.4755094529760263, + "learning_rate": 9.575898160123873e-07, + "loss": 0.5151, + "step": 16109 + }, + { + "epoch": 4.276649409265897, + "grad_norm": 0.47892046251268966, + "learning_rate": 9.573150704142592e-07, + "loss": 0.54, + "step": 16110 + }, + { + "epoch": 4.276914907739281, + "grad_norm": 0.4797209134977321, + "learning_rate": 9.570403549022652e-07, + "loss": 0.5764, + "step": 16111 + }, + { + "epoch": 4.277180406212664, + "grad_norm": 0.4623075018790371, + "learning_rate": 9.567656694817622e-07, + "loss": 0.5255, + "step": 16112 + }, + { + "epoch": 4.277445904686048, + "grad_norm": 0.48044970077507604, + "learning_rate": 9.564910141581062e-07, + "loss": 0.5571, + "step": 16113 + }, + { + "epoch": 4.277711403159432, + "grad_norm": 0.4665407751577506, + "learning_rate": 9.562163889366558e-07, + "loss": 0.5368, + "step": 16114 + }, + { + "epoch": 4.277976901632815, + "grad_norm": 0.4670286381747807, + "learning_rate": 9.559417938227645e-07, + "loss": 0.4958, + "step": 16115 + }, + { + "epoch": 4.278242400106199, + "grad_norm": 0.4961140339950053, + "learning_rate": 9.556672288217894e-07, + "loss": 0.5506, + "step": 16116 + }, + { + "epoch": 4.2785078985795835, + "grad_norm": 0.4703959930478149, + "learning_rate": 9.553926939390847e-07, + "loss": 0.546, + "step": 16117 + }, + { + "epoch": 4.278773397052967, + "grad_norm": 0.47164671502162875, + "learning_rate": 9.551181891800034e-07, + "loss": 0.5652, + "step": 16118 + }, + { + "epoch": 4.279038895526351, + "grad_norm": 0.4759257335530126, + "learning_rate": 9.54843714549901e-07, + "loss": 0.5451, + "step": 16119 + }, + { + "epoch": 4.279304393999735, + "grad_norm": 0.4942306060661886, + "learning_rate": 9.545692700541284e-07, + "loss": 0.558, + "step": 16120 + }, + { + "epoch": 4.279569892473118, + "grad_norm": 0.48477733508936244, + "learning_rate": 9.542948556980396e-07, + "loss": 0.5201, + "step": 16121 + }, + { + "epoch": 4.279835390946502, + "grad_norm": 0.46041307978856433, + "learning_rate": 9.54020471486985e-07, + "loss": 0.463, + "step": 16122 + }, + { + "epoch": 4.280100889419886, + "grad_norm": 0.4813174865330322, + "learning_rate": 9.53746117426318e-07, + "loss": 0.5663, + "step": 16123 + }, + { + "epoch": 4.280366387893269, + "grad_norm": 0.4877195222172295, + "learning_rate": 9.534717935213861e-07, + "loss": 0.5312, + "step": 16124 + }, + { + "epoch": 4.2806318863666535, + "grad_norm": 0.48403467329840777, + "learning_rate": 9.531974997775415e-07, + "loss": 0.5543, + "step": 16125 + }, + { + "epoch": 4.280897384840038, + "grad_norm": 0.4683525674011287, + "learning_rate": 9.529232362001317e-07, + "loss": 0.5512, + "step": 16126 + }, + { + "epoch": 4.281162883313421, + "grad_norm": 0.4998225066379177, + "learning_rate": 9.526490027945079e-07, + "loss": 0.5535, + "step": 16127 + }, + { + "epoch": 4.281428381786805, + "grad_norm": 0.47963758682379953, + "learning_rate": 9.523747995660171e-07, + "loss": 0.512, + "step": 16128 + }, + { + "epoch": 4.281693880260189, + "grad_norm": 0.4881294811680851, + "learning_rate": 9.521006265200061e-07, + "loss": 0.5617, + "step": 16129 + }, + { + "epoch": 4.281959378733572, + "grad_norm": 0.45100918106953597, + "learning_rate": 9.518264836618235e-07, + "loss": 0.503, + "step": 16130 + }, + { + "epoch": 4.282224877206956, + "grad_norm": 0.5006723882399844, + "learning_rate": 9.515523709968145e-07, + "loss": 0.5348, + "step": 16131 + }, + { + "epoch": 4.282490375680339, + "grad_norm": 0.5146927350541792, + "learning_rate": 9.51278288530326e-07, + "loss": 0.5141, + "step": 16132 + }, + { + "epoch": 4.2827558741537235, + "grad_norm": 0.48732172393054124, + "learning_rate": 9.510042362677022e-07, + "loss": 0.5527, + "step": 16133 + }, + { + "epoch": 4.283021372627108, + "grad_norm": 0.46928567272209737, + "learning_rate": 9.507302142142899e-07, + "loss": 0.5466, + "step": 16134 + }, + { + "epoch": 4.283286871100491, + "grad_norm": 0.4717044534597236, + "learning_rate": 9.504562223754299e-07, + "loss": 0.5338, + "step": 16135 + }, + { + "epoch": 4.283552369573875, + "grad_norm": 0.47831350543841095, + "learning_rate": 9.501822607564678e-07, + "loss": 0.5575, + "step": 16136 + }, + { + "epoch": 4.283817868047259, + "grad_norm": 0.47793444674497115, + "learning_rate": 9.49908329362747e-07, + "loss": 0.547, + "step": 16137 + }, + { + "epoch": 4.284083366520642, + "grad_norm": 0.47907698575589613, + "learning_rate": 9.496344281996081e-07, + "loss": 0.549, + "step": 16138 + }, + { + "epoch": 4.284348864994026, + "grad_norm": 0.47739258316581606, + "learning_rate": 9.493605572723946e-07, + "loss": 0.5353, + "step": 16139 + }, + { + "epoch": 4.28461436346741, + "grad_norm": 0.48723796176825107, + "learning_rate": 9.49086716586447e-07, + "loss": 0.5607, + "step": 16140 + }, + { + "epoch": 4.2848798619407935, + "grad_norm": 0.4769773182167794, + "learning_rate": 9.488129061471057e-07, + "loss": 0.5003, + "step": 16141 + }, + { + "epoch": 4.285145360414178, + "grad_norm": 0.4747009446944575, + "learning_rate": 9.485391259597099e-07, + "loss": 0.5131, + "step": 16142 + }, + { + "epoch": 4.285410858887562, + "grad_norm": 0.47806353165227333, + "learning_rate": 9.482653760296007e-07, + "loss": 0.5058, + "step": 16143 + }, + { + "epoch": 4.285676357360945, + "grad_norm": 0.4798065459503907, + "learning_rate": 9.479916563621147e-07, + "loss": 0.5438, + "step": 16144 + }, + { + "epoch": 4.285941855834329, + "grad_norm": 0.4780843780240689, + "learning_rate": 9.477179669625927e-07, + "loss": 0.5478, + "step": 16145 + }, + { + "epoch": 4.286207354307713, + "grad_norm": 0.46730210238748343, + "learning_rate": 9.474443078363707e-07, + "loss": 0.549, + "step": 16146 + }, + { + "epoch": 4.286472852781096, + "grad_norm": 0.47109015247716934, + "learning_rate": 9.471706789887852e-07, + "loss": 0.5329, + "step": 16147 + }, + { + "epoch": 4.28673835125448, + "grad_norm": 0.49357752495817003, + "learning_rate": 9.468970804251742e-07, + "loss": 0.5431, + "step": 16148 + }, + { + "epoch": 4.287003849727864, + "grad_norm": 0.4739079515181912, + "learning_rate": 9.466235121508722e-07, + "loss": 0.5608, + "step": 16149 + }, + { + "epoch": 4.287269348201248, + "grad_norm": 0.4720426246815597, + "learning_rate": 9.463499741712157e-07, + "loss": 0.5396, + "step": 16150 + }, + { + "epoch": 4.287534846674632, + "grad_norm": 0.49223986858574126, + "learning_rate": 9.460764664915386e-07, + "loss": 0.5415, + "step": 16151 + }, + { + "epoch": 4.287800345148016, + "grad_norm": 0.4945450951568442, + "learning_rate": 9.458029891171755e-07, + "loss": 0.5519, + "step": 16152 + }, + { + "epoch": 4.288065843621399, + "grad_norm": 0.49565694318361314, + "learning_rate": 9.455295420534583e-07, + "loss": 0.531, + "step": 16153 + }, + { + "epoch": 4.288331342094783, + "grad_norm": 0.48152222448362336, + "learning_rate": 9.452561253057219e-07, + "loss": 0.5617, + "step": 16154 + }, + { + "epoch": 4.288596840568167, + "grad_norm": 0.4857186548844074, + "learning_rate": 9.449827388792968e-07, + "loss": 0.5314, + "step": 16155 + }, + { + "epoch": 4.28886233904155, + "grad_norm": 0.49285837061140425, + "learning_rate": 9.447093827795156e-07, + "loss": 0.5426, + "step": 16156 + }, + { + "epoch": 4.289127837514934, + "grad_norm": 0.478490371523138, + "learning_rate": 9.444360570117111e-07, + "loss": 0.5193, + "step": 16157 + }, + { + "epoch": 4.2893933359883185, + "grad_norm": 0.4802187041363442, + "learning_rate": 9.441627615812107e-07, + "loss": 0.5642, + "step": 16158 + }, + { + "epoch": 4.289658834461702, + "grad_norm": 0.4769516921480737, + "learning_rate": 9.438894964933465e-07, + "loss": 0.5493, + "step": 16159 + }, + { + "epoch": 4.289924332935086, + "grad_norm": 0.4771215505453742, + "learning_rate": 9.436162617534464e-07, + "loss": 0.5269, + "step": 16160 + }, + { + "epoch": 4.290189831408469, + "grad_norm": 0.49507969056374723, + "learning_rate": 9.433430573668406e-07, + "loss": 0.5169, + "step": 16161 + }, + { + "epoch": 4.290455329881853, + "grad_norm": 0.48236974891098067, + "learning_rate": 9.430698833388558e-07, + "loss": 0.5811, + "step": 16162 + }, + { + "epoch": 4.290720828355237, + "grad_norm": 0.4782656939701205, + "learning_rate": 9.427967396748211e-07, + "loss": 0.5435, + "step": 16163 + }, + { + "epoch": 4.29098632682862, + "grad_norm": 0.47722168296213874, + "learning_rate": 9.425236263800624e-07, + "loss": 0.5508, + "step": 16164 + }, + { + "epoch": 4.291251825302004, + "grad_norm": 0.48304854026727073, + "learning_rate": 9.422505434599058e-07, + "loss": 0.5704, + "step": 16165 + }, + { + "epoch": 4.2915173237753885, + "grad_norm": 0.4732693905455067, + "learning_rate": 9.419774909196785e-07, + "loss": 0.5453, + "step": 16166 + }, + { + "epoch": 4.291782822248772, + "grad_norm": 0.47712885680913475, + "learning_rate": 9.417044687647039e-07, + "loss": 0.551, + "step": 16167 + }, + { + "epoch": 4.292048320722156, + "grad_norm": 0.4727841774069548, + "learning_rate": 9.414314770003083e-07, + "loss": 0.512, + "step": 16168 + }, + { + "epoch": 4.29231381919554, + "grad_norm": 0.4700497144419276, + "learning_rate": 9.411585156318151e-07, + "loss": 0.5449, + "step": 16169 + }, + { + "epoch": 4.292579317668923, + "grad_norm": 0.48989582100232487, + "learning_rate": 9.408855846645476e-07, + "loss": 0.5438, + "step": 16170 + }, + { + "epoch": 4.292844816142307, + "grad_norm": 0.4722147662197645, + "learning_rate": 9.406126841038274e-07, + "loss": 0.5433, + "step": 16171 + }, + { + "epoch": 4.293110314615691, + "grad_norm": 0.4896317156584069, + "learning_rate": 9.403398139549791e-07, + "loss": 0.5575, + "step": 16172 + }, + { + "epoch": 4.2933758130890745, + "grad_norm": 0.482051978527843, + "learning_rate": 9.400669742233221e-07, + "loss": 0.5202, + "step": 16173 + }, + { + "epoch": 4.2936413115624585, + "grad_norm": 0.4802108574272704, + "learning_rate": 9.397941649141793e-07, + "loss": 0.537, + "step": 16174 + }, + { + "epoch": 4.293906810035843, + "grad_norm": 0.48280442652870736, + "learning_rate": 9.395213860328703e-07, + "loss": 0.5154, + "step": 16175 + }, + { + "epoch": 4.294172308509226, + "grad_norm": 0.47299456323769246, + "learning_rate": 9.39248637584714e-07, + "loss": 0.5286, + "step": 16176 + }, + { + "epoch": 4.29443780698261, + "grad_norm": 0.48648646953501956, + "learning_rate": 9.389759195750317e-07, + "loss": 0.5006, + "step": 16177 + }, + { + "epoch": 4.294703305455994, + "grad_norm": 0.4833304883808922, + "learning_rate": 9.387032320091402e-07, + "loss": 0.5376, + "step": 16178 + }, + { + "epoch": 4.294968803929377, + "grad_norm": 0.483472522186496, + "learning_rate": 9.384305748923592e-07, + "loss": 0.5081, + "step": 16179 + }, + { + "epoch": 4.295234302402761, + "grad_norm": 0.453136929126105, + "learning_rate": 9.381579482300054e-07, + "loss": 0.5205, + "step": 16180 + }, + { + "epoch": 4.295499800876145, + "grad_norm": 0.49560667540920283, + "learning_rate": 9.378853520273948e-07, + "loss": 0.5323, + "step": 16181 + }, + { + "epoch": 4.2957652993495286, + "grad_norm": 0.49499776283313485, + "learning_rate": 9.376127862898454e-07, + "loss": 0.5726, + "step": 16182 + }, + { + "epoch": 4.296030797822913, + "grad_norm": 0.47334489340833974, + "learning_rate": 9.373402510226712e-07, + "loss": 0.5101, + "step": 16183 + }, + { + "epoch": 4.296296296296296, + "grad_norm": 0.47208004492161953, + "learning_rate": 9.37067746231189e-07, + "loss": 0.5227, + "step": 16184 + }, + { + "epoch": 4.29656179476968, + "grad_norm": 0.48304629466859617, + "learning_rate": 9.367952719207115e-07, + "loss": 0.5678, + "step": 16185 + }, + { + "epoch": 4.296827293243064, + "grad_norm": 0.48358985269045435, + "learning_rate": 9.365228280965555e-07, + "loss": 0.5831, + "step": 16186 + }, + { + "epoch": 4.297092791716448, + "grad_norm": 0.47454324764997263, + "learning_rate": 9.362504147640305e-07, + "loss": 0.5726, + "step": 16187 + }, + { + "epoch": 4.297358290189831, + "grad_norm": 0.4873600374620913, + "learning_rate": 9.35978031928452e-07, + "loss": 0.5416, + "step": 16188 + }, + { + "epoch": 4.297623788663215, + "grad_norm": 0.4880037734697343, + "learning_rate": 9.357056795951305e-07, + "loss": 0.5467, + "step": 16189 + }, + { + "epoch": 4.297889287136599, + "grad_norm": 0.4849809209648875, + "learning_rate": 9.354333577693791e-07, + "loss": 0.5482, + "step": 16190 + }, + { + "epoch": 4.298154785609983, + "grad_norm": 0.4676136087894692, + "learning_rate": 9.351610664565072e-07, + "loss": 0.5404, + "step": 16191 + }, + { + "epoch": 4.298420284083367, + "grad_norm": 0.47110654767898347, + "learning_rate": 9.348888056618266e-07, + "loss": 0.5283, + "step": 16192 + }, + { + "epoch": 4.29868578255675, + "grad_norm": 0.48995997897553945, + "learning_rate": 9.346165753906464e-07, + "loss": 0.5952, + "step": 16193 + }, + { + "epoch": 4.298951281030134, + "grad_norm": 0.48660949210062937, + "learning_rate": 9.343443756482748e-07, + "loss": 0.555, + "step": 16194 + }, + { + "epoch": 4.299216779503518, + "grad_norm": 0.4963571687802391, + "learning_rate": 9.340722064400223e-07, + "loss": 0.5525, + "step": 16195 + }, + { + "epoch": 4.299482277976901, + "grad_norm": 0.487496856989421, + "learning_rate": 9.338000677711948e-07, + "loss": 0.5742, + "step": 16196 + }, + { + "epoch": 4.299747776450285, + "grad_norm": 0.4953732818832387, + "learning_rate": 9.335279596471025e-07, + "loss": 0.5519, + "step": 16197 + }, + { + "epoch": 4.3000132749236695, + "grad_norm": 0.4743435817553822, + "learning_rate": 9.332558820730486e-07, + "loss": 0.5233, + "step": 16198 + }, + { + "epoch": 4.300278773397053, + "grad_norm": 0.4698061009828211, + "learning_rate": 9.329838350543418e-07, + "loss": 0.5097, + "step": 16199 + }, + { + "epoch": 4.300544271870437, + "grad_norm": 0.47745370230663964, + "learning_rate": 9.327118185962864e-07, + "loss": 0.5574, + "step": 16200 + }, + { + "epoch": 4.300809770343821, + "grad_norm": 0.47717845958316396, + "learning_rate": 9.324398327041875e-07, + "loss": 0.5233, + "step": 16201 + }, + { + "epoch": 4.301075268817204, + "grad_norm": 0.49697791197964475, + "learning_rate": 9.321678773833509e-07, + "loss": 0.5452, + "step": 16202 + }, + { + "epoch": 4.301340767290588, + "grad_norm": 0.48272440510497366, + "learning_rate": 9.318959526390797e-07, + "loss": 0.5468, + "step": 16203 + }, + { + "epoch": 4.301606265763972, + "grad_norm": 0.46774505865339017, + "learning_rate": 9.316240584766764e-07, + "loss": 0.4921, + "step": 16204 + }, + { + "epoch": 4.301871764237355, + "grad_norm": 0.4893266242632504, + "learning_rate": 9.313521949014434e-07, + "loss": 0.5566, + "step": 16205 + }, + { + "epoch": 4.3021372627107395, + "grad_norm": 0.46944538895016286, + "learning_rate": 9.310803619186842e-07, + "loss": 0.5561, + "step": 16206 + }, + { + "epoch": 4.302402761184124, + "grad_norm": 0.46029536551563216, + "learning_rate": 9.308085595336983e-07, + "loss": 0.5186, + "step": 16207 + }, + { + "epoch": 4.302668259657507, + "grad_norm": 0.45728366771854756, + "learning_rate": 9.305367877517885e-07, + "loss": 0.5004, + "step": 16208 + }, + { + "epoch": 4.302933758130891, + "grad_norm": 0.4701312054524912, + "learning_rate": 9.30265046578254e-07, + "loss": 0.5281, + "step": 16209 + }, + { + "epoch": 4.303199256604275, + "grad_norm": 0.4716828165770199, + "learning_rate": 9.299933360183935e-07, + "loss": 0.514, + "step": 16210 + }, + { + "epoch": 4.303464755077658, + "grad_norm": 0.4733186995820755, + "learning_rate": 9.297216560775079e-07, + "loss": 0.565, + "step": 16211 + }, + { + "epoch": 4.303730253551042, + "grad_norm": 0.4969800792379079, + "learning_rate": 9.294500067608941e-07, + "loss": 0.5261, + "step": 16212 + }, + { + "epoch": 4.303995752024425, + "grad_norm": 0.46699768906856187, + "learning_rate": 9.291783880738511e-07, + "loss": 0.5571, + "step": 16213 + }, + { + "epoch": 4.3042612504978095, + "grad_norm": 0.4708692890400055, + "learning_rate": 9.28906800021675e-07, + "loss": 0.5296, + "step": 16214 + }, + { + "epoch": 4.304526748971194, + "grad_norm": 0.4689040155195078, + "learning_rate": 9.286352426096645e-07, + "loss": 0.5469, + "step": 16215 + }, + { + "epoch": 4.304792247444577, + "grad_norm": 0.4672868363691084, + "learning_rate": 9.283637158431124e-07, + "loss": 0.5475, + "step": 16216 + }, + { + "epoch": 4.305057745917961, + "grad_norm": 0.491190810246371, + "learning_rate": 9.280922197273168e-07, + "loss": 0.5937, + "step": 16217 + }, + { + "epoch": 4.305323244391345, + "grad_norm": 0.51341845180117, + "learning_rate": 9.278207542675708e-07, + "loss": 0.5231, + "step": 16218 + }, + { + "epoch": 4.305588742864728, + "grad_norm": 0.48648144639881646, + "learning_rate": 9.2754931946917e-07, + "loss": 0.5327, + "step": 16219 + }, + { + "epoch": 4.305854241338112, + "grad_norm": 0.47554248197372156, + "learning_rate": 9.27277915337407e-07, + "loss": 0.549, + "step": 16220 + }, + { + "epoch": 4.306119739811496, + "grad_norm": 0.48650848049535017, + "learning_rate": 9.270065418775762e-07, + "loss": 0.5455, + "step": 16221 + }, + { + "epoch": 4.3063852382848795, + "grad_norm": 0.47652047841134165, + "learning_rate": 9.267351990949689e-07, + "loss": 0.5701, + "step": 16222 + }, + { + "epoch": 4.306650736758264, + "grad_norm": 0.4708329438607657, + "learning_rate": 9.264638869948766e-07, + "loss": 0.5039, + "step": 16223 + }, + { + "epoch": 4.306916235231648, + "grad_norm": 0.4885416853075303, + "learning_rate": 9.261926055825918e-07, + "loss": 0.5444, + "step": 16224 + }, + { + "epoch": 4.307181733705031, + "grad_norm": 0.47224592581410313, + "learning_rate": 9.259213548634038e-07, + "loss": 0.523, + "step": 16225 + }, + { + "epoch": 4.307447232178415, + "grad_norm": 0.4701605892166753, + "learning_rate": 9.256501348426045e-07, + "loss": 0.551, + "step": 16226 + }, + { + "epoch": 4.307712730651799, + "grad_norm": 0.4739326924814448, + "learning_rate": 9.253789455254819e-07, + "loss": 0.5143, + "step": 16227 + }, + { + "epoch": 4.307978229125182, + "grad_norm": 0.4660940473803378, + "learning_rate": 9.251077869173244e-07, + "loss": 0.5206, + "step": 16228 + }, + { + "epoch": 4.308243727598566, + "grad_norm": 0.4862391187333547, + "learning_rate": 9.248366590234223e-07, + "loss": 0.5449, + "step": 16229 + }, + { + "epoch": 4.30850922607195, + "grad_norm": 0.4728453801821844, + "learning_rate": 9.245655618490607e-07, + "loss": 0.5705, + "step": 16230 + }, + { + "epoch": 4.308774724545334, + "grad_norm": 0.4777446418873602, + "learning_rate": 9.24294495399529e-07, + "loss": 0.5578, + "step": 16231 + }, + { + "epoch": 4.309040223018718, + "grad_norm": 0.47592370135748197, + "learning_rate": 9.240234596801125e-07, + "loss": 0.5007, + "step": 16232 + }, + { + "epoch": 4.309305721492102, + "grad_norm": 0.4849336033241753, + "learning_rate": 9.237524546960975e-07, + "loss": 0.5817, + "step": 16233 + }, + { + "epoch": 4.309571219965485, + "grad_norm": 0.4841216031843534, + "learning_rate": 9.234814804527678e-07, + "loss": 0.5858, + "step": 16234 + }, + { + "epoch": 4.309836718438869, + "grad_norm": 0.47981510459941557, + "learning_rate": 9.232105369554101e-07, + "loss": 0.5123, + "step": 16235 + }, + { + "epoch": 4.310102216912253, + "grad_norm": 0.4827896995813429, + "learning_rate": 9.22939624209307e-07, + "loss": 0.5339, + "step": 16236 + }, + { + "epoch": 4.310367715385636, + "grad_norm": 0.4801186329778531, + "learning_rate": 9.226687422197431e-07, + "loss": 0.5608, + "step": 16237 + }, + { + "epoch": 4.31063321385902, + "grad_norm": 0.4572255553183137, + "learning_rate": 9.223978909920009e-07, + "loss": 0.5, + "step": 16238 + }, + { + "epoch": 4.3108987123324045, + "grad_norm": 0.474467992914293, + "learning_rate": 9.221270705313614e-07, + "loss": 0.5243, + "step": 16239 + }, + { + "epoch": 4.311164210805788, + "grad_norm": 0.4635335030024976, + "learning_rate": 9.218562808431084e-07, + "loss": 0.5031, + "step": 16240 + }, + { + "epoch": 4.311429709279172, + "grad_norm": 0.4819448188718896, + "learning_rate": 9.215855219325207e-07, + "loss": 0.552, + "step": 16241 + }, + { + "epoch": 4.311695207752555, + "grad_norm": 0.4811299703677316, + "learning_rate": 9.21314793804881e-07, + "loss": 0.5628, + "step": 16242 + }, + { + "epoch": 4.311960706225939, + "grad_norm": 0.4835694712294409, + "learning_rate": 9.210440964654674e-07, + "loss": 0.5476, + "step": 16243 + }, + { + "epoch": 4.312226204699323, + "grad_norm": 0.4815989460394889, + "learning_rate": 9.207734299195614e-07, + "loss": 0.5203, + "step": 16244 + }, + { + "epoch": 4.312491703172706, + "grad_norm": 0.48566166185474274, + "learning_rate": 9.205027941724384e-07, + "loss": 0.544, + "step": 16245 + }, + { + "epoch": 4.31275720164609, + "grad_norm": 0.4665034270706131, + "learning_rate": 9.202321892293791e-07, + "loss": 0.5468, + "step": 16246 + }, + { + "epoch": 4.3130227001194745, + "grad_norm": 0.46694843431482913, + "learning_rate": 9.199616150956592e-07, + "loss": 0.5468, + "step": 16247 + }, + { + "epoch": 4.313288198592858, + "grad_norm": 0.45966111206574933, + "learning_rate": 9.196910717765564e-07, + "loss": 0.5253, + "step": 16248 + }, + { + "epoch": 4.313553697066242, + "grad_norm": 0.4783669474696989, + "learning_rate": 9.194205592773486e-07, + "loss": 0.5397, + "step": 16249 + }, + { + "epoch": 4.313819195539626, + "grad_norm": 0.46669517610391437, + "learning_rate": 9.191500776033083e-07, + "loss": 0.5003, + "step": 16250 + }, + { + "epoch": 4.314084694013009, + "grad_norm": 0.4700783277246493, + "learning_rate": 9.188796267597127e-07, + "loss": 0.5443, + "step": 16251 + }, + { + "epoch": 4.314350192486393, + "grad_norm": 0.4793370698317174, + "learning_rate": 9.186092067518348e-07, + "loss": 0.541, + "step": 16252 + }, + { + "epoch": 4.314615690959777, + "grad_norm": 0.4904006860925553, + "learning_rate": 9.183388175849503e-07, + "loss": 0.5604, + "step": 16253 + }, + { + "epoch": 4.3148811894331605, + "grad_norm": 0.4570081260307148, + "learning_rate": 9.180684592643305e-07, + "loss": 0.5371, + "step": 16254 + }, + { + "epoch": 4.3151466879065445, + "grad_norm": 0.4742716717005486, + "learning_rate": 9.1779813179525e-07, + "loss": 0.5013, + "step": 16255 + }, + { + "epoch": 4.315412186379929, + "grad_norm": 0.4832400161382249, + "learning_rate": 9.175278351829794e-07, + "loss": 0.5232, + "step": 16256 + }, + { + "epoch": 4.315677684853312, + "grad_norm": 0.4629710767602226, + "learning_rate": 9.172575694327901e-07, + "loss": 0.5264, + "step": 16257 + }, + { + "epoch": 4.315943183326696, + "grad_norm": 0.4990161627033716, + "learning_rate": 9.169873345499541e-07, + "loss": 0.5386, + "step": 16258 + }, + { + "epoch": 4.31620868180008, + "grad_norm": 0.4807878482598025, + "learning_rate": 9.167171305397401e-07, + "loss": 0.5326, + "step": 16259 + }, + { + "epoch": 4.316474180273463, + "grad_norm": 0.47089063093857714, + "learning_rate": 9.164469574074198e-07, + "loss": 0.5493, + "step": 16260 + }, + { + "epoch": 4.316739678746847, + "grad_norm": 0.4674818559717405, + "learning_rate": 9.161768151582607e-07, + "loss": 0.5324, + "step": 16261 + }, + { + "epoch": 4.317005177220231, + "grad_norm": 0.4805347364712472, + "learning_rate": 9.159067037975317e-07, + "loss": 0.5444, + "step": 16262 + }, + { + "epoch": 4.317270675693615, + "grad_norm": 0.4842855701445545, + "learning_rate": 9.156366233304995e-07, + "loss": 0.5624, + "step": 16263 + }, + { + "epoch": 4.317536174166999, + "grad_norm": 0.47862276976868473, + "learning_rate": 9.153665737624331e-07, + "loss": 0.5361, + "step": 16264 + }, + { + "epoch": 4.317801672640383, + "grad_norm": 0.4684594369400062, + "learning_rate": 9.150965550985976e-07, + "loss": 0.5257, + "step": 16265 + }, + { + "epoch": 4.318067171113766, + "grad_norm": 0.48064983757426455, + "learning_rate": 9.148265673442608e-07, + "loss": 0.5327, + "step": 16266 + }, + { + "epoch": 4.31833266958715, + "grad_norm": 0.4741107566999684, + "learning_rate": 9.145566105046872e-07, + "loss": 0.5693, + "step": 16267 + }, + { + "epoch": 4.318598168060534, + "grad_norm": 0.472037677150413, + "learning_rate": 9.142866845851403e-07, + "loss": 0.5152, + "step": 16268 + }, + { + "epoch": 4.318863666533917, + "grad_norm": 0.4886084695344888, + "learning_rate": 9.140167895908867e-07, + "loss": 0.5755, + "step": 16269 + }, + { + "epoch": 4.319129165007301, + "grad_norm": 0.5135276331933102, + "learning_rate": 9.13746925527188e-07, + "loss": 0.5618, + "step": 16270 + }, + { + "epoch": 4.319394663480685, + "grad_norm": 0.47746310522230434, + "learning_rate": 9.134770923993088e-07, + "loss": 0.5891, + "step": 16271 + }, + { + "epoch": 4.319660161954069, + "grad_norm": 0.4814733504421007, + "learning_rate": 9.132072902125108e-07, + "loss": 0.5243, + "step": 16272 + }, + { + "epoch": 4.319925660427453, + "grad_norm": 0.4634371537090962, + "learning_rate": 9.129375189720549e-07, + "loss": 0.52, + "step": 16273 + }, + { + "epoch": 4.320191158900836, + "grad_norm": 0.47701958698664676, + "learning_rate": 9.126677786832041e-07, + "loss": 0.522, + "step": 16274 + }, + { + "epoch": 4.32045665737422, + "grad_norm": 0.47326203536861716, + "learning_rate": 9.123980693512172e-07, + "loss": 0.5742, + "step": 16275 + }, + { + "epoch": 4.320722155847604, + "grad_norm": 0.47469105458997823, + "learning_rate": 9.121283909813558e-07, + "loss": 0.5273, + "step": 16276 + }, + { + "epoch": 4.320987654320987, + "grad_norm": 0.4691269667741205, + "learning_rate": 9.118587435788778e-07, + "loss": 0.5454, + "step": 16277 + }, + { + "epoch": 4.321253152794371, + "grad_norm": 0.48678897507555063, + "learning_rate": 9.115891271490446e-07, + "loss": 0.5457, + "step": 16278 + }, + { + "epoch": 4.3215186512677555, + "grad_norm": 0.4774631618420226, + "learning_rate": 9.113195416971105e-07, + "loss": 0.5422, + "step": 16279 + }, + { + "epoch": 4.321784149741139, + "grad_norm": 0.48343449183843334, + "learning_rate": 9.110499872283363e-07, + "loss": 0.5725, + "step": 16280 + }, + { + "epoch": 4.322049648214523, + "grad_norm": 0.4828506292153186, + "learning_rate": 9.107804637479767e-07, + "loss": 0.5412, + "step": 16281 + }, + { + "epoch": 4.322315146687907, + "grad_norm": 0.4833603996511266, + "learning_rate": 9.1051097126129e-07, + "loss": 0.5636, + "step": 16282 + }, + { + "epoch": 4.32258064516129, + "grad_norm": 0.4876591786379884, + "learning_rate": 9.102415097735304e-07, + "loss": 0.5523, + "step": 16283 + }, + { + "epoch": 4.322846143634674, + "grad_norm": 0.485920593324433, + "learning_rate": 9.099720792899544e-07, + "loss": 0.5428, + "step": 16284 + }, + { + "epoch": 4.323111642108058, + "grad_norm": 0.46044418943972143, + "learning_rate": 9.097026798158159e-07, + "loss": 0.5554, + "step": 16285 + }, + { + "epoch": 4.323377140581441, + "grad_norm": 0.4759835195925352, + "learning_rate": 9.09433311356368e-07, + "loss": 0.511, + "step": 16286 + }, + { + "epoch": 4.3236426390548255, + "grad_norm": 0.48286241322937196, + "learning_rate": 9.091639739168661e-07, + "loss": 0.5516, + "step": 16287 + }, + { + "epoch": 4.32390813752821, + "grad_norm": 0.48296999689141584, + "learning_rate": 9.088946675025607e-07, + "loss": 0.537, + "step": 16288 + }, + { + "epoch": 4.324173636001593, + "grad_norm": 0.47129293045010023, + "learning_rate": 9.086253921187058e-07, + "loss": 0.5322, + "step": 16289 + }, + { + "epoch": 4.324439134474977, + "grad_norm": 0.45400931223834307, + "learning_rate": 9.083561477705521e-07, + "loss": 0.5276, + "step": 16290 + }, + { + "epoch": 4.324704632948361, + "grad_norm": 0.4717739181427923, + "learning_rate": 9.080869344633506e-07, + "loss": 0.4833, + "step": 16291 + }, + { + "epoch": 4.324970131421744, + "grad_norm": 0.48248620035767176, + "learning_rate": 9.078177522023504e-07, + "loss": 0.5309, + "step": 16292 + }, + { + "epoch": 4.325235629895128, + "grad_norm": 0.47006097161126564, + "learning_rate": 9.075486009928028e-07, + "loss": 0.5138, + "step": 16293 + }, + { + "epoch": 4.325501128368511, + "grad_norm": 0.46906622991409325, + "learning_rate": 9.072794808399571e-07, + "loss": 0.5577, + "step": 16294 + }, + { + "epoch": 4.3257666268418955, + "grad_norm": 0.46613551172219764, + "learning_rate": 9.07010391749061e-07, + "loss": 0.5274, + "step": 16295 + }, + { + "epoch": 4.32603212531528, + "grad_norm": 0.48324233240297965, + "learning_rate": 9.067413337253631e-07, + "loss": 0.5776, + "step": 16296 + }, + { + "epoch": 4.326297623788664, + "grad_norm": 0.46050292650406666, + "learning_rate": 9.064723067741089e-07, + "loss": 0.4998, + "step": 16297 + }, + { + "epoch": 4.326563122262047, + "grad_norm": 0.48131217816698757, + "learning_rate": 9.062033109005472e-07, + "loss": 0.5396, + "step": 16298 + }, + { + "epoch": 4.326828620735431, + "grad_norm": 0.48024506138220074, + "learning_rate": 9.059343461099227e-07, + "loss": 0.5089, + "step": 16299 + }, + { + "epoch": 4.327094119208814, + "grad_norm": 0.48212067758377564, + "learning_rate": 9.056654124074818e-07, + "loss": 0.5881, + "step": 16300 + }, + { + "epoch": 4.327359617682198, + "grad_norm": 0.4706352870017432, + "learning_rate": 9.053965097984694e-07, + "loss": 0.5136, + "step": 16301 + }, + { + "epoch": 4.327625116155582, + "grad_norm": 0.47980685297609904, + "learning_rate": 9.05127638288128e-07, + "loss": 0.5582, + "step": 16302 + }, + { + "epoch": 4.3278906146289655, + "grad_norm": 0.48576016251790705, + "learning_rate": 9.048587978817036e-07, + "loss": 0.5578, + "step": 16303 + }, + { + "epoch": 4.32815611310235, + "grad_norm": 0.48721438187293226, + "learning_rate": 9.045899885844376e-07, + "loss": 0.5272, + "step": 16304 + }, + { + "epoch": 4.328421611575734, + "grad_norm": 0.4775734136990346, + "learning_rate": 9.043212104015736e-07, + "loss": 0.5199, + "step": 16305 + }, + { + "epoch": 4.328687110049117, + "grad_norm": 0.4852569658872731, + "learning_rate": 9.040524633383524e-07, + "loss": 0.5407, + "step": 16306 + }, + { + "epoch": 4.328952608522501, + "grad_norm": 0.48198080052217207, + "learning_rate": 9.037837474000172e-07, + "loss": 0.5407, + "step": 16307 + }, + { + "epoch": 4.329218106995885, + "grad_norm": 0.48369817900686396, + "learning_rate": 9.035150625918054e-07, + "loss": 0.5316, + "step": 16308 + }, + { + "epoch": 4.329483605469268, + "grad_norm": 0.4828136312453083, + "learning_rate": 9.032464089189597e-07, + "loss": 0.559, + "step": 16309 + }, + { + "epoch": 4.329749103942652, + "grad_norm": 0.4793198651925156, + "learning_rate": 9.029777863867176e-07, + "loss": 0.5287, + "step": 16310 + }, + { + "epoch": 4.330014602416036, + "grad_norm": 0.4825451280497991, + "learning_rate": 9.027091950003197e-07, + "loss": 0.5421, + "step": 16311 + }, + { + "epoch": 4.33028010088942, + "grad_norm": 0.47649580950533116, + "learning_rate": 9.024406347650025e-07, + "loss": 0.5209, + "step": 16312 + }, + { + "epoch": 4.330545599362804, + "grad_norm": 0.46668818047879146, + "learning_rate": 9.021721056860056e-07, + "loss": 0.5625, + "step": 16313 + }, + { + "epoch": 4.330811097836188, + "grad_norm": 0.49407683489090953, + "learning_rate": 9.019036077685644e-07, + "loss": 0.5702, + "step": 16314 + }, + { + "epoch": 4.331076596309571, + "grad_norm": 0.463319271207252, + "learning_rate": 9.01635141017915e-07, + "loss": 0.5101, + "step": 16315 + }, + { + "epoch": 4.331342094782955, + "grad_norm": 0.47507833204575983, + "learning_rate": 9.013667054392944e-07, + "loss": 0.5342, + "step": 16316 + }, + { + "epoch": 4.331607593256339, + "grad_norm": 0.4820185338383231, + "learning_rate": 9.010983010379365e-07, + "loss": 0.5499, + "step": 16317 + }, + { + "epoch": 4.331873091729722, + "grad_norm": 0.47464099235441276, + "learning_rate": 9.008299278190774e-07, + "loss": 0.5247, + "step": 16318 + }, + { + "epoch": 4.332138590203106, + "grad_norm": 0.4818044907209706, + "learning_rate": 9.005615857879502e-07, + "loss": 0.5424, + "step": 16319 + }, + { + "epoch": 4.3324040886764905, + "grad_norm": 0.4751429660100645, + "learning_rate": 9.002932749497873e-07, + "loss": 0.5366, + "step": 16320 + }, + { + "epoch": 4.332669587149874, + "grad_norm": 0.49614938126701463, + "learning_rate": 9.00024995309823e-07, + "loss": 0.5442, + "step": 16321 + }, + { + "epoch": 4.332935085623258, + "grad_norm": 0.491677585627819, + "learning_rate": 8.997567468732882e-07, + "loss": 0.5462, + "step": 16322 + }, + { + "epoch": 4.333200584096641, + "grad_norm": 0.4820238971739135, + "learning_rate": 8.994885296454153e-07, + "loss": 0.5332, + "step": 16323 + }, + { + "epoch": 4.333466082570025, + "grad_norm": 0.4973643342920117, + "learning_rate": 8.992203436314351e-07, + "loss": 0.5778, + "step": 16324 + }, + { + "epoch": 4.333731581043409, + "grad_norm": 0.47249798507687596, + "learning_rate": 8.989521888365774e-07, + "loss": 0.5392, + "step": 16325 + }, + { + "epoch": 4.333997079516793, + "grad_norm": 0.4866948396650183, + "learning_rate": 8.986840652660713e-07, + "loss": 0.5289, + "step": 16326 + }, + { + "epoch": 4.3342625779901764, + "grad_norm": 0.47275864954586744, + "learning_rate": 8.984159729251477e-07, + "loss": 0.5343, + "step": 16327 + }, + { + "epoch": 4.3345280764635605, + "grad_norm": 0.47582861257715003, + "learning_rate": 8.981479118190329e-07, + "loss": 0.5527, + "step": 16328 + }, + { + "epoch": 4.334793574936944, + "grad_norm": 0.4676699656079308, + "learning_rate": 8.978798819529566e-07, + "loss": 0.4909, + "step": 16329 + }, + { + "epoch": 4.335059073410328, + "grad_norm": 0.4617238203609274, + "learning_rate": 8.976118833321453e-07, + "loss": 0.5054, + "step": 16330 + }, + { + "epoch": 4.335324571883712, + "grad_norm": 0.47904408964188494, + "learning_rate": 8.97343915961825e-07, + "loss": 0.5339, + "step": 16331 + }, + { + "epoch": 4.335590070357095, + "grad_norm": 0.47422561303894234, + "learning_rate": 8.970759798472229e-07, + "loss": 0.543, + "step": 16332 + }, + { + "epoch": 4.335855568830479, + "grad_norm": 0.46660585915131697, + "learning_rate": 8.968080749935632e-07, + "loss": 0.5601, + "step": 16333 + }, + { + "epoch": 4.336121067303863, + "grad_norm": 0.47113454487936157, + "learning_rate": 8.96540201406072e-07, + "loss": 0.5458, + "step": 16334 + }, + { + "epoch": 4.3363865657772465, + "grad_norm": 0.48941246030759455, + "learning_rate": 8.962723590899719e-07, + "loss": 0.5434, + "step": 16335 + }, + { + "epoch": 4.3366520642506305, + "grad_norm": 0.4762056612955769, + "learning_rate": 8.960045480504892e-07, + "loss": 0.5483, + "step": 16336 + }, + { + "epoch": 4.336917562724015, + "grad_norm": 0.46372563334411826, + "learning_rate": 8.957367682928434e-07, + "loss": 0.4953, + "step": 16337 + }, + { + "epoch": 4.337183061197398, + "grad_norm": 0.48682278493161013, + "learning_rate": 8.954690198222585e-07, + "loss": 0.5359, + "step": 16338 + }, + { + "epoch": 4.337448559670782, + "grad_norm": 0.4849276778574229, + "learning_rate": 8.952013026439571e-07, + "loss": 0.5301, + "step": 16339 + }, + { + "epoch": 4.337714058144166, + "grad_norm": 0.4840970923871177, + "learning_rate": 8.949336167631587e-07, + "loss": 0.5411, + "step": 16340 + }, + { + "epoch": 4.337979556617549, + "grad_norm": 0.47605207428791535, + "learning_rate": 8.946659621850856e-07, + "loss": 0.5192, + "step": 16341 + }, + { + "epoch": 4.338245055090933, + "grad_norm": 0.48124455871737276, + "learning_rate": 8.943983389149566e-07, + "loss": 0.5336, + "step": 16342 + }, + { + "epoch": 4.338510553564317, + "grad_norm": 0.465209246371635, + "learning_rate": 8.941307469579913e-07, + "loss": 0.5267, + "step": 16343 + }, + { + "epoch": 4.338776052037701, + "grad_norm": 0.4757634415782952, + "learning_rate": 8.938631863194074e-07, + "loss": 0.5577, + "step": 16344 + }, + { + "epoch": 4.339041550511085, + "grad_norm": 0.47984485563879087, + "learning_rate": 8.935956570044249e-07, + "loss": 0.5571, + "step": 16345 + }, + { + "epoch": 4.339307048984469, + "grad_norm": 0.48335074589637267, + "learning_rate": 8.933281590182591e-07, + "loss": 0.557, + "step": 16346 + }, + { + "epoch": 4.339572547457852, + "grad_norm": 0.4724573534049644, + "learning_rate": 8.930606923661289e-07, + "loss": 0.5136, + "step": 16347 + }, + { + "epoch": 4.339838045931236, + "grad_norm": 0.4588307856771073, + "learning_rate": 8.927932570532499e-07, + "loss": 0.5066, + "step": 16348 + }, + { + "epoch": 4.34010354440462, + "grad_norm": 0.4743879300495682, + "learning_rate": 8.925258530848366e-07, + "loss": 0.5422, + "step": 16349 + }, + { + "epoch": 4.340369042878003, + "grad_norm": 0.47978505944828254, + "learning_rate": 8.922584804661056e-07, + "loss": 0.5313, + "step": 16350 + }, + { + "epoch": 4.340634541351387, + "grad_norm": 0.4615665841172697, + "learning_rate": 8.919911392022698e-07, + "loss": 0.535, + "step": 16351 + }, + { + "epoch": 4.340900039824771, + "grad_norm": 0.45884887646469985, + "learning_rate": 8.91723829298545e-07, + "loss": 0.4936, + "step": 16352 + }, + { + "epoch": 4.341165538298155, + "grad_norm": 0.4819228592108612, + "learning_rate": 8.91456550760143e-07, + "loss": 0.5672, + "step": 16353 + }, + { + "epoch": 4.341431036771539, + "grad_norm": 0.48558608985376944, + "learning_rate": 8.911893035922767e-07, + "loss": 0.5183, + "step": 16354 + }, + { + "epoch": 4.341696535244922, + "grad_norm": 0.49858477790193595, + "learning_rate": 8.909220878001573e-07, + "loss": 0.5512, + "step": 16355 + }, + { + "epoch": 4.341962033718306, + "grad_norm": 0.49048724191280935, + "learning_rate": 8.906549033889974e-07, + "loss": 0.5554, + "step": 16356 + }, + { + "epoch": 4.34222753219169, + "grad_norm": 0.4855794160800867, + "learning_rate": 8.903877503640068e-07, + "loss": 0.534, + "step": 16357 + }, + { + "epoch": 4.342493030665073, + "grad_norm": 0.4861068765618246, + "learning_rate": 8.901206287303968e-07, + "loss": 0.5637, + "step": 16358 + }, + { + "epoch": 4.342758529138457, + "grad_norm": 0.48771376240422293, + "learning_rate": 8.898535384933762e-07, + "loss": 0.5435, + "step": 16359 + }, + { + "epoch": 4.3430240276118415, + "grad_norm": 0.48448852480354004, + "learning_rate": 8.89586479658153e-07, + "loss": 0.5329, + "step": 16360 + }, + { + "epoch": 4.343289526085225, + "grad_norm": 0.4612957097296225, + "learning_rate": 8.893194522299373e-07, + "loss": 0.5477, + "step": 16361 + }, + { + "epoch": 4.343555024558609, + "grad_norm": 0.4783091768413817, + "learning_rate": 8.890524562139352e-07, + "loss": 0.5305, + "step": 16362 + }, + { + "epoch": 4.343820523031993, + "grad_norm": 0.4846329953759779, + "learning_rate": 8.887854916153552e-07, + "loss": 0.5129, + "step": 16363 + }, + { + "epoch": 4.344086021505376, + "grad_norm": 0.4834315532007541, + "learning_rate": 8.885185584394021e-07, + "loss": 0.5688, + "step": 16364 + }, + { + "epoch": 4.34435151997876, + "grad_norm": 0.4853017459557938, + "learning_rate": 8.882516566912838e-07, + "loss": 0.5309, + "step": 16365 + }, + { + "epoch": 4.344617018452144, + "grad_norm": 0.48305038628455726, + "learning_rate": 8.879847863762043e-07, + "loss": 0.5502, + "step": 16366 + }, + { + "epoch": 4.344882516925527, + "grad_norm": 0.46782007025759254, + "learning_rate": 8.877179474993675e-07, + "loss": 0.4994, + "step": 16367 + }, + { + "epoch": 4.3451480153989115, + "grad_norm": 0.4772463568664131, + "learning_rate": 8.874511400659791e-07, + "loss": 0.5521, + "step": 16368 + }, + { + "epoch": 4.345413513872296, + "grad_norm": 0.47463804725150294, + "learning_rate": 8.871843640812408e-07, + "loss": 0.5403, + "step": 16369 + }, + { + "epoch": 4.345679012345679, + "grad_norm": 0.4806707999051861, + "learning_rate": 8.869176195503579e-07, + "loss": 0.5648, + "step": 16370 + }, + { + "epoch": 4.345944510819063, + "grad_norm": 0.46998330578487396, + "learning_rate": 8.866509064785295e-07, + "loss": 0.5529, + "step": 16371 + }, + { + "epoch": 4.346210009292447, + "grad_norm": 0.4859555482762523, + "learning_rate": 8.863842248709592e-07, + "loss": 0.5485, + "step": 16372 + }, + { + "epoch": 4.34647550776583, + "grad_norm": 0.4774210319585395, + "learning_rate": 8.861175747328465e-07, + "loss": 0.5291, + "step": 16373 + }, + { + "epoch": 4.346741006239214, + "grad_norm": 0.46802633008101124, + "learning_rate": 8.858509560693934e-07, + "loss": 0.5493, + "step": 16374 + }, + { + "epoch": 4.347006504712598, + "grad_norm": 0.4690032759470632, + "learning_rate": 8.855843688857979e-07, + "loss": 0.5346, + "step": 16375 + }, + { + "epoch": 4.3472720031859815, + "grad_norm": 0.47376021796352863, + "learning_rate": 8.853178131872611e-07, + "loss": 0.5493, + "step": 16376 + }, + { + "epoch": 4.347537501659366, + "grad_norm": 0.4783184504129662, + "learning_rate": 8.8505128897898e-07, + "loss": 0.5591, + "step": 16377 + }, + { + "epoch": 4.34780300013275, + "grad_norm": 0.48061145571289315, + "learning_rate": 8.847847962661521e-07, + "loss": 0.5392, + "step": 16378 + }, + { + "epoch": 4.348068498606133, + "grad_norm": 0.4638283397117864, + "learning_rate": 8.845183350539763e-07, + "loss": 0.5021, + "step": 16379 + }, + { + "epoch": 4.348333997079517, + "grad_norm": 0.474443415607321, + "learning_rate": 8.842519053476476e-07, + "loss": 0.5289, + "step": 16380 + }, + { + "epoch": 4.3485994955529, + "grad_norm": 0.46414824985298847, + "learning_rate": 8.839855071523634e-07, + "loss": 0.5583, + "step": 16381 + }, + { + "epoch": 4.348864994026284, + "grad_norm": 0.48232483875074494, + "learning_rate": 8.837191404733186e-07, + "loss": 0.5557, + "step": 16382 + }, + { + "epoch": 4.349130492499668, + "grad_norm": 0.4792298494066698, + "learning_rate": 8.834528053157082e-07, + "loss": 0.5375, + "step": 16383 + }, + { + "epoch": 4.3493959909730515, + "grad_norm": 0.4868181308119856, + "learning_rate": 8.831865016847249e-07, + "loss": 0.5772, + "step": 16384 + }, + { + "epoch": 4.349661489446436, + "grad_norm": 0.4969230199257655, + "learning_rate": 8.829202295855635e-07, + "loss": 0.5143, + "step": 16385 + }, + { + "epoch": 4.34992698791982, + "grad_norm": 0.4786057844181593, + "learning_rate": 8.826539890234179e-07, + "loss": 0.5434, + "step": 16386 + }, + { + "epoch": 4.350192486393203, + "grad_norm": 0.4705197284882441, + "learning_rate": 8.823877800034783e-07, + "loss": 0.5202, + "step": 16387 + }, + { + "epoch": 4.350457984866587, + "grad_norm": 0.48086710614907097, + "learning_rate": 8.821216025309395e-07, + "loss": 0.5242, + "step": 16388 + }, + { + "epoch": 4.350723483339971, + "grad_norm": 0.4802249339539081, + "learning_rate": 8.818554566109891e-07, + "loss": 0.5378, + "step": 16389 + }, + { + "epoch": 4.350988981813354, + "grad_norm": 0.47031188344181596, + "learning_rate": 8.815893422488198e-07, + "loss": 0.5012, + "step": 16390 + }, + { + "epoch": 4.351254480286738, + "grad_norm": 0.47344812198782077, + "learning_rate": 8.813232594496201e-07, + "loss": 0.5211, + "step": 16391 + }, + { + "epoch": 4.351519978760122, + "grad_norm": 0.49847537240202067, + "learning_rate": 8.810572082185811e-07, + "loss": 0.5289, + "step": 16392 + }, + { + "epoch": 4.351785477233506, + "grad_norm": 0.48524962963409896, + "learning_rate": 8.807911885608902e-07, + "loss": 0.5372, + "step": 16393 + }, + { + "epoch": 4.35205097570689, + "grad_norm": 0.48752323248758206, + "learning_rate": 8.80525200481735e-07, + "loss": 0.5601, + "step": 16394 + }, + { + "epoch": 4.352316474180274, + "grad_norm": 0.4653201634358049, + "learning_rate": 8.802592439863041e-07, + "loss": 0.5575, + "step": 16395 + }, + { + "epoch": 4.352581972653657, + "grad_norm": 0.4710851386131509, + "learning_rate": 8.799933190797829e-07, + "loss": 0.5223, + "step": 16396 + }, + { + "epoch": 4.352847471127041, + "grad_norm": 0.4833994723769361, + "learning_rate": 8.797274257673596e-07, + "loss": 0.5325, + "step": 16397 + }, + { + "epoch": 4.353112969600425, + "grad_norm": 0.48754080848517856, + "learning_rate": 8.794615640542175e-07, + "loss": 0.5629, + "step": 16398 + }, + { + "epoch": 4.353378468073808, + "grad_norm": 0.48194049280322226, + "learning_rate": 8.791957339455445e-07, + "loss": 0.5473, + "step": 16399 + }, + { + "epoch": 4.353643966547192, + "grad_norm": 0.4893975002802356, + "learning_rate": 8.789299354465214e-07, + "loss": 0.5387, + "step": 16400 + }, + { + "epoch": 4.3539094650205765, + "grad_norm": 0.4787728565403449, + "learning_rate": 8.786641685623343e-07, + "loss": 0.5213, + "step": 16401 + }, + { + "epoch": 4.35417496349396, + "grad_norm": 0.47519259222751964, + "learning_rate": 8.783984332981649e-07, + "loss": 0.5224, + "step": 16402 + }, + { + "epoch": 4.354440461967344, + "grad_norm": 0.48565591029409966, + "learning_rate": 8.781327296591971e-07, + "loss": 0.5608, + "step": 16403 + }, + { + "epoch": 4.354705960440727, + "grad_norm": 0.471917558807958, + "learning_rate": 8.778670576506115e-07, + "loss": 0.5243, + "step": 16404 + }, + { + "epoch": 4.354971458914111, + "grad_norm": 0.47524778000913703, + "learning_rate": 8.776014172775907e-07, + "loss": 0.5198, + "step": 16405 + }, + { + "epoch": 4.355236957387495, + "grad_norm": 0.48780914634974865, + "learning_rate": 8.773358085453146e-07, + "loss": 0.5625, + "step": 16406 + }, + { + "epoch": 4.355502455860879, + "grad_norm": 0.4768870064046254, + "learning_rate": 8.770702314589624e-07, + "loss": 0.5034, + "step": 16407 + }, + { + "epoch": 4.3557679543342624, + "grad_norm": 0.4481681818768645, + "learning_rate": 8.768046860237151e-07, + "loss": 0.5147, + "step": 16408 + }, + { + "epoch": 4.3560334528076465, + "grad_norm": 0.4840034071226703, + "learning_rate": 8.765391722447497e-07, + "loss": 0.538, + "step": 16409 + }, + { + "epoch": 4.35629895128103, + "grad_norm": 0.4701280455740269, + "learning_rate": 8.762736901272462e-07, + "loss": 0.5079, + "step": 16410 + }, + { + "epoch": 4.356564449754414, + "grad_norm": 0.48662864316860477, + "learning_rate": 8.760082396763814e-07, + "loss": 0.5521, + "step": 16411 + }, + { + "epoch": 4.356829948227798, + "grad_norm": 0.4740126672256831, + "learning_rate": 8.757428208973315e-07, + "loss": 0.5712, + "step": 16412 + }, + { + "epoch": 4.357095446701181, + "grad_norm": 0.47819925643180794, + "learning_rate": 8.754774337952741e-07, + "loss": 0.5481, + "step": 16413 + }, + { + "epoch": 4.357360945174565, + "grad_norm": 0.47208656122415316, + "learning_rate": 8.752120783753834e-07, + "loss": 0.5171, + "step": 16414 + }, + { + "epoch": 4.357626443647949, + "grad_norm": 0.4730238657390892, + "learning_rate": 8.749467546428361e-07, + "loss": 0.5196, + "step": 16415 + }, + { + "epoch": 4.3578919421213325, + "grad_norm": 0.47602421248567667, + "learning_rate": 8.746814626028061e-07, + "loss": 0.5282, + "step": 16416 + }, + { + "epoch": 4.3581574405947165, + "grad_norm": 0.48629965541526915, + "learning_rate": 8.744162022604671e-07, + "loss": 0.5147, + "step": 16417 + }, + { + "epoch": 4.358422939068101, + "grad_norm": 0.4767117987420647, + "learning_rate": 8.741509736209914e-07, + "loss": 0.537, + "step": 16418 + }, + { + "epoch": 4.358688437541484, + "grad_norm": 0.5089207286591754, + "learning_rate": 8.738857766895531e-07, + "loss": 0.5862, + "step": 16419 + }, + { + "epoch": 4.358953936014868, + "grad_norm": 0.4650629990211462, + "learning_rate": 8.736206114713228e-07, + "loss": 0.5253, + "step": 16420 + }, + { + "epoch": 4.359219434488252, + "grad_norm": 0.46228703190055936, + "learning_rate": 8.733554779714735e-07, + "loss": 0.5157, + "step": 16421 + }, + { + "epoch": 4.359484932961635, + "grad_norm": 0.4856301585054266, + "learning_rate": 8.730903761951751e-07, + "loss": 0.5256, + "step": 16422 + }, + { + "epoch": 4.359750431435019, + "grad_norm": 0.4879196366426895, + "learning_rate": 8.728253061475969e-07, + "loss": 0.5563, + "step": 16423 + }, + { + "epoch": 4.360015929908403, + "grad_norm": 0.4934569647011793, + "learning_rate": 8.725602678339102e-07, + "loss": 0.5337, + "step": 16424 + }, + { + "epoch": 4.360281428381787, + "grad_norm": 0.4813837401848103, + "learning_rate": 8.722952612592819e-07, + "loss": 0.5497, + "step": 16425 + }, + { + "epoch": 4.360546926855171, + "grad_norm": 0.46357170336120795, + "learning_rate": 8.720302864288824e-07, + "loss": 0.4824, + "step": 16426 + }, + { + "epoch": 4.360812425328555, + "grad_norm": 0.4845317083872418, + "learning_rate": 8.717653433478771e-07, + "loss": 0.5295, + "step": 16427 + }, + { + "epoch": 4.361077923801938, + "grad_norm": 0.4831719751238828, + "learning_rate": 8.715004320214362e-07, + "loss": 0.5494, + "step": 16428 + }, + { + "epoch": 4.361343422275322, + "grad_norm": 0.47314500074569443, + "learning_rate": 8.712355524547222e-07, + "loss": 0.4967, + "step": 16429 + }, + { + "epoch": 4.361608920748706, + "grad_norm": 0.486665266853582, + "learning_rate": 8.709707046529029e-07, + "loss": 0.5471, + "step": 16430 + }, + { + "epoch": 4.361874419222089, + "grad_norm": 0.47753467377033604, + "learning_rate": 8.707058886211445e-07, + "loss": 0.5524, + "step": 16431 + }, + { + "epoch": 4.362139917695473, + "grad_norm": 0.47214797699595473, + "learning_rate": 8.704411043646093e-07, + "loss": 0.5028, + "step": 16432 + }, + { + "epoch": 4.362405416168857, + "grad_norm": 0.49130154937513226, + "learning_rate": 8.701763518884635e-07, + "loss": 0.5456, + "step": 16433 + }, + { + "epoch": 4.362670914642241, + "grad_norm": 0.4804179284476958, + "learning_rate": 8.699116311978692e-07, + "loss": 0.5318, + "step": 16434 + }, + { + "epoch": 4.362936413115625, + "grad_norm": 0.4859676460896525, + "learning_rate": 8.696469422979892e-07, + "loss": 0.541, + "step": 16435 + }, + { + "epoch": 4.363201911589009, + "grad_norm": 0.4686204562038585, + "learning_rate": 8.693822851939848e-07, + "loss": 0.5258, + "step": 16436 + }, + { + "epoch": 4.363467410062392, + "grad_norm": 0.5048712585747511, + "learning_rate": 8.691176598910192e-07, + "loss": 0.5633, + "step": 16437 + }, + { + "epoch": 4.363732908535776, + "grad_norm": 0.4900047480155294, + "learning_rate": 8.688530663942512e-07, + "loss": 0.5316, + "step": 16438 + }, + { + "epoch": 4.363998407009159, + "grad_norm": 0.4902297600612942, + "learning_rate": 8.685885047088433e-07, + "loss": 0.5542, + "step": 16439 + }, + { + "epoch": 4.364263905482543, + "grad_norm": 0.4688927481096436, + "learning_rate": 8.683239748399538e-07, + "loss": 0.521, + "step": 16440 + }, + { + "epoch": 4.3645294039559275, + "grad_norm": 0.4768434337273866, + "learning_rate": 8.680594767927411e-07, + "loss": 0.491, + "step": 16441 + }, + { + "epoch": 4.364794902429311, + "grad_norm": 0.47306849264282297, + "learning_rate": 8.677950105723651e-07, + "loss": 0.5528, + "step": 16442 + }, + { + "epoch": 4.365060400902695, + "grad_norm": 0.4836012128268889, + "learning_rate": 8.675305761839817e-07, + "loss": 0.56, + "step": 16443 + }, + { + "epoch": 4.365325899376079, + "grad_norm": 0.47115084782918304, + "learning_rate": 8.672661736327501e-07, + "loss": 0.4955, + "step": 16444 + }, + { + "epoch": 4.365591397849462, + "grad_norm": 0.47490498342574056, + "learning_rate": 8.670018029238256e-07, + "loss": 0.5381, + "step": 16445 + }, + { + "epoch": 4.365856896322846, + "grad_norm": 0.5028330251847454, + "learning_rate": 8.667374640623643e-07, + "loss": 0.5486, + "step": 16446 + }, + { + "epoch": 4.36612239479623, + "grad_norm": 0.45846120956587877, + "learning_rate": 8.664731570535203e-07, + "loss": 0.525, + "step": 16447 + }, + { + "epoch": 4.366387893269613, + "grad_norm": 0.48595478647348195, + "learning_rate": 8.662088819024506e-07, + "loss": 0.5381, + "step": 16448 + }, + { + "epoch": 4.3666533917429975, + "grad_norm": 0.48037713207581445, + "learning_rate": 8.659446386143067e-07, + "loss": 0.506, + "step": 16449 + }, + { + "epoch": 4.366918890216382, + "grad_norm": 0.4952378953379151, + "learning_rate": 8.656804271942435e-07, + "loss": 0.5437, + "step": 16450 + }, + { + "epoch": 4.367184388689765, + "grad_norm": 0.48205468594592643, + "learning_rate": 8.654162476474148e-07, + "loss": 0.4611, + "step": 16451 + }, + { + "epoch": 4.367449887163149, + "grad_norm": 0.4805859708436209, + "learning_rate": 8.651520999789697e-07, + "loss": 0.5111, + "step": 16452 + }, + { + "epoch": 4.367715385636533, + "grad_norm": 0.4713638114590739, + "learning_rate": 8.648879841940628e-07, + "loss": 0.5422, + "step": 16453 + }, + { + "epoch": 4.367980884109916, + "grad_norm": 0.4855901660460948, + "learning_rate": 8.646239002978423e-07, + "loss": 0.5352, + "step": 16454 + }, + { + "epoch": 4.3682463825833, + "grad_norm": 0.47968039391209205, + "learning_rate": 8.643598482954609e-07, + "loss": 0.5645, + "step": 16455 + }, + { + "epoch": 4.368511881056684, + "grad_norm": 0.47576115075520664, + "learning_rate": 8.640958281920664e-07, + "loss": 0.5002, + "step": 16456 + }, + { + "epoch": 4.3687773795300675, + "grad_norm": 0.4847981982072536, + "learning_rate": 8.638318399928094e-07, + "loss": 0.5331, + "step": 16457 + }, + { + "epoch": 4.369042878003452, + "grad_norm": 0.47499641056420255, + "learning_rate": 8.635678837028375e-07, + "loss": 0.5336, + "step": 16458 + }, + { + "epoch": 4.369308376476836, + "grad_norm": 0.46533369416006304, + "learning_rate": 8.633039593272979e-07, + "loss": 0.5132, + "step": 16459 + }, + { + "epoch": 4.369573874950219, + "grad_norm": 0.4493736123311235, + "learning_rate": 8.63040066871339e-07, + "loss": 0.4804, + "step": 16460 + }, + { + "epoch": 4.369839373423603, + "grad_norm": 0.4867906347780406, + "learning_rate": 8.62776206340106e-07, + "loss": 0.5451, + "step": 16461 + }, + { + "epoch": 4.370104871896986, + "grad_norm": 0.47350326379858393, + "learning_rate": 8.625123777387465e-07, + "loss": 0.576, + "step": 16462 + }, + { + "epoch": 4.37037037037037, + "grad_norm": 0.48038342508389703, + "learning_rate": 8.622485810724049e-07, + "loss": 0.5212, + "step": 16463 + }, + { + "epoch": 4.370635868843754, + "grad_norm": 0.4786057351633928, + "learning_rate": 8.619848163462258e-07, + "loss": 0.5585, + "step": 16464 + }, + { + "epoch": 4.3709013673171375, + "grad_norm": 0.47812931114272944, + "learning_rate": 8.617210835653525e-07, + "loss": 0.525, + "step": 16465 + }, + { + "epoch": 4.371166865790522, + "grad_norm": 0.47572591113148127, + "learning_rate": 8.614573827349304e-07, + "loss": 0.5579, + "step": 16466 + }, + { + "epoch": 4.371432364263906, + "grad_norm": 0.49049259906487147, + "learning_rate": 8.611937138601001e-07, + "loss": 0.5838, + "step": 16467 + }, + { + "epoch": 4.371697862737289, + "grad_norm": 0.49229789461599016, + "learning_rate": 8.609300769460055e-07, + "loss": 0.5412, + "step": 16468 + }, + { + "epoch": 4.371963361210673, + "grad_norm": 0.46771799767543853, + "learning_rate": 8.606664719977878e-07, + "loss": 0.5295, + "step": 16469 + }, + { + "epoch": 4.372228859684057, + "grad_norm": 0.47877790215944677, + "learning_rate": 8.604028990205868e-07, + "loss": 0.528, + "step": 16470 + }, + { + "epoch": 4.37249435815744, + "grad_norm": 0.4811853678439923, + "learning_rate": 8.601393580195444e-07, + "loss": 0.5264, + "step": 16471 + }, + { + "epoch": 4.372759856630824, + "grad_norm": 0.49228722926254653, + "learning_rate": 8.598758489997988e-07, + "loss": 0.5267, + "step": 16472 + }, + { + "epoch": 4.373025355104208, + "grad_norm": 0.4838105018200669, + "learning_rate": 8.596123719664909e-07, + "loss": 0.5644, + "step": 16473 + }, + { + "epoch": 4.373290853577592, + "grad_norm": 0.4842143312238847, + "learning_rate": 8.59348926924758e-07, + "loss": 0.5426, + "step": 16474 + }, + { + "epoch": 4.373556352050976, + "grad_norm": 0.4877996631690246, + "learning_rate": 8.590855138797371e-07, + "loss": 0.5584, + "step": 16475 + }, + { + "epoch": 4.37382185052436, + "grad_norm": 0.48886061495515487, + "learning_rate": 8.588221328365673e-07, + "loss": 0.5191, + "step": 16476 + }, + { + "epoch": 4.374087348997743, + "grad_norm": 0.4729444029136176, + "learning_rate": 8.585587838003834e-07, + "loss": 0.5702, + "step": 16477 + }, + { + "epoch": 4.374352847471127, + "grad_norm": 0.4728799075768128, + "learning_rate": 8.582954667763227e-07, + "loss": 0.5123, + "step": 16478 + }, + { + "epoch": 4.374618345944511, + "grad_norm": 0.4849476488238954, + "learning_rate": 8.580321817695195e-07, + "loss": 0.5636, + "step": 16479 + }, + { + "epoch": 4.374883844417894, + "grad_norm": 0.49145738686351426, + "learning_rate": 8.577689287851104e-07, + "loss": 0.5487, + "step": 16480 + }, + { + "epoch": 4.375149342891278, + "grad_norm": 0.4899937451678765, + "learning_rate": 8.575057078282265e-07, + "loss": 0.5299, + "step": 16481 + }, + { + "epoch": 4.3754148413646625, + "grad_norm": 0.4918975105621015, + "learning_rate": 8.572425189040037e-07, + "loss": 0.5626, + "step": 16482 + }, + { + "epoch": 4.375680339838046, + "grad_norm": 0.46645229417290207, + "learning_rate": 8.569793620175729e-07, + "loss": 0.5324, + "step": 16483 + }, + { + "epoch": 4.37594583831143, + "grad_norm": 0.4836505718989422, + "learning_rate": 8.567162371740684e-07, + "loss": 0.5528, + "step": 16484 + }, + { + "epoch": 4.376211336784814, + "grad_norm": 0.4657065342430494, + "learning_rate": 8.564531443786197e-07, + "loss": 0.5316, + "step": 16485 + }, + { + "epoch": 4.376476835258197, + "grad_norm": 0.4857349698654789, + "learning_rate": 8.561900836363598e-07, + "loss": 0.5705, + "step": 16486 + }, + { + "epoch": 4.376742333731581, + "grad_norm": 0.4630604878641226, + "learning_rate": 8.55927054952418e-07, + "loss": 0.5029, + "step": 16487 + }, + { + "epoch": 4.377007832204965, + "grad_norm": 0.4731217490369237, + "learning_rate": 8.556640583319231e-07, + "loss": 0.4904, + "step": 16488 + }, + { + "epoch": 4.3772733306783485, + "grad_norm": 0.4826008371106639, + "learning_rate": 8.554010937800059e-07, + "loss": 0.5454, + "step": 16489 + }, + { + "epoch": 4.3775388291517325, + "grad_norm": 0.5005054669993394, + "learning_rate": 8.551381613017934e-07, + "loss": 0.5451, + "step": 16490 + }, + { + "epoch": 4.377804327625116, + "grad_norm": 0.47129794505716416, + "learning_rate": 8.548752609024156e-07, + "loss": 0.5071, + "step": 16491 + }, + { + "epoch": 4.3780698260985, + "grad_norm": 0.4878527156392193, + "learning_rate": 8.546123925869965e-07, + "loss": 0.5903, + "step": 16492 + }, + { + "epoch": 4.378335324571884, + "grad_norm": 0.49120077183024247, + "learning_rate": 8.543495563606651e-07, + "loss": 0.5436, + "step": 16493 + }, + { + "epoch": 4.378600823045267, + "grad_norm": 0.477550403604748, + "learning_rate": 8.540867522285459e-07, + "loss": 0.5384, + "step": 16494 + }, + { + "epoch": 4.378866321518651, + "grad_norm": 0.47572948529315984, + "learning_rate": 8.538239801957657e-07, + "loss": 0.5424, + "step": 16495 + }, + { + "epoch": 4.379131819992035, + "grad_norm": 0.47630281357863713, + "learning_rate": 8.535612402674476e-07, + "loss": 0.5294, + "step": 16496 + }, + { + "epoch": 4.3793973184654185, + "grad_norm": 0.4920222106821019, + "learning_rate": 8.532985324487172e-07, + "loss": 0.5663, + "step": 16497 + }, + { + "epoch": 4.3796628169388025, + "grad_norm": 0.47665864220000415, + "learning_rate": 8.530358567446976e-07, + "loss": 0.5249, + "step": 16498 + }, + { + "epoch": 4.379928315412187, + "grad_norm": 0.4791771372540893, + "learning_rate": 8.527732131605101e-07, + "loss": 0.5543, + "step": 16499 + }, + { + "epoch": 4.38019381388557, + "grad_norm": 0.49087745991054277, + "learning_rate": 8.525106017012788e-07, + "loss": 0.5504, + "step": 16500 + }, + { + "epoch": 4.380459312358954, + "grad_norm": 0.4653373786361846, + "learning_rate": 8.522480223721241e-07, + "loss": 0.5215, + "step": 16501 + }, + { + "epoch": 4.380724810832338, + "grad_norm": 0.4730824873341873, + "learning_rate": 8.519854751781679e-07, + "loss": 0.5544, + "step": 16502 + }, + { + "epoch": 4.380990309305721, + "grad_norm": 0.4865970197321445, + "learning_rate": 8.517229601245299e-07, + "loss": 0.5477, + "step": 16503 + }, + { + "epoch": 4.381255807779105, + "grad_norm": 0.48863799887270964, + "learning_rate": 8.514604772163293e-07, + "loss": 0.5691, + "step": 16504 + }, + { + "epoch": 4.381521306252489, + "grad_norm": 0.4861374617992733, + "learning_rate": 8.511980264586861e-07, + "loss": 0.545, + "step": 16505 + }, + { + "epoch": 4.381786804725873, + "grad_norm": 0.4774383052610317, + "learning_rate": 8.50935607856718e-07, + "loss": 0.5583, + "step": 16506 + }, + { + "epoch": 4.382052303199257, + "grad_norm": 0.47449155817775995, + "learning_rate": 8.506732214155436e-07, + "loss": 0.5116, + "step": 16507 + }, + { + "epoch": 4.382317801672641, + "grad_norm": 0.4670936394096365, + "learning_rate": 8.504108671402791e-07, + "loss": 0.5557, + "step": 16508 + }, + { + "epoch": 4.382583300146024, + "grad_norm": 0.46826203165877606, + "learning_rate": 8.501485450360431e-07, + "loss": 0.5524, + "step": 16509 + }, + { + "epoch": 4.382848798619408, + "grad_norm": 0.47470717086633746, + "learning_rate": 8.498862551079484e-07, + "loss": 0.5171, + "step": 16510 + }, + { + "epoch": 4.383114297092792, + "grad_norm": 0.4863571357843977, + "learning_rate": 8.496239973611126e-07, + "loss": 0.5554, + "step": 16511 + }, + { + "epoch": 4.383379795566175, + "grad_norm": 0.46605684082027554, + "learning_rate": 8.493617718006489e-07, + "loss": 0.5274, + "step": 16512 + }, + { + "epoch": 4.383645294039559, + "grad_norm": 0.49102223487760194, + "learning_rate": 8.49099578431673e-07, + "loss": 0.5538, + "step": 16513 + }, + { + "epoch": 4.3839107925129435, + "grad_norm": 0.4824003622620835, + "learning_rate": 8.488374172592976e-07, + "loss": 0.5646, + "step": 16514 + }, + { + "epoch": 4.384176290986327, + "grad_norm": 0.480698107916931, + "learning_rate": 8.485752882886342e-07, + "loss": 0.5682, + "step": 16515 + }, + { + "epoch": 4.384441789459711, + "grad_norm": 0.47950961080315, + "learning_rate": 8.483131915247969e-07, + "loss": 0.5509, + "step": 16516 + }, + { + "epoch": 4.384707287933095, + "grad_norm": 0.4908380095054776, + "learning_rate": 8.480511269728955e-07, + "loss": 0.5466, + "step": 16517 + }, + { + "epoch": 4.384972786406478, + "grad_norm": 0.4869741368537473, + "learning_rate": 8.477890946380429e-07, + "loss": 0.5515, + "step": 16518 + }, + { + "epoch": 4.385238284879862, + "grad_norm": 0.4801131785219321, + "learning_rate": 8.475270945253472e-07, + "loss": 0.5232, + "step": 16519 + }, + { + "epoch": 4.385503783353245, + "grad_norm": 0.4697789475375817, + "learning_rate": 8.472651266399201e-07, + "loss": 0.5413, + "step": 16520 + }, + { + "epoch": 4.385769281826629, + "grad_norm": 0.47088206796034004, + "learning_rate": 8.470031909868695e-07, + "loss": 0.5385, + "step": 16521 + }, + { + "epoch": 4.3860347803000135, + "grad_norm": 0.4595517236539239, + "learning_rate": 8.467412875713032e-07, + "loss": 0.4956, + "step": 16522 + }, + { + "epoch": 4.386300278773397, + "grad_norm": 0.49018867475150263, + "learning_rate": 8.464794163983303e-07, + "loss": 0.5412, + "step": 16523 + }, + { + "epoch": 4.386565777246781, + "grad_norm": 0.47797847688737366, + "learning_rate": 8.462175774730566e-07, + "loss": 0.5342, + "step": 16524 + }, + { + "epoch": 4.386831275720165, + "grad_norm": 0.4720384625925957, + "learning_rate": 8.459557708005902e-07, + "loss": 0.5512, + "step": 16525 + }, + { + "epoch": 4.387096774193548, + "grad_norm": 0.47004813559713393, + "learning_rate": 8.456939963860358e-07, + "loss": 0.5348, + "step": 16526 + }, + { + "epoch": 4.387362272666932, + "grad_norm": 0.47210250729405256, + "learning_rate": 8.454322542344992e-07, + "loss": 0.548, + "step": 16527 + }, + { + "epoch": 4.387627771140316, + "grad_norm": 0.5069301901037149, + "learning_rate": 8.451705443510838e-07, + "loss": 0.555, + "step": 16528 + }, + { + "epoch": 4.387893269613699, + "grad_norm": 0.4795219091120522, + "learning_rate": 8.449088667408953e-07, + "loss": 0.5341, + "step": 16529 + }, + { + "epoch": 4.3881587680870835, + "grad_norm": 0.4545990003373758, + "learning_rate": 8.446472214090354e-07, + "loss": 0.5031, + "step": 16530 + }, + { + "epoch": 4.388424266560468, + "grad_norm": 0.47664165031644634, + "learning_rate": 8.443856083606086e-07, + "loss": 0.538, + "step": 16531 + }, + { + "epoch": 4.388689765033851, + "grad_norm": 0.47883239273265965, + "learning_rate": 8.441240276007162e-07, + "loss": 0.5349, + "step": 16532 + }, + { + "epoch": 4.388955263507235, + "grad_norm": 0.483734129951063, + "learning_rate": 8.438624791344585e-07, + "loss": 0.5265, + "step": 16533 + }, + { + "epoch": 4.389220761980619, + "grad_norm": 0.4827238771762319, + "learning_rate": 8.436009629669384e-07, + "loss": 0.5664, + "step": 16534 + }, + { + "epoch": 4.389486260454002, + "grad_norm": 0.49356122586816153, + "learning_rate": 8.43339479103254e-07, + "loss": 0.4943, + "step": 16535 + }, + { + "epoch": 4.389751758927386, + "grad_norm": 0.479835733205734, + "learning_rate": 8.43078027548507e-07, + "loss": 0.5453, + "step": 16536 + }, + { + "epoch": 4.39001725740077, + "grad_norm": 0.46247650845404736, + "learning_rate": 8.428166083077952e-07, + "loss": 0.5061, + "step": 16537 + }, + { + "epoch": 4.3902827558741535, + "grad_norm": 0.46944378309625523, + "learning_rate": 8.42555221386217e-07, + "loss": 0.5194, + "step": 16538 + }, + { + "epoch": 4.390548254347538, + "grad_norm": 0.49150570669490645, + "learning_rate": 8.422938667888695e-07, + "loss": 0.5339, + "step": 16539 + }, + { + "epoch": 4.390813752820922, + "grad_norm": 0.4788798543853999, + "learning_rate": 8.420325445208508e-07, + "loss": 0.5458, + "step": 16540 + }, + { + "epoch": 4.391079251294305, + "grad_norm": 0.4843552123459841, + "learning_rate": 8.417712545872564e-07, + "loss": 0.5637, + "step": 16541 + }, + { + "epoch": 4.391344749767689, + "grad_norm": 0.4811327189883475, + "learning_rate": 8.415099969931823e-07, + "loss": 0.551, + "step": 16542 + }, + { + "epoch": 4.391610248241072, + "grad_norm": 0.48341769739296103, + "learning_rate": 8.412487717437259e-07, + "loss": 0.5119, + "step": 16543 + }, + { + "epoch": 4.391875746714456, + "grad_norm": 0.46833997994923415, + "learning_rate": 8.409875788439778e-07, + "loss": 0.5251, + "step": 16544 + }, + { + "epoch": 4.39214124518784, + "grad_norm": 0.4823732669104166, + "learning_rate": 8.407264182990349e-07, + "loss": 0.5425, + "step": 16545 + }, + { + "epoch": 4.392406743661224, + "grad_norm": 0.48815187719077285, + "learning_rate": 8.404652901139887e-07, + "loss": 0.5665, + "step": 16546 + }, + { + "epoch": 4.392672242134608, + "grad_norm": 0.47635689641019785, + "learning_rate": 8.402041942939332e-07, + "loss": 0.5238, + "step": 16547 + }, + { + "epoch": 4.392937740607992, + "grad_norm": 0.4778722834037993, + "learning_rate": 8.399431308439592e-07, + "loss": 0.5448, + "step": 16548 + }, + { + "epoch": 4.393203239081375, + "grad_norm": 0.4730014547639294, + "learning_rate": 8.396820997691593e-07, + "loss": 0.5332, + "step": 16549 + }, + { + "epoch": 4.393468737554759, + "grad_norm": 0.48587683855389313, + "learning_rate": 8.394211010746237e-07, + "loss": 0.56, + "step": 16550 + }, + { + "epoch": 4.393734236028143, + "grad_norm": 0.4900715009798407, + "learning_rate": 8.391601347654415e-07, + "loss": 0.5293, + "step": 16551 + }, + { + "epoch": 4.393999734501526, + "grad_norm": 0.48409093114950374, + "learning_rate": 8.388992008467043e-07, + "loss": 0.5858, + "step": 16552 + }, + { + "epoch": 4.39426523297491, + "grad_norm": 0.4716282235252544, + "learning_rate": 8.386382993234984e-07, + "loss": 0.518, + "step": 16553 + }, + { + "epoch": 4.394530731448294, + "grad_norm": 0.47956117951526955, + "learning_rate": 8.383774302009146e-07, + "loss": 0.549, + "step": 16554 + }, + { + "epoch": 4.394796229921678, + "grad_norm": 0.4848508500680948, + "learning_rate": 8.381165934840391e-07, + "loss": 0.5596, + "step": 16555 + }, + { + "epoch": 4.395061728395062, + "grad_norm": 0.4712953426031584, + "learning_rate": 8.37855789177959e-07, + "loss": 0.5455, + "step": 16556 + }, + { + "epoch": 4.395327226868446, + "grad_norm": 0.49910339687729405, + "learning_rate": 8.375950172877598e-07, + "loss": 0.543, + "step": 16557 + }, + { + "epoch": 4.395592725341829, + "grad_norm": 0.4689768542396897, + "learning_rate": 8.37334277818529e-07, + "loss": 0.5414, + "step": 16558 + }, + { + "epoch": 4.395858223815213, + "grad_norm": 0.4790907031745384, + "learning_rate": 8.370735707753497e-07, + "loss": 0.5292, + "step": 16559 + }, + { + "epoch": 4.396123722288597, + "grad_norm": 0.4684591991960239, + "learning_rate": 8.368128961633082e-07, + "loss": 0.5045, + "step": 16560 + }, + { + "epoch": 4.39638922076198, + "grad_norm": 0.4843910607353818, + "learning_rate": 8.365522539874874e-07, + "loss": 0.5345, + "step": 16561 + }, + { + "epoch": 4.396654719235364, + "grad_norm": 0.4814716658497471, + "learning_rate": 8.362916442529698e-07, + "loss": 0.5484, + "step": 16562 + }, + { + "epoch": 4.3969202177087485, + "grad_norm": 0.4808893516028332, + "learning_rate": 8.360310669648392e-07, + "loss": 0.5268, + "step": 16563 + }, + { + "epoch": 4.397185716182132, + "grad_norm": 0.488372798884931, + "learning_rate": 8.357705221281762e-07, + "loss": 0.506, + "step": 16564 + }, + { + "epoch": 4.397451214655516, + "grad_norm": 0.4540464216553004, + "learning_rate": 8.355100097480634e-07, + "loss": 0.5205, + "step": 16565 + }, + { + "epoch": 4.3977167131289, + "grad_norm": 0.48227418497272384, + "learning_rate": 8.35249529829581e-07, + "loss": 0.5328, + "step": 16566 + }, + { + "epoch": 4.397982211602283, + "grad_norm": 0.4991728818605788, + "learning_rate": 8.349890823778079e-07, + "loss": 0.5416, + "step": 16567 + }, + { + "epoch": 4.398247710075667, + "grad_norm": 0.4836881787915766, + "learning_rate": 8.347286673978253e-07, + "loss": 0.5436, + "step": 16568 + }, + { + "epoch": 4.398513208549051, + "grad_norm": 0.47149152568632324, + "learning_rate": 8.344682848947099e-07, + "loss": 0.5045, + "step": 16569 + }, + { + "epoch": 4.3987787070224345, + "grad_norm": 0.47823935343197504, + "learning_rate": 8.342079348735419e-07, + "loss": 0.5632, + "step": 16570 + }, + { + "epoch": 4.3990442054958185, + "grad_norm": 0.49255780728054716, + "learning_rate": 8.339476173393968e-07, + "loss": 0.5487, + "step": 16571 + }, + { + "epoch": 4.399309703969202, + "grad_norm": 0.48310629352880186, + "learning_rate": 8.336873322973537e-07, + "loss": 0.5452, + "step": 16572 + }, + { + "epoch": 4.399575202442586, + "grad_norm": 0.47366107896474235, + "learning_rate": 8.334270797524863e-07, + "loss": 0.5097, + "step": 16573 + }, + { + "epoch": 4.39984070091597, + "grad_norm": 0.49603665753956616, + "learning_rate": 8.331668597098719e-07, + "loss": 0.5408, + "step": 16574 + }, + { + "epoch": 4.400106199389353, + "grad_norm": 0.4861467766323368, + "learning_rate": 8.32906672174584e-07, + "loss": 0.5325, + "step": 16575 + }, + { + "epoch": 4.400371697862737, + "grad_norm": 0.49185672269455355, + "learning_rate": 8.326465171516987e-07, + "loss": 0.5412, + "step": 16576 + }, + { + "epoch": 4.400637196336121, + "grad_norm": 0.5036226909742733, + "learning_rate": 8.323863946462879e-07, + "loss": 0.5699, + "step": 16577 + }, + { + "epoch": 4.4009026948095045, + "grad_norm": 0.4934988122715888, + "learning_rate": 8.321263046634262e-07, + "loss": 0.5379, + "step": 16578 + }, + { + "epoch": 4.4011681932828886, + "grad_norm": 0.48140185505303684, + "learning_rate": 8.318662472081851e-07, + "loss": 0.5558, + "step": 16579 + }, + { + "epoch": 4.401433691756273, + "grad_norm": 0.4901830703409975, + "learning_rate": 8.316062222856358e-07, + "loss": 0.5225, + "step": 16580 + }, + { + "epoch": 4.401699190229656, + "grad_norm": 0.4847336852715075, + "learning_rate": 8.31346229900851e-07, + "loss": 0.5577, + "step": 16581 + }, + { + "epoch": 4.40196468870304, + "grad_norm": 0.47053959647474014, + "learning_rate": 8.310862700588992e-07, + "loss": 0.5241, + "step": 16582 + }, + { + "epoch": 4.402230187176424, + "grad_norm": 0.476964339308851, + "learning_rate": 8.308263427648522e-07, + "loss": 0.5296, + "step": 16583 + }, + { + "epoch": 4.402495685649807, + "grad_norm": 0.47501458555709297, + "learning_rate": 8.305664480237785e-07, + "loss": 0.5609, + "step": 16584 + }, + { + "epoch": 4.402761184123191, + "grad_norm": 0.4771999506840208, + "learning_rate": 8.303065858407467e-07, + "loss": 0.5334, + "step": 16585 + }, + { + "epoch": 4.403026682596575, + "grad_norm": 0.48438442130647985, + "learning_rate": 8.300467562208234e-07, + "loss": 0.5355, + "step": 16586 + }, + { + "epoch": 4.403292181069959, + "grad_norm": 0.4725408041333438, + "learning_rate": 8.297869591690771e-07, + "loss": 0.5349, + "step": 16587 + }, + { + "epoch": 4.403557679543343, + "grad_norm": 0.4832178313289328, + "learning_rate": 8.295271946905756e-07, + "loss": 0.5496, + "step": 16588 + }, + { + "epoch": 4.403823178016727, + "grad_norm": 0.48884472571661386, + "learning_rate": 8.292674627903838e-07, + "loss": 0.5517, + "step": 16589 + }, + { + "epoch": 4.40408867649011, + "grad_norm": 0.48661902335888485, + "learning_rate": 8.290077634735668e-07, + "loss": 0.5703, + "step": 16590 + }, + { + "epoch": 4.404354174963494, + "grad_norm": 0.4989595638043249, + "learning_rate": 8.287480967451894e-07, + "loss": 0.5703, + "step": 16591 + }, + { + "epoch": 4.404619673436878, + "grad_norm": 0.48283408291535, + "learning_rate": 8.284884626103165e-07, + "loss": 0.5549, + "step": 16592 + }, + { + "epoch": 4.404885171910261, + "grad_norm": 0.4769709345382824, + "learning_rate": 8.282288610740102e-07, + "loss": 0.5384, + "step": 16593 + }, + { + "epoch": 4.405150670383645, + "grad_norm": 0.4903974098635621, + "learning_rate": 8.279692921413354e-07, + "loss": 0.5535, + "step": 16594 + }, + { + "epoch": 4.4054161688570295, + "grad_norm": 0.4836010580018324, + "learning_rate": 8.277097558173533e-07, + "loss": 0.5474, + "step": 16595 + }, + { + "epoch": 4.405681667330413, + "grad_norm": 0.46643252334484436, + "learning_rate": 8.274502521071246e-07, + "loss": 0.5325, + "step": 16596 + }, + { + "epoch": 4.405947165803797, + "grad_norm": 0.48157133937959873, + "learning_rate": 8.271907810157117e-07, + "loss": 0.5588, + "step": 16597 + }, + { + "epoch": 4.406212664277181, + "grad_norm": 0.4774852041615962, + "learning_rate": 8.269313425481737e-07, + "loss": 0.5346, + "step": 16598 + }, + { + "epoch": 4.406478162750564, + "grad_norm": 0.47200433828200533, + "learning_rate": 8.266719367095716e-07, + "loss": 0.5246, + "step": 16599 + }, + { + "epoch": 4.406743661223948, + "grad_norm": 0.48085196306466316, + "learning_rate": 8.264125635049627e-07, + "loss": 0.5426, + "step": 16600 + }, + { + "epoch": 4.407009159697331, + "grad_norm": 0.48506682186569927, + "learning_rate": 8.261532229394084e-07, + "loss": 0.5335, + "step": 16601 + }, + { + "epoch": 4.407274658170715, + "grad_norm": 0.47262651255568006, + "learning_rate": 8.258939150179629e-07, + "loss": 0.5319, + "step": 16602 + }, + { + "epoch": 4.4075401566440995, + "grad_norm": 0.4838610920771168, + "learning_rate": 8.256346397456857e-07, + "loss": 0.5578, + "step": 16603 + }, + { + "epoch": 4.407805655117483, + "grad_norm": 0.47495246358248394, + "learning_rate": 8.253753971276319e-07, + "loss": 0.5452, + "step": 16604 + }, + { + "epoch": 4.408071153590867, + "grad_norm": 0.4620917749744837, + "learning_rate": 8.251161871688587e-07, + "loss": 0.4971, + "step": 16605 + }, + { + "epoch": 4.408336652064251, + "grad_norm": 0.46823420460255805, + "learning_rate": 8.248570098744199e-07, + "loss": 0.5385, + "step": 16606 + }, + { + "epoch": 4.408602150537634, + "grad_norm": 0.49352096457911426, + "learning_rate": 8.245978652493717e-07, + "loss": 0.5333, + "step": 16607 + }, + { + "epoch": 4.408867649011018, + "grad_norm": 0.4806888410227712, + "learning_rate": 8.24338753298767e-07, + "loss": 0.5265, + "step": 16608 + }, + { + "epoch": 4.409133147484402, + "grad_norm": 0.4795033100910115, + "learning_rate": 8.240796740276588e-07, + "loss": 0.5471, + "step": 16609 + }, + { + "epoch": 4.409398645957785, + "grad_norm": 0.4866614643725927, + "learning_rate": 8.238206274411012e-07, + "loss": 0.5157, + "step": 16610 + }, + { + "epoch": 4.4096641444311695, + "grad_norm": 0.4877475479925792, + "learning_rate": 8.235616135441444e-07, + "loss": 0.5654, + "step": 16611 + }, + { + "epoch": 4.409929642904554, + "grad_norm": 0.48497645434175274, + "learning_rate": 8.233026323418417e-07, + "loss": 0.5435, + "step": 16612 + }, + { + "epoch": 4.410195141377937, + "grad_norm": 0.4791618025070629, + "learning_rate": 8.230436838392427e-07, + "loss": 0.5564, + "step": 16613 + }, + { + "epoch": 4.410460639851321, + "grad_norm": 0.4753206556623597, + "learning_rate": 8.227847680413969e-07, + "loss": 0.5151, + "step": 16614 + }, + { + "epoch": 4.410726138324705, + "grad_norm": 0.49408224235282816, + "learning_rate": 8.225258849533557e-07, + "loss": 0.5316, + "step": 16615 + }, + { + "epoch": 4.410991636798088, + "grad_norm": 0.4809189150117858, + "learning_rate": 8.222670345801661e-07, + "loss": 0.5672, + "step": 16616 + }, + { + "epoch": 4.411257135271472, + "grad_norm": 0.47512747077281287, + "learning_rate": 8.220082169268778e-07, + "loss": 0.576, + "step": 16617 + }, + { + "epoch": 4.411522633744856, + "grad_norm": 0.47325840946973086, + "learning_rate": 8.217494319985378e-07, + "loss": 0.53, + "step": 16618 + }, + { + "epoch": 4.4117881322182395, + "grad_norm": 0.4634513071347273, + "learning_rate": 8.214906798001928e-07, + "loss": 0.5648, + "step": 16619 + }, + { + "epoch": 4.412053630691624, + "grad_norm": 0.47887318760265474, + "learning_rate": 8.212319603368887e-07, + "loss": 0.5055, + "step": 16620 + }, + { + "epoch": 4.412319129165008, + "grad_norm": 0.4818422253452254, + "learning_rate": 8.209732736136725e-07, + "loss": 0.5506, + "step": 16621 + }, + { + "epoch": 4.412584627638391, + "grad_norm": 0.4933637411606336, + "learning_rate": 8.207146196355873e-07, + "loss": 0.5655, + "step": 16622 + }, + { + "epoch": 4.412850126111775, + "grad_norm": 0.47212837137853614, + "learning_rate": 8.204559984076796e-07, + "loss": 0.5542, + "step": 16623 + }, + { + "epoch": 4.413115624585159, + "grad_norm": 0.4770536731793999, + "learning_rate": 8.201974099349922e-07, + "loss": 0.5435, + "step": 16624 + }, + { + "epoch": 4.413381123058542, + "grad_norm": 0.47370131246906244, + "learning_rate": 8.199388542225672e-07, + "loss": 0.4914, + "step": 16625 + }, + { + "epoch": 4.413646621531926, + "grad_norm": 0.4788157878171548, + "learning_rate": 8.196803312754493e-07, + "loss": 0.5617, + "step": 16626 + }, + { + "epoch": 4.41391212000531, + "grad_norm": 0.47935543589093, + "learning_rate": 8.194218410986779e-07, + "loss": 0.5757, + "step": 16627 + }, + { + "epoch": 4.414177618478694, + "grad_norm": 0.49523343750168153, + "learning_rate": 8.191633836972962e-07, + "loss": 0.5256, + "step": 16628 + }, + { + "epoch": 4.414443116952078, + "grad_norm": 0.47880038529065844, + "learning_rate": 8.189049590763432e-07, + "loss": 0.5124, + "step": 16629 + }, + { + "epoch": 4.414708615425461, + "grad_norm": 0.47265548806566515, + "learning_rate": 8.186465672408608e-07, + "loss": 0.5396, + "step": 16630 + }, + { + "epoch": 4.414974113898845, + "grad_norm": 0.48925554747587374, + "learning_rate": 8.183882081958858e-07, + "loss": 0.5599, + "step": 16631 + }, + { + "epoch": 4.415239612372229, + "grad_norm": 0.4904556805467935, + "learning_rate": 8.181298819464578e-07, + "loss": 0.527, + "step": 16632 + }, + { + "epoch": 4.415505110845612, + "grad_norm": 0.48415443548359194, + "learning_rate": 8.17871588497616e-07, + "loss": 0.5239, + "step": 16633 + }, + { + "epoch": 4.415770609318996, + "grad_norm": 0.49475954637904473, + "learning_rate": 8.17613327854396e-07, + "loss": 0.5327, + "step": 16634 + }, + { + "epoch": 4.41603610779238, + "grad_norm": 0.4679431339657498, + "learning_rate": 8.173551000218369e-07, + "loss": 0.5072, + "step": 16635 + }, + { + "epoch": 4.416301606265764, + "grad_norm": 0.4980948548298773, + "learning_rate": 8.170969050049712e-07, + "loss": 0.5219, + "step": 16636 + }, + { + "epoch": 4.416567104739148, + "grad_norm": 0.4929770917999851, + "learning_rate": 8.168387428088376e-07, + "loss": 0.5539, + "step": 16637 + }, + { + "epoch": 4.416832603212532, + "grad_norm": 0.48826704303946095, + "learning_rate": 8.165806134384685e-07, + "loss": 0.5229, + "step": 16638 + }, + { + "epoch": 4.417098101685915, + "grad_norm": 0.4948866613781495, + "learning_rate": 8.163225168989e-07, + "loss": 0.5654, + "step": 16639 + }, + { + "epoch": 4.417363600159299, + "grad_norm": 0.48333305550936245, + "learning_rate": 8.160644531951642e-07, + "loss": 0.5702, + "step": 16640 + }, + { + "epoch": 4.417629098632683, + "grad_norm": 0.4922566496502325, + "learning_rate": 8.15806422332295e-07, + "loss": 0.5148, + "step": 16641 + }, + { + "epoch": 4.417894597106066, + "grad_norm": 0.488723838977445, + "learning_rate": 8.155484243153244e-07, + "loss": 0.51, + "step": 16642 + }, + { + "epoch": 4.41816009557945, + "grad_norm": 0.4838047773076337, + "learning_rate": 8.152904591492831e-07, + "loss": 0.5429, + "step": 16643 + }, + { + "epoch": 4.4184255940528345, + "grad_norm": 0.49117705174130283, + "learning_rate": 8.150325268392035e-07, + "loss": 0.5272, + "step": 16644 + }, + { + "epoch": 4.418691092526218, + "grad_norm": 0.49278922379736834, + "learning_rate": 8.147746273901138e-07, + "loss": 0.5288, + "step": 16645 + }, + { + "epoch": 4.418956590999602, + "grad_norm": 0.4987614344238964, + "learning_rate": 8.145167608070464e-07, + "loss": 0.5095, + "step": 16646 + }, + { + "epoch": 4.419222089472986, + "grad_norm": 0.48822431453689114, + "learning_rate": 8.142589270950288e-07, + "loss": 0.5324, + "step": 16647 + }, + { + "epoch": 4.419487587946369, + "grad_norm": 0.4881151593755466, + "learning_rate": 8.140011262590894e-07, + "loss": 0.5671, + "step": 16648 + }, + { + "epoch": 4.419753086419753, + "grad_norm": 0.4958922765345893, + "learning_rate": 8.137433583042553e-07, + "loss": 0.5493, + "step": 16649 + }, + { + "epoch": 4.420018584893137, + "grad_norm": 0.4828907017588553, + "learning_rate": 8.134856232355554e-07, + "loss": 0.5523, + "step": 16650 + }, + { + "epoch": 4.4202840833665205, + "grad_norm": 0.5013118522835209, + "learning_rate": 8.132279210580141e-07, + "loss": 0.5141, + "step": 16651 + }, + { + "epoch": 4.4205495818399045, + "grad_norm": 0.4750123617068865, + "learning_rate": 8.129702517766591e-07, + "loss": 0.5629, + "step": 16652 + }, + { + "epoch": 4.420815080313288, + "grad_norm": 0.4824018506989978, + "learning_rate": 8.127126153965148e-07, + "loss": 0.5649, + "step": 16653 + }, + { + "epoch": 4.421080578786672, + "grad_norm": 0.4719677230021761, + "learning_rate": 8.124550119226049e-07, + "loss": 0.5629, + "step": 16654 + }, + { + "epoch": 4.421346077260056, + "grad_norm": 0.4742841990041128, + "learning_rate": 8.121974413599549e-07, + "loss": 0.5243, + "step": 16655 + }, + { + "epoch": 4.42161157573344, + "grad_norm": 0.4846558428319592, + "learning_rate": 8.119399037135864e-07, + "loss": 0.5375, + "step": 16656 + }, + { + "epoch": 4.421877074206823, + "grad_norm": 0.48948565814441936, + "learning_rate": 8.116823989885236e-07, + "loss": 0.5732, + "step": 16657 + }, + { + "epoch": 4.422142572680207, + "grad_norm": 0.479782947037128, + "learning_rate": 8.114249271897881e-07, + "loss": 0.5447, + "step": 16658 + }, + { + "epoch": 4.4224080711535905, + "grad_norm": 0.48622118740659476, + "learning_rate": 8.111674883223997e-07, + "loss": 0.562, + "step": 16659 + }, + { + "epoch": 4.4226735696269746, + "grad_norm": 0.48554013848007177, + "learning_rate": 8.109100823913813e-07, + "loss": 0.5512, + "step": 16660 + }, + { + "epoch": 4.422939068100359, + "grad_norm": 0.4786866890688573, + "learning_rate": 8.106527094017508e-07, + "loss": 0.5319, + "step": 16661 + }, + { + "epoch": 4.423204566573742, + "grad_norm": 0.4831168918515697, + "learning_rate": 8.103953693585295e-07, + "loss": 0.5585, + "step": 16662 + }, + { + "epoch": 4.423470065047126, + "grad_norm": 0.4701209835145499, + "learning_rate": 8.101380622667349e-07, + "loss": 0.5144, + "step": 16663 + }, + { + "epoch": 4.42373556352051, + "grad_norm": 0.4697152135209769, + "learning_rate": 8.098807881313869e-07, + "loss": 0.5394, + "step": 16664 + }, + { + "epoch": 4.424001061993893, + "grad_norm": 0.4776201155560845, + "learning_rate": 8.096235469575e-07, + "loss": 0.5445, + "step": 16665 + }, + { + "epoch": 4.424266560467277, + "grad_norm": 0.48887263534609116, + "learning_rate": 8.093663387500938e-07, + "loss": 0.5518, + "step": 16666 + }, + { + "epoch": 4.424532058940661, + "grad_norm": 0.4665235798745896, + "learning_rate": 8.091091635141824e-07, + "loss": 0.5822, + "step": 16667 + }, + { + "epoch": 4.424797557414045, + "grad_norm": 0.48493862822043754, + "learning_rate": 8.088520212547832e-07, + "loss": 0.5015, + "step": 16668 + }, + { + "epoch": 4.425063055887429, + "grad_norm": 0.4718393227144154, + "learning_rate": 8.085949119769093e-07, + "loss": 0.531, + "step": 16669 + }, + { + "epoch": 4.425328554360813, + "grad_norm": 0.480798850924045, + "learning_rate": 8.083378356855767e-07, + "loss": 0.5689, + "step": 16670 + }, + { + "epoch": 4.425594052834196, + "grad_norm": 0.4698678135095887, + "learning_rate": 8.080807923857983e-07, + "loss": 0.5409, + "step": 16671 + }, + { + "epoch": 4.42585955130758, + "grad_norm": 0.47302478007096793, + "learning_rate": 8.078237820825862e-07, + "loss": 0.5207, + "step": 16672 + }, + { + "epoch": 4.426125049780964, + "grad_norm": 0.47353423074426676, + "learning_rate": 8.075668047809543e-07, + "loss": 0.5701, + "step": 16673 + }, + { + "epoch": 4.426390548254347, + "grad_norm": 0.476212131602487, + "learning_rate": 8.073098604859125e-07, + "loss": 0.5246, + "step": 16674 + }, + { + "epoch": 4.426656046727731, + "grad_norm": 0.4897021972643576, + "learning_rate": 8.07052949202474e-07, + "loss": 0.5205, + "step": 16675 + }, + { + "epoch": 4.4269215452011155, + "grad_norm": 0.4837969089029706, + "learning_rate": 8.067960709356479e-07, + "loss": 0.5441, + "step": 16676 + }, + { + "epoch": 4.427187043674499, + "grad_norm": 0.4788561982980135, + "learning_rate": 8.065392256904439e-07, + "loss": 0.5579, + "step": 16677 + }, + { + "epoch": 4.427452542147883, + "grad_norm": 0.4891693595594505, + "learning_rate": 8.062824134718706e-07, + "loss": 0.5647, + "step": 16678 + }, + { + "epoch": 4.427718040621267, + "grad_norm": 0.46813081890134806, + "learning_rate": 8.060256342849373e-07, + "loss": 0.5412, + "step": 16679 + }, + { + "epoch": 4.42798353909465, + "grad_norm": 0.4792719926285562, + "learning_rate": 8.057688881346523e-07, + "loss": 0.5505, + "step": 16680 + }, + { + "epoch": 4.428249037568034, + "grad_norm": 0.4661606059578263, + "learning_rate": 8.055121750260215e-07, + "loss": 0.5292, + "step": 16681 + }, + { + "epoch": 4.428514536041417, + "grad_norm": 0.48008116629997893, + "learning_rate": 8.052554949640537e-07, + "loss": 0.5196, + "step": 16682 + }, + { + "epoch": 4.428780034514801, + "grad_norm": 0.4827863889923207, + "learning_rate": 8.049988479537515e-07, + "loss": 0.536, + "step": 16683 + }, + { + "epoch": 4.4290455329881855, + "grad_norm": 0.4652426058007033, + "learning_rate": 8.047422340001229e-07, + "loss": 0.5072, + "step": 16684 + }, + { + "epoch": 4.429311031461569, + "grad_norm": 0.48168034844975, + "learning_rate": 8.044856531081705e-07, + "loss": 0.5345, + "step": 16685 + }, + { + "epoch": 4.429576529934953, + "grad_norm": 0.47899488626405107, + "learning_rate": 8.042291052828999e-07, + "loss": 0.5617, + "step": 16686 + }, + { + "epoch": 4.429842028408337, + "grad_norm": 0.46937329015362367, + "learning_rate": 8.039725905293139e-07, + "loss": 0.5276, + "step": 16687 + }, + { + "epoch": 4.43010752688172, + "grad_norm": 0.4943864339158835, + "learning_rate": 8.03716108852414e-07, + "loss": 0.5466, + "step": 16688 + }, + { + "epoch": 4.430373025355104, + "grad_norm": 0.4662196098037641, + "learning_rate": 8.03459660257204e-07, + "loss": 0.5432, + "step": 16689 + }, + { + "epoch": 4.430638523828488, + "grad_norm": 0.4612466141456489, + "learning_rate": 8.032032447486837e-07, + "loss": 0.4958, + "step": 16690 + }, + { + "epoch": 4.430904022301871, + "grad_norm": 0.4816510280112654, + "learning_rate": 8.029468623318554e-07, + "loss": 0.5306, + "step": 16691 + }, + { + "epoch": 4.4311695207752555, + "grad_norm": 0.4786703416356911, + "learning_rate": 8.026905130117177e-07, + "loss": 0.553, + "step": 16692 + }, + { + "epoch": 4.43143501924864, + "grad_norm": 0.48149719945164426, + "learning_rate": 8.024341967932722e-07, + "loss": 0.5351, + "step": 16693 + }, + { + "epoch": 4.431700517722023, + "grad_norm": 0.47399154768588453, + "learning_rate": 8.021779136815147e-07, + "loss": 0.5149, + "step": 16694 + }, + { + "epoch": 4.431966016195407, + "grad_norm": 0.47216508910420213, + "learning_rate": 8.019216636814455e-07, + "loss": 0.5892, + "step": 16695 + }, + { + "epoch": 4.432231514668791, + "grad_norm": 0.4796382068302887, + "learning_rate": 8.016654467980609e-07, + "loss": 0.5366, + "step": 16696 + }, + { + "epoch": 4.432497013142174, + "grad_norm": 0.49496431736136587, + "learning_rate": 8.01409263036359e-07, + "loss": 0.5495, + "step": 16697 + }, + { + "epoch": 4.432762511615558, + "grad_norm": 0.45821421861579115, + "learning_rate": 8.011531124013347e-07, + "loss": 0.5199, + "step": 16698 + }, + { + "epoch": 4.433028010088942, + "grad_norm": 0.4802740697578364, + "learning_rate": 8.008969948979848e-07, + "loss": 0.5339, + "step": 16699 + }, + { + "epoch": 4.4332935085623255, + "grad_norm": 0.48833097301451095, + "learning_rate": 8.006409105313037e-07, + "loss": 0.5564, + "step": 16700 + }, + { + "epoch": 4.43355900703571, + "grad_norm": 0.48503996782502184, + "learning_rate": 8.003848593062849e-07, + "loss": 0.5462, + "step": 16701 + }, + { + "epoch": 4.433824505509094, + "grad_norm": 0.47691588345034797, + "learning_rate": 8.001288412279232e-07, + "loss": 0.5645, + "step": 16702 + }, + { + "epoch": 4.434090003982477, + "grad_norm": 0.471904522744315, + "learning_rate": 7.998728563012106e-07, + "loss": 0.5323, + "step": 16703 + }, + { + "epoch": 4.434355502455861, + "grad_norm": 0.47149038026698187, + "learning_rate": 7.996169045311405e-07, + "loss": 0.5263, + "step": 16704 + }, + { + "epoch": 4.434621000929245, + "grad_norm": 0.47541902670267966, + "learning_rate": 7.993609859227042e-07, + "loss": 0.5024, + "step": 16705 + }, + { + "epoch": 4.434886499402628, + "grad_norm": 0.47867409564014474, + "learning_rate": 7.991051004808917e-07, + "loss": 0.5366, + "step": 16706 + }, + { + "epoch": 4.435151997876012, + "grad_norm": 0.4865322128941707, + "learning_rate": 7.988492482106952e-07, + "loss": 0.5545, + "step": 16707 + }, + { + "epoch": 4.435417496349396, + "grad_norm": 0.47569377851924544, + "learning_rate": 7.985934291171024e-07, + "loss": 0.5352, + "step": 16708 + }, + { + "epoch": 4.43568299482278, + "grad_norm": 0.48054140606946316, + "learning_rate": 7.983376432051046e-07, + "loss": 0.5352, + "step": 16709 + }, + { + "epoch": 4.435948493296164, + "grad_norm": 0.4725020440233165, + "learning_rate": 7.980818904796888e-07, + "loss": 0.5325, + "step": 16710 + }, + { + "epoch": 4.436213991769547, + "grad_norm": 0.48126112781960134, + "learning_rate": 7.978261709458434e-07, + "loss": 0.5626, + "step": 16711 + }, + { + "epoch": 4.436479490242931, + "grad_norm": 0.4946834364495555, + "learning_rate": 7.975704846085544e-07, + "loss": 0.503, + "step": 16712 + }, + { + "epoch": 4.436744988716315, + "grad_norm": 0.4640553607220983, + "learning_rate": 7.973148314728102e-07, + "loss": 0.5366, + "step": 16713 + }, + { + "epoch": 4.437010487189698, + "grad_norm": 0.4640623552025175, + "learning_rate": 7.970592115435946e-07, + "loss": 0.5516, + "step": 16714 + }, + { + "epoch": 4.437275985663082, + "grad_norm": 0.475925820198665, + "learning_rate": 7.968036248258951e-07, + "loss": 0.5401, + "step": 16715 + }, + { + "epoch": 4.437541484136466, + "grad_norm": 0.4816646501219935, + "learning_rate": 7.965480713246948e-07, + "loss": 0.5485, + "step": 16716 + }, + { + "epoch": 4.43780698260985, + "grad_norm": 0.4750407822401424, + "learning_rate": 7.96292551044977e-07, + "loss": 0.5401, + "step": 16717 + }, + { + "epoch": 4.438072481083234, + "grad_norm": 0.4897962754862172, + "learning_rate": 7.960370639917267e-07, + "loss": 0.617, + "step": 16718 + }, + { + "epoch": 4.438337979556618, + "grad_norm": 0.48956763703120704, + "learning_rate": 7.957816101699248e-07, + "loss": 0.5249, + "step": 16719 + }, + { + "epoch": 4.438603478030001, + "grad_norm": 0.4700803075201632, + "learning_rate": 7.955261895845551e-07, + "loss": 0.5505, + "step": 16720 + }, + { + "epoch": 4.438868976503385, + "grad_norm": 0.4703157179682438, + "learning_rate": 7.952708022405972e-07, + "loss": 0.4998, + "step": 16721 + }, + { + "epoch": 4.439134474976769, + "grad_norm": 0.4676187978465773, + "learning_rate": 7.950154481430338e-07, + "loss": 0.5437, + "step": 16722 + }, + { + "epoch": 4.439399973450152, + "grad_norm": 0.476401499372817, + "learning_rate": 7.947601272968425e-07, + "loss": 0.5374, + "step": 16723 + }, + { + "epoch": 4.439665471923536, + "grad_norm": 0.4856438162776569, + "learning_rate": 7.945048397070037e-07, + "loss": 0.5533, + "step": 16724 + }, + { + "epoch": 4.4399309703969205, + "grad_norm": 0.45674573049263667, + "learning_rate": 7.942495853784973e-07, + "loss": 0.5158, + "step": 16725 + }, + { + "epoch": 4.440196468870304, + "grad_norm": 0.4884248879681289, + "learning_rate": 7.939943643162992e-07, + "loss": 0.5281, + "step": 16726 + }, + { + "epoch": 4.440461967343688, + "grad_norm": 0.4846921576477788, + "learning_rate": 7.937391765253891e-07, + "loss": 0.5115, + "step": 16727 + }, + { + "epoch": 4.440727465817072, + "grad_norm": 0.47855556948809413, + "learning_rate": 7.934840220107426e-07, + "loss": 0.5596, + "step": 16728 + }, + { + "epoch": 4.440992964290455, + "grad_norm": 0.4753505047030627, + "learning_rate": 7.932289007773361e-07, + "loss": 0.5071, + "step": 16729 + }, + { + "epoch": 4.441258462763839, + "grad_norm": 0.4663231667013425, + "learning_rate": 7.92973812830144e-07, + "loss": 0.5468, + "step": 16730 + }, + { + "epoch": 4.441523961237223, + "grad_norm": 0.47037980895030174, + "learning_rate": 7.927187581741427e-07, + "loss": 0.5007, + "step": 16731 + }, + { + "epoch": 4.4417894597106065, + "grad_norm": 0.47351158245135905, + "learning_rate": 7.924637368143051e-07, + "loss": 0.5349, + "step": 16732 + }, + { + "epoch": 4.4420549581839905, + "grad_norm": 0.48662427446942946, + "learning_rate": 7.922087487556063e-07, + "loss": 0.5484, + "step": 16733 + }, + { + "epoch": 4.442320456657375, + "grad_norm": 0.48256038950280983, + "learning_rate": 7.919537940030184e-07, + "loss": 0.5159, + "step": 16734 + }, + { + "epoch": 4.442585955130758, + "grad_norm": 0.4785808617281227, + "learning_rate": 7.916988725615124e-07, + "loss": 0.5431, + "step": 16735 + }, + { + "epoch": 4.442851453604142, + "grad_norm": 0.47371839271109195, + "learning_rate": 7.914439844360619e-07, + "loss": 0.527, + "step": 16736 + }, + { + "epoch": 4.443116952077526, + "grad_norm": 0.4765259850200113, + "learning_rate": 7.911891296316363e-07, + "loss": 0.5377, + "step": 16737 + }, + { + "epoch": 4.443382450550909, + "grad_norm": 0.5016182844340449, + "learning_rate": 7.909343081532073e-07, + "loss": 0.5236, + "step": 16738 + }, + { + "epoch": 4.443647949024293, + "grad_norm": 0.4852329858543557, + "learning_rate": 7.906795200057438e-07, + "loss": 0.5366, + "step": 16739 + }, + { + "epoch": 4.4439134474976765, + "grad_norm": 0.48338999111474706, + "learning_rate": 7.904247651942148e-07, + "loss": 0.5606, + "step": 16740 + }, + { + "epoch": 4.444178945971061, + "grad_norm": 0.49035578233200916, + "learning_rate": 7.901700437235879e-07, + "loss": 0.5422, + "step": 16741 + }, + { + "epoch": 4.444444444444445, + "grad_norm": 0.4831257543465122, + "learning_rate": 7.899153555988323e-07, + "loss": 0.544, + "step": 16742 + }, + { + "epoch": 4.444709942917828, + "grad_norm": 0.4836291559174445, + "learning_rate": 7.896607008249135e-07, + "loss": 0.4983, + "step": 16743 + }, + { + "epoch": 4.444975441391212, + "grad_norm": 0.47972689334546736, + "learning_rate": 7.894060794067987e-07, + "loss": 0.5574, + "step": 16744 + }, + { + "epoch": 4.445240939864596, + "grad_norm": 0.46919579797531896, + "learning_rate": 7.891514913494552e-07, + "loss": 0.5123, + "step": 16745 + }, + { + "epoch": 4.445506438337979, + "grad_norm": 0.4958050831216418, + "learning_rate": 7.888969366578447e-07, + "loss": 0.5154, + "step": 16746 + }, + { + "epoch": 4.445771936811363, + "grad_norm": 0.46484540848926803, + "learning_rate": 7.886424153369345e-07, + "loss": 0.5173, + "step": 16747 + }, + { + "epoch": 4.446037435284747, + "grad_norm": 0.49628724274766695, + "learning_rate": 7.883879273916864e-07, + "loss": 0.5447, + "step": 16748 + }, + { + "epoch": 4.446302933758131, + "grad_norm": 0.47494283027982415, + "learning_rate": 7.881334728270654e-07, + "loss": 0.556, + "step": 16749 + }, + { + "epoch": 4.446568432231515, + "grad_norm": 0.47685395495933697, + "learning_rate": 7.87879051648032e-07, + "loss": 0.572, + "step": 16750 + }, + { + "epoch": 4.446833930704899, + "grad_norm": 0.4910985040099849, + "learning_rate": 7.8762466385955e-07, + "loss": 0.5568, + "step": 16751 + }, + { + "epoch": 4.447099429178282, + "grad_norm": 0.48524782649668274, + "learning_rate": 7.873703094665797e-07, + "loss": 0.533, + "step": 16752 + }, + { + "epoch": 4.447364927651666, + "grad_norm": 0.48305599840260044, + "learning_rate": 7.871159884740809e-07, + "loss": 0.5494, + "step": 16753 + }, + { + "epoch": 4.44763042612505, + "grad_norm": 0.48820265328632984, + "learning_rate": 7.868617008870147e-07, + "loss": 0.5337, + "step": 16754 + }, + { + "epoch": 4.447895924598433, + "grad_norm": 0.4697167839572464, + "learning_rate": 7.866074467103393e-07, + "loss": 0.5623, + "step": 16755 + }, + { + "epoch": 4.448161423071817, + "grad_norm": 0.4782113379668213, + "learning_rate": 7.863532259490153e-07, + "loss": 0.5286, + "step": 16756 + }, + { + "epoch": 4.4484269215452015, + "grad_norm": 0.46709937070414953, + "learning_rate": 7.860990386079975e-07, + "loss": 0.5164, + "step": 16757 + }, + { + "epoch": 4.448692420018585, + "grad_norm": 0.48349875174302726, + "learning_rate": 7.858448846922457e-07, + "loss": 0.5542, + "step": 16758 + }, + { + "epoch": 4.448957918491969, + "grad_norm": 0.48189910242283673, + "learning_rate": 7.855907642067146e-07, + "loss": 0.5576, + "step": 16759 + }, + { + "epoch": 4.449223416965353, + "grad_norm": 0.46975698146348355, + "learning_rate": 7.853366771563623e-07, + "loss": 0.5366, + "step": 16760 + }, + { + "epoch": 4.449488915438736, + "grad_norm": 0.4729750889908494, + "learning_rate": 7.85082623546142e-07, + "loss": 0.5426, + "step": 16761 + }, + { + "epoch": 4.44975441391212, + "grad_norm": 0.4812866589759176, + "learning_rate": 7.848286033810101e-07, + "loss": 0.5735, + "step": 16762 + }, + { + "epoch": 4.450019912385503, + "grad_norm": 0.4846423364481977, + "learning_rate": 7.845746166659202e-07, + "loss": 0.5574, + "step": 16763 + }, + { + "epoch": 4.450285410858887, + "grad_norm": 0.4875330358884606, + "learning_rate": 7.843206634058242e-07, + "loss": 0.5522, + "step": 16764 + }, + { + "epoch": 4.4505509093322715, + "grad_norm": 0.47797265466770783, + "learning_rate": 7.84066743605677e-07, + "loss": 0.5076, + "step": 16765 + }, + { + "epoch": 4.450816407805656, + "grad_norm": 0.47567480459066036, + "learning_rate": 7.838128572704287e-07, + "loss": 0.5368, + "step": 16766 + }, + { + "epoch": 4.451081906279039, + "grad_norm": 0.47934673595619226, + "learning_rate": 7.835590044050326e-07, + "loss": 0.5058, + "step": 16767 + }, + { + "epoch": 4.451347404752423, + "grad_norm": 0.4761895208501416, + "learning_rate": 7.833051850144386e-07, + "loss": 0.5351, + "step": 16768 + }, + { + "epoch": 4.451612903225806, + "grad_norm": 0.4750377818317134, + "learning_rate": 7.830513991035957e-07, + "loss": 0.5341, + "step": 16769 + }, + { + "epoch": 4.45187840169919, + "grad_norm": 0.459227752282165, + "learning_rate": 7.82797646677455e-07, + "loss": 0.5131, + "step": 16770 + }, + { + "epoch": 4.452143900172574, + "grad_norm": 0.48329382337217675, + "learning_rate": 7.825439277409638e-07, + "loss": 0.5756, + "step": 16771 + }, + { + "epoch": 4.452409398645957, + "grad_norm": 0.4844951852258455, + "learning_rate": 7.822902422990722e-07, + "loss": 0.5441, + "step": 16772 + }, + { + "epoch": 4.4526748971193415, + "grad_norm": 0.48052774196571807, + "learning_rate": 7.820365903567256e-07, + "loss": 0.5024, + "step": 16773 + }, + { + "epoch": 4.452940395592726, + "grad_norm": 0.4930231968862931, + "learning_rate": 7.817829719188732e-07, + "loss": 0.5396, + "step": 16774 + }, + { + "epoch": 4.453205894066109, + "grad_norm": 0.476994970560152, + "learning_rate": 7.815293869904586e-07, + "loss": 0.5516, + "step": 16775 + }, + { + "epoch": 4.453471392539493, + "grad_norm": 0.46195527064165515, + "learning_rate": 7.812758355764288e-07, + "loss": 0.5298, + "step": 16776 + }, + { + "epoch": 4.453736891012877, + "grad_norm": 0.48098791457085927, + "learning_rate": 7.810223176817278e-07, + "loss": 0.5331, + "step": 16777 + }, + { + "epoch": 4.45400238948626, + "grad_norm": 0.4881889105233967, + "learning_rate": 7.807688333113011e-07, + "loss": 0.5557, + "step": 16778 + }, + { + "epoch": 4.454267887959644, + "grad_norm": 0.4783860724493161, + "learning_rate": 7.805153824700917e-07, + "loss": 0.5772, + "step": 16779 + }, + { + "epoch": 4.454533386433028, + "grad_norm": 0.46467341343593643, + "learning_rate": 7.802619651630416e-07, + "loss": 0.5289, + "step": 16780 + }, + { + "epoch": 4.4547988849064115, + "grad_norm": 0.4729765867223776, + "learning_rate": 7.800085813950944e-07, + "loss": 0.5198, + "step": 16781 + }, + { + "epoch": 4.455064383379796, + "grad_norm": 0.4894763189619003, + "learning_rate": 7.797552311711906e-07, + "loss": 0.5591, + "step": 16782 + }, + { + "epoch": 4.45532988185318, + "grad_norm": 0.4795746775098918, + "learning_rate": 7.795019144962723e-07, + "loss": 0.5494, + "step": 16783 + }, + { + "epoch": 4.455595380326563, + "grad_norm": 0.4783304874710241, + "learning_rate": 7.792486313752784e-07, + "loss": 0.5073, + "step": 16784 + }, + { + "epoch": 4.455860878799947, + "grad_norm": 0.47836348382014443, + "learning_rate": 7.789953818131509e-07, + "loss": 0.5359, + "step": 16785 + }, + { + "epoch": 4.456126377273331, + "grad_norm": 0.4636302201351266, + "learning_rate": 7.787421658148256e-07, + "loss": 0.5106, + "step": 16786 + }, + { + "epoch": 4.456391875746714, + "grad_norm": 0.46698177063766205, + "learning_rate": 7.784889833852433e-07, + "loss": 0.5116, + "step": 16787 + }, + { + "epoch": 4.456657374220098, + "grad_norm": 0.47998549117174644, + "learning_rate": 7.7823583452934e-07, + "loss": 0.512, + "step": 16788 + }, + { + "epoch": 4.456922872693482, + "grad_norm": 0.47774672511202276, + "learning_rate": 7.779827192520545e-07, + "loss": 0.585, + "step": 16789 + }, + { + "epoch": 4.457188371166866, + "grad_norm": 0.487583737099386, + "learning_rate": 7.777296375583209e-07, + "loss": 0.5421, + "step": 16790 + }, + { + "epoch": 4.45745386964025, + "grad_norm": 0.48931293033171347, + "learning_rate": 7.774765894530776e-07, + "loss": 0.5577, + "step": 16791 + }, + { + "epoch": 4.457719368113633, + "grad_norm": 0.476947046330505, + "learning_rate": 7.77223574941258e-07, + "loss": 0.5575, + "step": 16792 + }, + { + "epoch": 4.457984866587017, + "grad_norm": 0.47321848869325484, + "learning_rate": 7.769705940277958e-07, + "loss": 0.5156, + "step": 16793 + }, + { + "epoch": 4.458250365060401, + "grad_norm": 0.4688683850512754, + "learning_rate": 7.767176467176266e-07, + "loss": 0.5339, + "step": 16794 + }, + { + "epoch": 4.458515863533785, + "grad_norm": 0.47543291376002594, + "learning_rate": 7.764647330156818e-07, + "loss": 0.5194, + "step": 16795 + }, + { + "epoch": 4.458781362007168, + "grad_norm": 0.4795077400845625, + "learning_rate": 7.762118529268959e-07, + "loss": 0.5392, + "step": 16796 + }, + { + "epoch": 4.459046860480552, + "grad_norm": 0.47132659691120776, + "learning_rate": 7.759590064561989e-07, + "loss": 0.5416, + "step": 16797 + }, + { + "epoch": 4.459312358953936, + "grad_norm": 0.4700233051810075, + "learning_rate": 7.757061936085217e-07, + "loss": 0.496, + "step": 16798 + }, + { + "epoch": 4.45957785742732, + "grad_norm": 0.45174579096216716, + "learning_rate": 7.754534143887965e-07, + "loss": 0.469, + "step": 16799 + }, + { + "epoch": 4.459843355900704, + "grad_norm": 0.4868248325826904, + "learning_rate": 7.752006688019512e-07, + "loss": 0.512, + "step": 16800 + }, + { + "epoch": 4.460108854374087, + "grad_norm": 0.48859601130345104, + "learning_rate": 7.74947956852917e-07, + "loss": 0.535, + "step": 16801 + }, + { + "epoch": 4.460374352847471, + "grad_norm": 0.4742936560337386, + "learning_rate": 7.746952785466202e-07, + "loss": 0.5205, + "step": 16802 + }, + { + "epoch": 4.460639851320855, + "grad_norm": 0.4721852507099897, + "learning_rate": 7.744426338879912e-07, + "loss": 0.5219, + "step": 16803 + }, + { + "epoch": 4.460905349794238, + "grad_norm": 0.4817985646773516, + "learning_rate": 7.741900228819546e-07, + "loss": 0.5437, + "step": 16804 + }, + { + "epoch": 4.4611708482676224, + "grad_norm": 0.4740503672211233, + "learning_rate": 7.739374455334386e-07, + "loss": 0.5485, + "step": 16805 + }, + { + "epoch": 4.4614363467410065, + "grad_norm": 0.47462231297439617, + "learning_rate": 7.736849018473677e-07, + "loss": 0.5233, + "step": 16806 + }, + { + "epoch": 4.46170184521439, + "grad_norm": 0.4698423276837917, + "learning_rate": 7.734323918286688e-07, + "loss": 0.5409, + "step": 16807 + }, + { + "epoch": 4.461967343687774, + "grad_norm": 0.47837117070306, + "learning_rate": 7.731799154822656e-07, + "loss": 0.5568, + "step": 16808 + }, + { + "epoch": 4.462232842161158, + "grad_norm": 0.4790102385519283, + "learning_rate": 7.729274728130812e-07, + "loss": 0.5444, + "step": 16809 + }, + { + "epoch": 4.462498340634541, + "grad_norm": 0.48203130885808015, + "learning_rate": 7.726750638260408e-07, + "loss": 0.5724, + "step": 16810 + }, + { + "epoch": 4.462763839107925, + "grad_norm": 0.463640351984473, + "learning_rate": 7.724226885260647e-07, + "loss": 0.5309, + "step": 16811 + }, + { + "epoch": 4.463029337581309, + "grad_norm": 0.48554503779120334, + "learning_rate": 7.721703469180769e-07, + "loss": 0.5745, + "step": 16812 + }, + { + "epoch": 4.4632948360546925, + "grad_norm": 0.47484646678052106, + "learning_rate": 7.719180390069972e-07, + "loss": 0.561, + "step": 16813 + }, + { + "epoch": 4.4635603345280765, + "grad_norm": 0.49526564017900865, + "learning_rate": 7.716657647977482e-07, + "loss": 0.5312, + "step": 16814 + }, + { + "epoch": 4.463825833001461, + "grad_norm": 0.4944937048713317, + "learning_rate": 7.71413524295247e-07, + "loss": 0.5724, + "step": 16815 + }, + { + "epoch": 4.464091331474844, + "grad_norm": 0.4830335153746338, + "learning_rate": 7.711613175044141e-07, + "loss": 0.5407, + "step": 16816 + }, + { + "epoch": 4.464356829948228, + "grad_norm": 0.4914262325565297, + "learning_rate": 7.709091444301694e-07, + "loss": 0.5295, + "step": 16817 + }, + { + "epoch": 4.464622328421612, + "grad_norm": 0.49815447029919036, + "learning_rate": 7.706570050774293e-07, + "loss": 0.5585, + "step": 16818 + }, + { + "epoch": 4.464887826894995, + "grad_norm": 0.481570914138703, + "learning_rate": 7.704048994511123e-07, + "loss": 0.5237, + "step": 16819 + }, + { + "epoch": 4.465153325368379, + "grad_norm": 0.4772179254194095, + "learning_rate": 7.701528275561349e-07, + "loss": 0.5626, + "step": 16820 + }, + { + "epoch": 4.4654188238417625, + "grad_norm": 0.4745463224914997, + "learning_rate": 7.699007893974125e-07, + "loss": 0.5313, + "step": 16821 + }, + { + "epoch": 4.465684322315147, + "grad_norm": 0.48371416579121224, + "learning_rate": 7.696487849798598e-07, + "loss": 0.5491, + "step": 16822 + }, + { + "epoch": 4.465949820788531, + "grad_norm": 0.4783245824257991, + "learning_rate": 7.693968143083936e-07, + "loss": 0.5469, + "step": 16823 + }, + { + "epoch": 4.466215319261914, + "grad_norm": 0.47661642220234274, + "learning_rate": 7.691448773879257e-07, + "loss": 0.5239, + "step": 16824 + }, + { + "epoch": 4.466480817735298, + "grad_norm": 0.47825627645453545, + "learning_rate": 7.688929742233714e-07, + "loss": 0.537, + "step": 16825 + }, + { + "epoch": 4.466746316208682, + "grad_norm": 0.48049150338784724, + "learning_rate": 7.686411048196426e-07, + "loss": 0.5156, + "step": 16826 + }, + { + "epoch": 4.467011814682065, + "grad_norm": 0.4853666868482445, + "learning_rate": 7.683892691816505e-07, + "loss": 0.5582, + "step": 16827 + }, + { + "epoch": 4.467277313155449, + "grad_norm": 0.46326720083328293, + "learning_rate": 7.681374673143083e-07, + "loss": 0.4836, + "step": 16828 + }, + { + "epoch": 4.467542811628833, + "grad_norm": 0.4708498491803565, + "learning_rate": 7.678856992225248e-07, + "loss": 0.5533, + "step": 16829 + }, + { + "epoch": 4.467808310102217, + "grad_norm": 0.4763351351571721, + "learning_rate": 7.676339649112121e-07, + "loss": 0.5732, + "step": 16830 + }, + { + "epoch": 4.468073808575601, + "grad_norm": 0.4827704324084567, + "learning_rate": 7.673822643852785e-07, + "loss": 0.5531, + "step": 16831 + }, + { + "epoch": 4.468339307048985, + "grad_norm": 0.4811741930674635, + "learning_rate": 7.671305976496329e-07, + "loss": 0.5488, + "step": 16832 + }, + { + "epoch": 4.468604805522368, + "grad_norm": 0.4974384865027843, + "learning_rate": 7.668789647091828e-07, + "loss": 0.5393, + "step": 16833 + }, + { + "epoch": 4.468870303995752, + "grad_norm": 0.4801160540271777, + "learning_rate": 7.666273655688372e-07, + "loss": 0.571, + "step": 16834 + }, + { + "epoch": 4.469135802469136, + "grad_norm": 0.47421844996759044, + "learning_rate": 7.663758002335009e-07, + "loss": 0.5245, + "step": 16835 + }, + { + "epoch": 4.469401300942519, + "grad_norm": 0.4746300183106361, + "learning_rate": 7.661242687080813e-07, + "loss": 0.5273, + "step": 16836 + }, + { + "epoch": 4.469666799415903, + "grad_norm": 0.47818752052738495, + "learning_rate": 7.658727709974853e-07, + "loss": 0.555, + "step": 16837 + }, + { + "epoch": 4.4699322978892875, + "grad_norm": 0.47902922010848037, + "learning_rate": 7.656213071066146e-07, + "loss": 0.5239, + "step": 16838 + }, + { + "epoch": 4.470197796362671, + "grad_norm": 0.4710875706464958, + "learning_rate": 7.653698770403755e-07, + "loss": 0.5467, + "step": 16839 + }, + { + "epoch": 4.470463294836055, + "grad_norm": 0.49564338962342525, + "learning_rate": 7.651184808036705e-07, + "loss": 0.5475, + "step": 16840 + }, + { + "epoch": 4.470728793309439, + "grad_norm": 0.47452879767399997, + "learning_rate": 7.648671184014033e-07, + "loss": 0.5133, + "step": 16841 + }, + { + "epoch": 4.470994291782822, + "grad_norm": 0.484640531195572, + "learning_rate": 7.646157898384751e-07, + "loss": 0.5247, + "step": 16842 + }, + { + "epoch": 4.471259790256206, + "grad_norm": 0.48931215975181286, + "learning_rate": 7.643644951197885e-07, + "loss": 0.557, + "step": 16843 + }, + { + "epoch": 4.47152528872959, + "grad_norm": 0.48300915134314404, + "learning_rate": 7.641132342502439e-07, + "loss": 0.5065, + "step": 16844 + }, + { + "epoch": 4.471790787202973, + "grad_norm": 0.46721172452743115, + "learning_rate": 7.638620072347405e-07, + "loss": 0.5473, + "step": 16845 + }, + { + "epoch": 4.4720562856763575, + "grad_norm": 0.48094695221407097, + "learning_rate": 7.636108140781798e-07, + "loss": 0.5526, + "step": 16846 + }, + { + "epoch": 4.472321784149742, + "grad_norm": 0.4794989833368633, + "learning_rate": 7.633596547854588e-07, + "loss": 0.5486, + "step": 16847 + }, + { + "epoch": 4.472587282623125, + "grad_norm": 0.48316894742047806, + "learning_rate": 7.631085293614771e-07, + "loss": 0.5339, + "step": 16848 + }, + { + "epoch": 4.472852781096509, + "grad_norm": 0.49101837193917114, + "learning_rate": 7.62857437811132e-07, + "loss": 0.55, + "step": 16849 + }, + { + "epoch": 4.473118279569892, + "grad_norm": 0.4785061978352384, + "learning_rate": 7.6260638013932e-07, + "loss": 0.4978, + "step": 16850 + }, + { + "epoch": 4.473383778043276, + "grad_norm": 0.4683036301392794, + "learning_rate": 7.623553563509367e-07, + "loss": 0.5402, + "step": 16851 + }, + { + "epoch": 4.47364927651666, + "grad_norm": 0.47321788405586923, + "learning_rate": 7.621043664508793e-07, + "loss": 0.5127, + "step": 16852 + }, + { + "epoch": 4.473914774990043, + "grad_norm": 0.467525947748451, + "learning_rate": 7.618534104440409e-07, + "loss": 0.5448, + "step": 16853 + }, + { + "epoch": 4.4741802734634275, + "grad_norm": 0.46238664931981793, + "learning_rate": 7.616024883353176e-07, + "loss": 0.5053, + "step": 16854 + }, + { + "epoch": 4.474445771936812, + "grad_norm": 0.4821784597825707, + "learning_rate": 7.61351600129602e-07, + "loss": 0.5339, + "step": 16855 + }, + { + "epoch": 4.474711270410195, + "grad_norm": 0.49618910802445, + "learning_rate": 7.611007458317865e-07, + "loss": 0.5516, + "step": 16856 + }, + { + "epoch": 4.474976768883579, + "grad_norm": 0.4871873461800691, + "learning_rate": 7.608499254467647e-07, + "loss": 0.5262, + "step": 16857 + }, + { + "epoch": 4.475242267356963, + "grad_norm": 0.48274045317548486, + "learning_rate": 7.605991389794268e-07, + "loss": 0.5557, + "step": 16858 + }, + { + "epoch": 4.475507765830346, + "grad_norm": 0.4767275049196292, + "learning_rate": 7.603483864346653e-07, + "loss": 0.5166, + "step": 16859 + }, + { + "epoch": 4.47577326430373, + "grad_norm": 0.4755331375435532, + "learning_rate": 7.600976678173697e-07, + "loss": 0.5488, + "step": 16860 + }, + { + "epoch": 4.476038762777114, + "grad_norm": 0.4719080735202083, + "learning_rate": 7.598469831324284e-07, + "loss": 0.5176, + "step": 16861 + }, + { + "epoch": 4.4763042612504975, + "grad_norm": 0.46911830415695305, + "learning_rate": 7.595963323847328e-07, + "loss": 0.5193, + "step": 16862 + }, + { + "epoch": 4.476569759723882, + "grad_norm": 0.4933431117284641, + "learning_rate": 7.593457155791692e-07, + "loss": 0.5242, + "step": 16863 + }, + { + "epoch": 4.476835258197266, + "grad_norm": 0.48850423620668215, + "learning_rate": 7.590951327206264e-07, + "loss": 0.5049, + "step": 16864 + }, + { + "epoch": 4.477100756670649, + "grad_norm": 0.475543322350099, + "learning_rate": 7.588445838139905e-07, + "loss": 0.5362, + "step": 16865 + }, + { + "epoch": 4.477366255144033, + "grad_norm": 0.482123911051005, + "learning_rate": 7.585940688641499e-07, + "loss": 0.5541, + "step": 16866 + }, + { + "epoch": 4.477631753617417, + "grad_norm": 0.4771534579196199, + "learning_rate": 7.58343587875987e-07, + "loss": 0.525, + "step": 16867 + }, + { + "epoch": 4.4778972520908, + "grad_norm": 0.477193930247143, + "learning_rate": 7.58093140854389e-07, + "loss": 0.5495, + "step": 16868 + }, + { + "epoch": 4.478162750564184, + "grad_norm": 0.48754187436692237, + "learning_rate": 7.578427278042391e-07, + "loss": 0.5234, + "step": 16869 + }, + { + "epoch": 4.478428249037568, + "grad_norm": 0.4765260618184064, + "learning_rate": 7.575923487304224e-07, + "loss": 0.5305, + "step": 16870 + }, + { + "epoch": 4.478693747510952, + "grad_norm": 0.47742572936281885, + "learning_rate": 7.573420036378204e-07, + "loss": 0.528, + "step": 16871 + }, + { + "epoch": 4.478959245984336, + "grad_norm": 0.4835757187076809, + "learning_rate": 7.570916925313165e-07, + "loss": 0.532, + "step": 16872 + }, + { + "epoch": 4.479224744457719, + "grad_norm": 0.47941219447669864, + "learning_rate": 7.568414154157922e-07, + "loss": 0.5516, + "step": 16873 + }, + { + "epoch": 4.479490242931103, + "grad_norm": 0.48778509592229713, + "learning_rate": 7.565911722961275e-07, + "loss": 0.5141, + "step": 16874 + }, + { + "epoch": 4.479755741404487, + "grad_norm": 0.47460255721038447, + "learning_rate": 7.563409631772043e-07, + "loss": 0.5233, + "step": 16875 + }, + { + "epoch": 4.480021239877871, + "grad_norm": 0.48913858413621, + "learning_rate": 7.560907880639007e-07, + "loss": 0.5188, + "step": 16876 + }, + { + "epoch": 4.480286738351254, + "grad_norm": 0.49581404032252385, + "learning_rate": 7.558406469610982e-07, + "loss": 0.5298, + "step": 16877 + }, + { + "epoch": 4.480552236824638, + "grad_norm": 0.47283289376814264, + "learning_rate": 7.555905398736718e-07, + "loss": 0.5023, + "step": 16878 + }, + { + "epoch": 4.480817735298022, + "grad_norm": 0.4904549769963252, + "learning_rate": 7.553404668065017e-07, + "loss": 0.5682, + "step": 16879 + }, + { + "epoch": 4.481083233771406, + "grad_norm": 0.48689180341885197, + "learning_rate": 7.550904277644635e-07, + "loss": 0.5339, + "step": 16880 + }, + { + "epoch": 4.48134873224479, + "grad_norm": 0.4835409913859052, + "learning_rate": 7.548404227524342e-07, + "loss": 0.5399, + "step": 16881 + }, + { + "epoch": 4.481614230718173, + "grad_norm": 0.4938640234370828, + "learning_rate": 7.545904517752906e-07, + "loss": 0.5721, + "step": 16882 + }, + { + "epoch": 4.481879729191557, + "grad_norm": 0.4849340218789454, + "learning_rate": 7.543405148379063e-07, + "loss": 0.5473, + "step": 16883 + }, + { + "epoch": 4.482145227664941, + "grad_norm": 0.48938650005707524, + "learning_rate": 7.540906119451564e-07, + "loss": 0.544, + "step": 16884 + }, + { + "epoch": 4.482410726138324, + "grad_norm": 0.4666671839274467, + "learning_rate": 7.538407431019132e-07, + "loss": 0.5366, + "step": 16885 + }, + { + "epoch": 4.4826762246117084, + "grad_norm": 0.47559293955184323, + "learning_rate": 7.535909083130515e-07, + "loss": 0.5472, + "step": 16886 + }, + { + "epoch": 4.4829417230850925, + "grad_norm": 0.5006126537964072, + "learning_rate": 7.533411075834426e-07, + "loss": 0.5593, + "step": 16887 + }, + { + "epoch": 4.483207221558476, + "grad_norm": 0.48276985867423183, + "learning_rate": 7.530913409179594e-07, + "loss": 0.536, + "step": 16888 + }, + { + "epoch": 4.48347272003186, + "grad_norm": 0.5027431727573257, + "learning_rate": 7.52841608321472e-07, + "loss": 0.5491, + "step": 16889 + }, + { + "epoch": 4.483738218505244, + "grad_norm": 0.48647038534828857, + "learning_rate": 7.525919097988504e-07, + "loss": 0.5572, + "step": 16890 + }, + { + "epoch": 4.484003716978627, + "grad_norm": 0.4755863022857314, + "learning_rate": 7.523422453549659e-07, + "loss": 0.5291, + "step": 16891 + }, + { + "epoch": 4.484269215452011, + "grad_norm": 0.47451792876456994, + "learning_rate": 7.520926149946856e-07, + "loss": 0.5298, + "step": 16892 + }, + { + "epoch": 4.484534713925395, + "grad_norm": 0.4817184723231692, + "learning_rate": 7.518430187228798e-07, + "loss": 0.5457, + "step": 16893 + }, + { + "epoch": 4.4848002123987785, + "grad_norm": 0.5005842622885278, + "learning_rate": 7.515934565444147e-07, + "loss": 0.5484, + "step": 16894 + }, + { + "epoch": 4.4850657108721625, + "grad_norm": 0.47156579819148675, + "learning_rate": 7.513439284641593e-07, + "loss": 0.5462, + "step": 16895 + }, + { + "epoch": 4.485331209345547, + "grad_norm": 0.4816356064933116, + "learning_rate": 7.510944344869775e-07, + "loss": 0.5437, + "step": 16896 + }, + { + "epoch": 4.48559670781893, + "grad_norm": 0.4697736734255272, + "learning_rate": 7.508449746177368e-07, + "loss": 0.5451, + "step": 16897 + }, + { + "epoch": 4.485862206292314, + "grad_norm": 0.48860648227893994, + "learning_rate": 7.505955488613009e-07, + "loss": 0.5647, + "step": 16898 + }, + { + "epoch": 4.486127704765698, + "grad_norm": 0.48804592987491996, + "learning_rate": 7.503461572225363e-07, + "loss": 0.515, + "step": 16899 + }, + { + "epoch": 4.486393203239081, + "grad_norm": 0.4837839544995788, + "learning_rate": 7.500967997063053e-07, + "loss": 0.5405, + "step": 16900 + }, + { + "epoch": 4.486658701712465, + "grad_norm": 0.46677871479160515, + "learning_rate": 7.498474763174701e-07, + "loss": 0.5161, + "step": 16901 + }, + { + "epoch": 4.4869242001858485, + "grad_norm": 0.48240247205356773, + "learning_rate": 7.495981870608954e-07, + "loss": 0.512, + "step": 16902 + }, + { + "epoch": 4.487189698659233, + "grad_norm": 0.4790539763797526, + "learning_rate": 7.49348931941441e-07, + "loss": 0.5577, + "step": 16903 + }, + { + "epoch": 4.487455197132617, + "grad_norm": 0.4792779104803753, + "learning_rate": 7.490997109639694e-07, + "loss": 0.4978, + "step": 16904 + }, + { + "epoch": 4.487720695606001, + "grad_norm": 0.483206801909153, + "learning_rate": 7.488505241333396e-07, + "loss": 0.5409, + "step": 16905 + }, + { + "epoch": 4.487986194079384, + "grad_norm": 0.4801602877983675, + "learning_rate": 7.48601371454413e-07, + "loss": 0.515, + "step": 16906 + }, + { + "epoch": 4.488251692552768, + "grad_norm": 0.4676238656834283, + "learning_rate": 7.483522529320478e-07, + "loss": 0.5146, + "step": 16907 + }, + { + "epoch": 4.488517191026151, + "grad_norm": 0.4592553050851243, + "learning_rate": 7.481031685711015e-07, + "loss": 0.503, + "step": 16908 + }, + { + "epoch": 4.488782689499535, + "grad_norm": 0.47310982065033996, + "learning_rate": 7.478541183764337e-07, + "loss": 0.5453, + "step": 16909 + }, + { + "epoch": 4.489048187972919, + "grad_norm": 0.4791007215648069, + "learning_rate": 7.476051023528999e-07, + "loss": 0.5303, + "step": 16910 + }, + { + "epoch": 4.489313686446303, + "grad_norm": 0.48350580083832817, + "learning_rate": 7.473561205053579e-07, + "loss": 0.5429, + "step": 16911 + }, + { + "epoch": 4.489579184919687, + "grad_norm": 0.48356158255585346, + "learning_rate": 7.471071728386628e-07, + "loss": 0.5611, + "step": 16912 + }, + { + "epoch": 4.489844683393071, + "grad_norm": 0.48434905581566023, + "learning_rate": 7.468582593576695e-07, + "loss": 0.5205, + "step": 16913 + }, + { + "epoch": 4.490110181866454, + "grad_norm": 0.4863816130482888, + "learning_rate": 7.46609380067232e-07, + "loss": 0.5643, + "step": 16914 + }, + { + "epoch": 4.490375680339838, + "grad_norm": 0.49023488038667695, + "learning_rate": 7.463605349722053e-07, + "loss": 0.5129, + "step": 16915 + }, + { + "epoch": 4.490641178813222, + "grad_norm": 0.5013896789594667, + "learning_rate": 7.46111724077441e-07, + "loss": 0.5697, + "step": 16916 + }, + { + "epoch": 4.490906677286605, + "grad_norm": 0.47410695460108165, + "learning_rate": 7.458629473877932e-07, + "loss": 0.5164, + "step": 16917 + }, + { + "epoch": 4.491172175759989, + "grad_norm": 0.46937514587447743, + "learning_rate": 7.456142049081128e-07, + "loss": 0.5098, + "step": 16918 + }, + { + "epoch": 4.4914376742333735, + "grad_norm": 0.48969793829860647, + "learning_rate": 7.453654966432499e-07, + "loss": 0.5548, + "step": 16919 + }, + { + "epoch": 4.491703172706757, + "grad_norm": 0.4824671567736626, + "learning_rate": 7.451168225980568e-07, + "loss": 0.5427, + "step": 16920 + }, + { + "epoch": 4.491968671180141, + "grad_norm": 0.4797311162807997, + "learning_rate": 7.448681827773818e-07, + "loss": 0.5186, + "step": 16921 + }, + { + "epoch": 4.492234169653525, + "grad_norm": 0.48393732244421994, + "learning_rate": 7.44619577186075e-07, + "loss": 0.5331, + "step": 16922 + }, + { + "epoch": 4.492499668126908, + "grad_norm": 0.5381283689486273, + "learning_rate": 7.443710058289838e-07, + "loss": 0.5393, + "step": 16923 + }, + { + "epoch": 4.492765166600292, + "grad_norm": 0.49030020167986993, + "learning_rate": 7.44122468710958e-07, + "loss": 0.5396, + "step": 16924 + }, + { + "epoch": 4.493030665073676, + "grad_norm": 0.4930901571664935, + "learning_rate": 7.438739658368415e-07, + "loss": 0.5472, + "step": 16925 + }, + { + "epoch": 4.493296163547059, + "grad_norm": 0.4826504580637794, + "learning_rate": 7.436254972114834e-07, + "loss": 0.5316, + "step": 16926 + }, + { + "epoch": 4.4935616620204435, + "grad_norm": 0.4717327993254758, + "learning_rate": 7.433770628397275e-07, + "loss": 0.5626, + "step": 16927 + }, + { + "epoch": 4.493827160493828, + "grad_norm": 0.49770538511141765, + "learning_rate": 7.4312866272642e-07, + "loss": 0.5296, + "step": 16928 + }, + { + "epoch": 4.494092658967211, + "grad_norm": 0.4963797481207869, + "learning_rate": 7.428802968764065e-07, + "loss": 0.5442, + "step": 16929 + }, + { + "epoch": 4.494358157440595, + "grad_norm": 0.4953809504470916, + "learning_rate": 7.42631965294528e-07, + "loss": 0.5343, + "step": 16930 + }, + { + "epoch": 4.494623655913978, + "grad_norm": 0.47500050298483265, + "learning_rate": 7.423836679856297e-07, + "loss": 0.5487, + "step": 16931 + }, + { + "epoch": 4.494889154387362, + "grad_norm": 0.4770482723518948, + "learning_rate": 7.421354049545523e-07, + "loss": 0.5268, + "step": 16932 + }, + { + "epoch": 4.495154652860746, + "grad_norm": 0.4908095563182328, + "learning_rate": 7.418871762061395e-07, + "loss": 0.5634, + "step": 16933 + }, + { + "epoch": 4.495420151334129, + "grad_norm": 0.4614574327736395, + "learning_rate": 7.416389817452305e-07, + "loss": 0.5092, + "step": 16934 + }, + { + "epoch": 4.4956856498075135, + "grad_norm": 0.47582695627563304, + "learning_rate": 7.413908215766674e-07, + "loss": 0.5297, + "step": 16935 + }, + { + "epoch": 4.495951148280898, + "grad_norm": 0.4783684967687841, + "learning_rate": 7.411426957052889e-07, + "loss": 0.5128, + "step": 16936 + }, + { + "epoch": 4.496216646754281, + "grad_norm": 0.49209127875958075, + "learning_rate": 7.408946041359335e-07, + "loss": 0.5599, + "step": 16937 + }, + { + "epoch": 4.496482145227665, + "grad_norm": 0.4655614755894159, + "learning_rate": 7.406465468734414e-07, + "loss": 0.5116, + "step": 16938 + }, + { + "epoch": 4.496747643701049, + "grad_norm": 0.4805901012572312, + "learning_rate": 7.403985239226483e-07, + "loss": 0.5537, + "step": 16939 + }, + { + "epoch": 4.497013142174432, + "grad_norm": 0.46882041325265494, + "learning_rate": 7.401505352883931e-07, + "loss": 0.542, + "step": 16940 + }, + { + "epoch": 4.497278640647816, + "grad_norm": 0.4761215044096941, + "learning_rate": 7.399025809755115e-07, + "loss": 0.5193, + "step": 16941 + }, + { + "epoch": 4.4975441391212, + "grad_norm": 0.4700648182285545, + "learning_rate": 7.396546609888386e-07, + "loss": 0.5467, + "step": 16942 + }, + { + "epoch": 4.4978096375945835, + "grad_norm": 0.49771163063530444, + "learning_rate": 7.394067753332093e-07, + "loss": 0.5503, + "step": 16943 + }, + { + "epoch": 4.498075136067968, + "grad_norm": 0.47450447993969835, + "learning_rate": 7.391589240134595e-07, + "loss": 0.524, + "step": 16944 + }, + { + "epoch": 4.498340634541352, + "grad_norm": 0.4813077247300755, + "learning_rate": 7.38911107034421e-07, + "loss": 0.5, + "step": 16945 + }, + { + "epoch": 4.498606133014735, + "grad_norm": 0.48201271872923684, + "learning_rate": 7.386633244009286e-07, + "loss": 0.5687, + "step": 16946 + }, + { + "epoch": 4.498871631488119, + "grad_norm": 0.49633406856176554, + "learning_rate": 7.384155761178139e-07, + "loss": 0.5351, + "step": 16947 + }, + { + "epoch": 4.499137129961503, + "grad_norm": 0.47938631256457137, + "learning_rate": 7.381678621899077e-07, + "loss": 0.5347, + "step": 16948 + }, + { + "epoch": 4.499402628434886, + "grad_norm": 0.4896837414055922, + "learning_rate": 7.379201826220428e-07, + "loss": 0.5548, + "step": 16949 + }, + { + "epoch": 4.49966812690827, + "grad_norm": 0.487473603836185, + "learning_rate": 7.376725374190477e-07, + "loss": 0.5342, + "step": 16950 + }, + { + "epoch": 4.499933625381654, + "grad_norm": 0.4811244634122249, + "learning_rate": 7.374249265857539e-07, + "loss": 0.5362, + "step": 16951 + }, + { + "epoch": 4.500199123855038, + "grad_norm": 0.4753147625143773, + "learning_rate": 7.371773501269896e-07, + "loss": 0.4855, + "step": 16952 + }, + { + "epoch": 4.500464622328422, + "grad_norm": 0.4926499510261048, + "learning_rate": 7.369298080475823e-07, + "loss": 0.528, + "step": 16953 + }, + { + "epoch": 4.500730120801805, + "grad_norm": 0.4680023665475965, + "learning_rate": 7.366823003523613e-07, + "loss": 0.528, + "step": 16954 + }, + { + "epoch": 4.500995619275189, + "grad_norm": 0.4735317292590741, + "learning_rate": 7.364348270461521e-07, + "loss": 0.5233, + "step": 16955 + }, + { + "epoch": 4.501261117748573, + "grad_norm": 0.489973539259352, + "learning_rate": 7.361873881337825e-07, + "loss": 0.5659, + "step": 16956 + }, + { + "epoch": 4.501526616221957, + "grad_norm": 0.4924248897348761, + "learning_rate": 7.359399836200765e-07, + "loss": 0.5374, + "step": 16957 + }, + { + "epoch": 4.50179211469534, + "grad_norm": 0.49282710857938555, + "learning_rate": 7.356926135098613e-07, + "loss": 0.5487, + "step": 16958 + }, + { + "epoch": 4.502057613168724, + "grad_norm": 0.49443093333366245, + "learning_rate": 7.354452778079588e-07, + "loss": 0.5323, + "step": 16959 + }, + { + "epoch": 4.502323111642108, + "grad_norm": 0.489493308546048, + "learning_rate": 7.351979765191944e-07, + "loss": 0.5599, + "step": 16960 + }, + { + "epoch": 4.502588610115492, + "grad_norm": 0.4951842396036879, + "learning_rate": 7.349507096483896e-07, + "loss": 0.5773, + "step": 16961 + }, + { + "epoch": 4.502854108588876, + "grad_norm": 0.46474732981387473, + "learning_rate": 7.347034772003683e-07, + "loss": 0.5555, + "step": 16962 + }, + { + "epoch": 4.50311960706226, + "grad_norm": 0.4797496398541281, + "learning_rate": 7.344562791799506e-07, + "loss": 0.4795, + "step": 16963 + }, + { + "epoch": 4.503385105535643, + "grad_norm": 0.4602019662658021, + "learning_rate": 7.34209115591959e-07, + "loss": 0.4744, + "step": 16964 + }, + { + "epoch": 4.503650604009027, + "grad_norm": 0.48097318012081397, + "learning_rate": 7.339619864412132e-07, + "loss": 0.5273, + "step": 16965 + }, + { + "epoch": 4.50391610248241, + "grad_norm": 0.4829890226546275, + "learning_rate": 7.337148917325318e-07, + "loss": 0.5171, + "step": 16966 + }, + { + "epoch": 4.5041816009557945, + "grad_norm": 0.4758960958572048, + "learning_rate": 7.334678314707352e-07, + "loss": 0.522, + "step": 16967 + }, + { + "epoch": 4.5044470994291785, + "grad_norm": 0.48494206463868483, + "learning_rate": 7.332208056606407e-07, + "loss": 0.5436, + "step": 16968 + }, + { + "epoch": 4.504712597902562, + "grad_norm": 0.46903458364130224, + "learning_rate": 7.329738143070669e-07, + "loss": 0.5391, + "step": 16969 + }, + { + "epoch": 4.504978096375946, + "grad_norm": 0.4727224555518883, + "learning_rate": 7.327268574148303e-07, + "loss": 0.5441, + "step": 16970 + }, + { + "epoch": 4.50524359484933, + "grad_norm": 0.4681686313130701, + "learning_rate": 7.324799349887468e-07, + "loss": 0.5221, + "step": 16971 + }, + { + "epoch": 4.505509093322713, + "grad_norm": 0.461541023757282, + "learning_rate": 7.322330470336314e-07, + "loss": 0.5193, + "step": 16972 + }, + { + "epoch": 4.505774591796097, + "grad_norm": 0.4709099460401255, + "learning_rate": 7.319861935542999e-07, + "loss": 0.5255, + "step": 16973 + }, + { + "epoch": 4.506040090269481, + "grad_norm": 0.48224645745410866, + "learning_rate": 7.317393745555673e-07, + "loss": 0.5185, + "step": 16974 + }, + { + "epoch": 4.5063055887428645, + "grad_norm": 0.46845469518806615, + "learning_rate": 7.314925900422462e-07, + "loss": 0.5175, + "step": 16975 + }, + { + "epoch": 4.5065710872162486, + "grad_norm": 0.48838061835994057, + "learning_rate": 7.3124584001915e-07, + "loss": 0.5107, + "step": 16976 + }, + { + "epoch": 4.506836585689633, + "grad_norm": 0.48566651021651097, + "learning_rate": 7.309991244910894e-07, + "loss": 0.5332, + "step": 16977 + }, + { + "epoch": 4.507102084163016, + "grad_norm": 0.4756539894724518, + "learning_rate": 7.307524434628782e-07, + "loss": 0.5055, + "step": 16978 + }, + { + "epoch": 4.5073675826364, + "grad_norm": 0.49749365224412573, + "learning_rate": 7.305057969393253e-07, + "loss": 0.5689, + "step": 16979 + }, + { + "epoch": 4.507633081109784, + "grad_norm": 0.4995769357074368, + "learning_rate": 7.302591849252424e-07, + "loss": 0.559, + "step": 16980 + }, + { + "epoch": 4.507898579583167, + "grad_norm": 0.4917647366377767, + "learning_rate": 7.300126074254388e-07, + "loss": 0.5296, + "step": 16981 + }, + { + "epoch": 4.508164078056551, + "grad_norm": 0.47123873377533154, + "learning_rate": 7.297660644447222e-07, + "loss": 0.5372, + "step": 16982 + }, + { + "epoch": 4.5084295765299345, + "grad_norm": 0.4895773710019, + "learning_rate": 7.295195559879023e-07, + "loss": 0.5179, + "step": 16983 + }, + { + "epoch": 4.508695075003319, + "grad_norm": 0.48024350512628794, + "learning_rate": 7.292730820597851e-07, + "loss": 0.54, + "step": 16984 + }, + { + "epoch": 4.508960573476703, + "grad_norm": 0.4904819133172295, + "learning_rate": 7.290266426651793e-07, + "loss": 0.5753, + "step": 16985 + }, + { + "epoch": 4.509226071950087, + "grad_norm": 0.4815809186989495, + "learning_rate": 7.28780237808889e-07, + "loss": 0.5578, + "step": 16986 + }, + { + "epoch": 4.50949157042347, + "grad_norm": 0.49296871762187355, + "learning_rate": 7.285338674957227e-07, + "loss": 0.5094, + "step": 16987 + }, + { + "epoch": 4.509757068896854, + "grad_norm": 0.479858091462161, + "learning_rate": 7.282875317304814e-07, + "loss": 0.5331, + "step": 16988 + }, + { + "epoch": 4.510022567370237, + "grad_norm": 0.47403651661129304, + "learning_rate": 7.280412305179721e-07, + "loss": 0.5272, + "step": 16989 + }, + { + "epoch": 4.510288065843621, + "grad_norm": 0.4794431122574388, + "learning_rate": 7.277949638629963e-07, + "loss": 0.5348, + "step": 16990 + }, + { + "epoch": 4.510553564317005, + "grad_norm": 0.480067006900869, + "learning_rate": 7.275487317703587e-07, + "loss": 0.5241, + "step": 16991 + }, + { + "epoch": 4.510819062790389, + "grad_norm": 0.4782274469247924, + "learning_rate": 7.273025342448597e-07, + "loss": 0.531, + "step": 16992 + }, + { + "epoch": 4.511084561263773, + "grad_norm": 0.4826277832994021, + "learning_rate": 7.270563712913026e-07, + "loss": 0.5217, + "step": 16993 + }, + { + "epoch": 4.511350059737157, + "grad_norm": 0.47191955247493816, + "learning_rate": 7.268102429144869e-07, + "loss": 0.5001, + "step": 16994 + }, + { + "epoch": 4.51161555821054, + "grad_norm": 0.4883096277948197, + "learning_rate": 7.265641491192124e-07, + "loss": 0.5446, + "step": 16995 + }, + { + "epoch": 4.511881056683924, + "grad_norm": 0.49203822214061277, + "learning_rate": 7.263180899102798e-07, + "loss": 0.5356, + "step": 16996 + }, + { + "epoch": 4.512146555157308, + "grad_norm": 0.47464331347661587, + "learning_rate": 7.260720652924863e-07, + "loss": 0.5382, + "step": 16997 + }, + { + "epoch": 4.512412053630691, + "grad_norm": 0.4838217958530632, + "learning_rate": 7.258260752706319e-07, + "loss": 0.5745, + "step": 16998 + }, + { + "epoch": 4.512677552104075, + "grad_norm": 0.4959804291619121, + "learning_rate": 7.255801198495127e-07, + "loss": 0.5249, + "step": 16999 + }, + { + "epoch": 4.5129430505774595, + "grad_norm": 0.4773349809650804, + "learning_rate": 7.253341990339252e-07, + "loss": 0.5132, + "step": 17000 + }, + { + "epoch": 4.513208549050843, + "grad_norm": 0.49146660998110225, + "learning_rate": 7.250883128286665e-07, + "loss": 0.5209, + "step": 17001 + }, + { + "epoch": 4.513474047524227, + "grad_norm": 0.4884867978572633, + "learning_rate": 7.248424612385305e-07, + "loss": 0.5415, + "step": 17002 + }, + { + "epoch": 4.513739545997611, + "grad_norm": 0.4915365845990471, + "learning_rate": 7.245966442683139e-07, + "loss": 0.5302, + "step": 17003 + }, + { + "epoch": 4.514005044470994, + "grad_norm": 0.48480243727730016, + "learning_rate": 7.243508619228095e-07, + "loss": 0.5269, + "step": 17004 + }, + { + "epoch": 4.514270542944378, + "grad_norm": 0.4810746972375595, + "learning_rate": 7.241051142068109e-07, + "loss": 0.5291, + "step": 17005 + }, + { + "epoch": 4.514536041417762, + "grad_norm": 0.4878136577160576, + "learning_rate": 7.238594011251101e-07, + "loss": 0.5066, + "step": 17006 + }, + { + "epoch": 4.514801539891145, + "grad_norm": 0.4767093077299178, + "learning_rate": 7.236137226825001e-07, + "loss": 0.5697, + "step": 17007 + }, + { + "epoch": 4.5150670383645295, + "grad_norm": 0.47739927779179636, + "learning_rate": 7.233680788837716e-07, + "loss": 0.5302, + "step": 17008 + }, + { + "epoch": 4.515332536837914, + "grad_norm": 0.485306985148506, + "learning_rate": 7.231224697337161e-07, + "loss": 0.559, + "step": 17009 + }, + { + "epoch": 4.515598035311297, + "grad_norm": 0.49644838930471236, + "learning_rate": 7.228768952371226e-07, + "loss": 0.5697, + "step": 17010 + }, + { + "epoch": 4.515863533784681, + "grad_norm": 0.4815247460210243, + "learning_rate": 7.226313553987805e-07, + "loss": 0.5488, + "step": 17011 + }, + { + "epoch": 4.516129032258064, + "grad_norm": 0.47162127896877176, + "learning_rate": 7.223858502234792e-07, + "loss": 0.4941, + "step": 17012 + }, + { + "epoch": 4.516394530731448, + "grad_norm": 0.46595440916367403, + "learning_rate": 7.221403797160056e-07, + "loss": 0.4967, + "step": 17013 + }, + { + "epoch": 4.516660029204832, + "grad_norm": 0.48137482716616214, + "learning_rate": 7.218949438811482e-07, + "loss": 0.5563, + "step": 17014 + }, + { + "epoch": 4.516925527678216, + "grad_norm": 0.4711263560496944, + "learning_rate": 7.216495427236922e-07, + "loss": 0.519, + "step": 17015 + }, + { + "epoch": 4.5171910261515995, + "grad_norm": 0.49741487686336977, + "learning_rate": 7.214041762484259e-07, + "loss": 0.5356, + "step": 17016 + }, + { + "epoch": 4.517456524624984, + "grad_norm": 0.4713400802878359, + "learning_rate": 7.211588444601311e-07, + "loss": 0.5363, + "step": 17017 + }, + { + "epoch": 4.517722023098367, + "grad_norm": 0.4728650057350629, + "learning_rate": 7.209135473635945e-07, + "loss": 0.4862, + "step": 17018 + }, + { + "epoch": 4.517987521571751, + "grad_norm": 0.48674330140410627, + "learning_rate": 7.206682849636001e-07, + "loss": 0.5257, + "step": 17019 + }, + { + "epoch": 4.518253020045135, + "grad_norm": 0.4776414610147334, + "learning_rate": 7.204230572649304e-07, + "loss": 0.5427, + "step": 17020 + }, + { + "epoch": 4.518518518518518, + "grad_norm": 0.48051128268866045, + "learning_rate": 7.201778642723694e-07, + "loss": 0.5468, + "step": 17021 + }, + { + "epoch": 4.518784016991902, + "grad_norm": 0.4813400816146693, + "learning_rate": 7.199327059906963e-07, + "loss": 0.5598, + "step": 17022 + }, + { + "epoch": 4.519049515465286, + "grad_norm": 0.48261664746660016, + "learning_rate": 7.196875824246946e-07, + "loss": 0.5372, + "step": 17023 + }, + { + "epoch": 4.5193150139386695, + "grad_norm": 0.47203144796706337, + "learning_rate": 7.194424935791434e-07, + "loss": 0.5166, + "step": 17024 + }, + { + "epoch": 4.519580512412054, + "grad_norm": 0.4971673053623054, + "learning_rate": 7.191974394588238e-07, + "loss": 0.5361, + "step": 17025 + }, + { + "epoch": 4.519846010885438, + "grad_norm": 0.47271344208986416, + "learning_rate": 7.189524200685135e-07, + "loss": 0.514, + "step": 17026 + }, + { + "epoch": 4.520111509358821, + "grad_norm": 0.4901810751028744, + "learning_rate": 7.187074354129925e-07, + "loss": 0.5562, + "step": 17027 + }, + { + "epoch": 4.520377007832205, + "grad_norm": 0.4668910811460749, + "learning_rate": 7.184624854970379e-07, + "loss": 0.5202, + "step": 17028 + }, + { + "epoch": 4.520642506305589, + "grad_norm": 0.4803585779873794, + "learning_rate": 7.18217570325426e-07, + "loss": 0.5273, + "step": 17029 + }, + { + "epoch": 4.520908004778972, + "grad_norm": 0.4665387721606301, + "learning_rate": 7.179726899029349e-07, + "loss": 0.5292, + "step": 17030 + }, + { + "epoch": 4.521173503252356, + "grad_norm": 0.48034290046347355, + "learning_rate": 7.177278442343386e-07, + "loss": 0.547, + "step": 17031 + }, + { + "epoch": 4.52143900172574, + "grad_norm": 0.4872394285217787, + "learning_rate": 7.17483033324414e-07, + "loss": 0.5685, + "step": 17032 + }, + { + "epoch": 4.521704500199124, + "grad_norm": 0.49518086525454763, + "learning_rate": 7.172382571779344e-07, + "loss": 0.5485, + "step": 17033 + }, + { + "epoch": 4.521969998672508, + "grad_norm": 0.4903048099403242, + "learning_rate": 7.169935157996738e-07, + "loss": 0.5282, + "step": 17034 + }, + { + "epoch": 4.522235497145892, + "grad_norm": 0.4977164194726281, + "learning_rate": 7.167488091944044e-07, + "loss": 0.5286, + "step": 17035 + }, + { + "epoch": 4.522500995619275, + "grad_norm": 0.4860800785834062, + "learning_rate": 7.165041373669005e-07, + "loss": 0.5449, + "step": 17036 + }, + { + "epoch": 4.522766494092659, + "grad_norm": 0.48135710568921836, + "learning_rate": 7.162595003219314e-07, + "loss": 0.5034, + "step": 17037 + }, + { + "epoch": 4.523031992566043, + "grad_norm": 0.48533795912033045, + "learning_rate": 7.160148980642706e-07, + "loss": 0.542, + "step": 17038 + }, + { + "epoch": 4.523297491039426, + "grad_norm": 0.4777820063167383, + "learning_rate": 7.157703305986868e-07, + "loss": 0.5359, + "step": 17039 + }, + { + "epoch": 4.52356298951281, + "grad_norm": 0.492223486819021, + "learning_rate": 7.155257979299496e-07, + "loss": 0.5068, + "step": 17040 + }, + { + "epoch": 4.523828487986194, + "grad_norm": 0.4876367816458037, + "learning_rate": 7.152813000628292e-07, + "loss": 0.5352, + "step": 17041 + }, + { + "epoch": 4.524093986459578, + "grad_norm": 0.46748608766400057, + "learning_rate": 7.150368370020924e-07, + "loss": 0.5236, + "step": 17042 + }, + { + "epoch": 4.524359484932962, + "grad_norm": 0.4858141671373916, + "learning_rate": 7.147924087525085e-07, + "loss": 0.5531, + "step": 17043 + }, + { + "epoch": 4.524624983406346, + "grad_norm": 0.4766927055743187, + "learning_rate": 7.145480153188428e-07, + "loss": 0.514, + "step": 17044 + }, + { + "epoch": 4.524890481879729, + "grad_norm": 0.480040758558741, + "learning_rate": 7.143036567058631e-07, + "loss": 0.5225, + "step": 17045 + }, + { + "epoch": 4.525155980353113, + "grad_norm": 0.4814218686804199, + "learning_rate": 7.14059332918334e-07, + "loss": 0.5629, + "step": 17046 + }, + { + "epoch": 4.525421478826496, + "grad_norm": 0.4749218711190256, + "learning_rate": 7.138150439610201e-07, + "loss": 0.5262, + "step": 17047 + }, + { + "epoch": 4.5256869772998805, + "grad_norm": 0.48041544151665233, + "learning_rate": 7.135707898386871e-07, + "loss": 0.5318, + "step": 17048 + }, + { + "epoch": 4.5259524757732645, + "grad_norm": 0.47582984384942173, + "learning_rate": 7.133265705560966e-07, + "loss": 0.5567, + "step": 17049 + }, + { + "epoch": 4.526217974246648, + "grad_norm": 0.47671511484036394, + "learning_rate": 7.130823861180139e-07, + "loss": 0.5374, + "step": 17050 + }, + { + "epoch": 4.526483472720032, + "grad_norm": 0.4609036756337673, + "learning_rate": 7.128382365291986e-07, + "loss": 0.5277, + "step": 17051 + }, + { + "epoch": 4.526748971193416, + "grad_norm": 0.4839139772650828, + "learning_rate": 7.12594121794414e-07, + "loss": 0.5269, + "step": 17052 + }, + { + "epoch": 4.527014469666799, + "grad_norm": 0.4833401243595286, + "learning_rate": 7.123500419184196e-07, + "loss": 0.5344, + "step": 17053 + }, + { + "epoch": 4.527279968140183, + "grad_norm": 0.482129850937298, + "learning_rate": 7.121059969059771e-07, + "loss": 0.5562, + "step": 17054 + }, + { + "epoch": 4.527545466613567, + "grad_norm": 0.47768283695240804, + "learning_rate": 7.118619867618445e-07, + "loss": 0.4703, + "step": 17055 + }, + { + "epoch": 4.5278109650869505, + "grad_norm": 0.4880094429733977, + "learning_rate": 7.11618011490782e-07, + "loss": 0.5612, + "step": 17056 + }, + { + "epoch": 4.5280764635603346, + "grad_norm": 0.48673600528819844, + "learning_rate": 7.113740710975467e-07, + "loss": 0.5453, + "step": 17057 + }, + { + "epoch": 4.528341962033719, + "grad_norm": 0.4924967704691144, + "learning_rate": 7.11130165586896e-07, + "loss": 0.5359, + "step": 17058 + }, + { + "epoch": 4.528607460507102, + "grad_norm": 0.47139587850695874, + "learning_rate": 7.108862949635875e-07, + "loss": 0.5141, + "step": 17059 + }, + { + "epoch": 4.528872958980486, + "grad_norm": 0.4915602756778128, + "learning_rate": 7.106424592323761e-07, + "loss": 0.5446, + "step": 17060 + }, + { + "epoch": 4.52913845745387, + "grad_norm": 0.4658447919188393, + "learning_rate": 7.103986583980186e-07, + "loss": 0.4881, + "step": 17061 + }, + { + "epoch": 4.529403955927253, + "grad_norm": 0.48436827616519074, + "learning_rate": 7.101548924652688e-07, + "loss": 0.5206, + "step": 17062 + }, + { + "epoch": 4.529669454400637, + "grad_norm": 0.47446860303135163, + "learning_rate": 7.099111614388809e-07, + "loss": 0.5444, + "step": 17063 + }, + { + "epoch": 4.5299349528740205, + "grad_norm": 0.47647728961568964, + "learning_rate": 7.096674653236077e-07, + "loss": 0.5382, + "step": 17064 + }, + { + "epoch": 4.530200451347405, + "grad_norm": 0.47482278007781303, + "learning_rate": 7.094238041242021e-07, + "loss": 0.5428, + "step": 17065 + }, + { + "epoch": 4.530465949820789, + "grad_norm": 0.479764788278246, + "learning_rate": 7.091801778454175e-07, + "loss": 0.4763, + "step": 17066 + }, + { + "epoch": 4.530731448294173, + "grad_norm": 0.47913954189425495, + "learning_rate": 7.089365864920031e-07, + "loss": 0.5285, + "step": 17067 + }, + { + "epoch": 4.530996946767556, + "grad_norm": 0.48287101917509945, + "learning_rate": 7.086930300687123e-07, + "loss": 0.5479, + "step": 17068 + }, + { + "epoch": 4.53126244524094, + "grad_norm": 0.4791178092018074, + "learning_rate": 7.084495085802917e-07, + "loss": 0.5309, + "step": 17069 + }, + { + "epoch": 4.531527943714323, + "grad_norm": 0.4712089037534123, + "learning_rate": 7.082060220314929e-07, + "loss": 0.5465, + "step": 17070 + }, + { + "epoch": 4.531793442187707, + "grad_norm": 0.4861988046991871, + "learning_rate": 7.079625704270629e-07, + "loss": 0.5649, + "step": 17071 + }, + { + "epoch": 4.532058940661091, + "grad_norm": 0.48825746295085554, + "learning_rate": 7.077191537717512e-07, + "loss": 0.5585, + "step": 17072 + }, + { + "epoch": 4.5323244391344755, + "grad_norm": 0.47734541682794585, + "learning_rate": 7.074757720703044e-07, + "loss": 0.5352, + "step": 17073 + }, + { + "epoch": 4.532589937607859, + "grad_norm": 0.48806459323741086, + "learning_rate": 7.072324253274679e-07, + "loss": 0.5256, + "step": 17074 + }, + { + "epoch": 4.532855436081243, + "grad_norm": 0.49030320511559183, + "learning_rate": 7.069891135479895e-07, + "loss": 0.553, + "step": 17075 + }, + { + "epoch": 4.533120934554626, + "grad_norm": 0.4836595261590822, + "learning_rate": 7.067458367366125e-07, + "loss": 0.5393, + "step": 17076 + }, + { + "epoch": 4.53338643302801, + "grad_norm": 0.4902939498173901, + "learning_rate": 7.065025948980833e-07, + "loss": 0.5689, + "step": 17077 + }, + { + "epoch": 4.533651931501394, + "grad_norm": 0.5036341596647592, + "learning_rate": 7.062593880371441e-07, + "loss": 0.5557, + "step": 17078 + }, + { + "epoch": 4.533917429974777, + "grad_norm": 0.47971191567358085, + "learning_rate": 7.060162161585398e-07, + "loss": 0.5065, + "step": 17079 + }, + { + "epoch": 4.534182928448161, + "grad_norm": 0.49103041316689, + "learning_rate": 7.057730792670106e-07, + "loss": 0.5719, + "step": 17080 + }, + { + "epoch": 4.5344484269215455, + "grad_norm": 0.48625113524707025, + "learning_rate": 7.055299773672999e-07, + "loss": 0.5408, + "step": 17081 + }, + { + "epoch": 4.534713925394929, + "grad_norm": 0.46214254661361515, + "learning_rate": 7.052869104641475e-07, + "loss": 0.5159, + "step": 17082 + }, + { + "epoch": 4.534979423868313, + "grad_norm": 0.4862982622162461, + "learning_rate": 7.050438785622954e-07, + "loss": 0.539, + "step": 17083 + }, + { + "epoch": 4.535244922341697, + "grad_norm": 0.47196227991581247, + "learning_rate": 7.048008816664817e-07, + "loss": 0.5266, + "step": 17084 + }, + { + "epoch": 4.53551042081508, + "grad_norm": 0.4783948465096514, + "learning_rate": 7.045579197814467e-07, + "loss": 0.5951, + "step": 17085 + }, + { + "epoch": 4.535775919288464, + "grad_norm": 0.4845641896880734, + "learning_rate": 7.043149929119286e-07, + "loss": 0.4968, + "step": 17086 + }, + { + "epoch": 4.536041417761848, + "grad_norm": 0.48431994506276194, + "learning_rate": 7.040721010626639e-07, + "loss": 0.5326, + "step": 17087 + }, + { + "epoch": 4.536306916235231, + "grad_norm": 0.49026023733211976, + "learning_rate": 7.038292442383912e-07, + "loss": 0.5431, + "step": 17088 + }, + { + "epoch": 4.5365724147086155, + "grad_norm": 0.4765346801577704, + "learning_rate": 7.035864224438452e-07, + "loss": 0.54, + "step": 17089 + }, + { + "epoch": 4.536837913182, + "grad_norm": 0.4779254366833416, + "learning_rate": 7.033436356837631e-07, + "loss": 0.5125, + "step": 17090 + }, + { + "epoch": 4.537103411655383, + "grad_norm": 0.48547530478247264, + "learning_rate": 7.031008839628792e-07, + "loss": 0.5396, + "step": 17091 + }, + { + "epoch": 4.537368910128767, + "grad_norm": 0.4954785495080932, + "learning_rate": 7.028581672859266e-07, + "loss": 0.5416, + "step": 17092 + }, + { + "epoch": 4.53763440860215, + "grad_norm": 0.47137601000978313, + "learning_rate": 7.026154856576408e-07, + "loss": 0.5232, + "step": 17093 + }, + { + "epoch": 4.537899907075534, + "grad_norm": 0.4948690525051935, + "learning_rate": 7.023728390827531e-07, + "loss": 0.5751, + "step": 17094 + }, + { + "epoch": 4.538165405548918, + "grad_norm": 0.486729437993439, + "learning_rate": 7.021302275659972e-07, + "loss": 0.5492, + "step": 17095 + }, + { + "epoch": 4.538430904022302, + "grad_norm": 0.4717125563238074, + "learning_rate": 7.018876511121036e-07, + "loss": 0.5011, + "step": 17096 + }, + { + "epoch": 4.5386964024956855, + "grad_norm": 0.482055994808619, + "learning_rate": 7.016451097258034e-07, + "loss": 0.5629, + "step": 17097 + }, + { + "epoch": 4.53896190096907, + "grad_norm": 0.4711325570521054, + "learning_rate": 7.01402603411826e-07, + "loss": 0.5102, + "step": 17098 + }, + { + "epoch": 4.539227399442453, + "grad_norm": 0.4665266406850192, + "learning_rate": 7.011601321749023e-07, + "loss": 0.5211, + "step": 17099 + }, + { + "epoch": 4.539492897915837, + "grad_norm": 0.47348041357416615, + "learning_rate": 7.009176960197598e-07, + "loss": 0.5372, + "step": 17100 + }, + { + "epoch": 4.539758396389221, + "grad_norm": 0.48600806257825885, + "learning_rate": 7.006752949511276e-07, + "loss": 0.5094, + "step": 17101 + }, + { + "epoch": 4.540023894862604, + "grad_norm": 0.4787955309567304, + "learning_rate": 7.004329289737327e-07, + "loss": 0.5479, + "step": 17102 + }, + { + "epoch": 4.540289393335988, + "grad_norm": 0.48966247462132756, + "learning_rate": 7.00190598092301e-07, + "loss": 0.5503, + "step": 17103 + }, + { + "epoch": 4.540554891809372, + "grad_norm": 0.4630849057938057, + "learning_rate": 6.999483023115603e-07, + "loss": 0.5174, + "step": 17104 + }, + { + "epoch": 4.5408203902827555, + "grad_norm": 0.47101241465481675, + "learning_rate": 6.997060416362339e-07, + "loss": 0.5232, + "step": 17105 + }, + { + "epoch": 4.54108588875614, + "grad_norm": 0.4783609951345164, + "learning_rate": 6.994638160710488e-07, + "loss": 0.5467, + "step": 17106 + }, + { + "epoch": 4.541351387229524, + "grad_norm": 0.49704181747810544, + "learning_rate": 6.992216256207265e-07, + "loss": 0.5433, + "step": 17107 + }, + { + "epoch": 4.541616885702907, + "grad_norm": 0.4855917038479662, + "learning_rate": 6.989794702899932e-07, + "loss": 0.5552, + "step": 17108 + }, + { + "epoch": 4.541882384176291, + "grad_norm": 0.47130637007583176, + "learning_rate": 6.987373500835687e-07, + "loss": 0.5371, + "step": 17109 + }, + { + "epoch": 4.542147882649675, + "grad_norm": 0.4814510693213958, + "learning_rate": 6.984952650061758e-07, + "loss": 0.5333, + "step": 17110 + }, + { + "epoch": 4.542413381123058, + "grad_norm": 0.4806272355497954, + "learning_rate": 6.98253215062537e-07, + "loss": 0.5126, + "step": 17111 + }, + { + "epoch": 4.542678879596442, + "grad_norm": 0.48588513037576586, + "learning_rate": 6.980112002573711e-07, + "loss": 0.5507, + "step": 17112 + }, + { + "epoch": 4.542944378069826, + "grad_norm": 0.4791730005141744, + "learning_rate": 6.977692205953995e-07, + "loss": 0.5295, + "step": 17113 + }, + { + "epoch": 4.54320987654321, + "grad_norm": 0.5072940205877899, + "learning_rate": 6.975272760813409e-07, + "loss": 0.5609, + "step": 17114 + }, + { + "epoch": 4.543475375016594, + "grad_norm": 0.48647322768105367, + "learning_rate": 6.972853667199137e-07, + "loss": 0.5542, + "step": 17115 + }, + { + "epoch": 4.543740873489978, + "grad_norm": 0.4934932090540298, + "learning_rate": 6.970434925158346e-07, + "loss": 0.5385, + "step": 17116 + }, + { + "epoch": 4.544006371963361, + "grad_norm": 0.4908918802037143, + "learning_rate": 6.968016534738225e-07, + "loss": 0.5405, + "step": 17117 + }, + { + "epoch": 4.544271870436745, + "grad_norm": 0.4806260677079348, + "learning_rate": 6.965598495985929e-07, + "loss": 0.5518, + "step": 17118 + }, + { + "epoch": 4.544537368910129, + "grad_norm": 0.4652201748869212, + "learning_rate": 6.963180808948622e-07, + "loss": 0.5197, + "step": 17119 + }, + { + "epoch": 4.544802867383512, + "grad_norm": 0.485869148445642, + "learning_rate": 6.960763473673451e-07, + "loss": 0.5676, + "step": 17120 + }, + { + "epoch": 4.545068365856896, + "grad_norm": 0.4866266573608137, + "learning_rate": 6.958346490207551e-07, + "loss": 0.5612, + "step": 17121 + }, + { + "epoch": 4.54533386433028, + "grad_norm": 0.4756666381204711, + "learning_rate": 6.955929858598079e-07, + "loss": 0.533, + "step": 17122 + }, + { + "epoch": 4.545599362803664, + "grad_norm": 0.48016387869433635, + "learning_rate": 6.953513578892146e-07, + "loss": 0.5311, + "step": 17123 + }, + { + "epoch": 4.545864861277048, + "grad_norm": 0.4687052844037668, + "learning_rate": 6.95109765113689e-07, + "loss": 0.5315, + "step": 17124 + }, + { + "epoch": 4.546130359750432, + "grad_norm": 0.5048588431508544, + "learning_rate": 6.94868207537942e-07, + "loss": 0.5447, + "step": 17125 + }, + { + "epoch": 4.546395858223815, + "grad_norm": 0.4958975491939647, + "learning_rate": 6.94626685166685e-07, + "loss": 0.5262, + "step": 17126 + }, + { + "epoch": 4.546661356697199, + "grad_norm": 0.4879210798133318, + "learning_rate": 6.943851980046271e-07, + "loss": 0.5432, + "step": 17127 + }, + { + "epoch": 4.546926855170582, + "grad_norm": 0.4885303984486785, + "learning_rate": 6.941437460564792e-07, + "loss": 0.5509, + "step": 17128 + }, + { + "epoch": 4.5471923536439665, + "grad_norm": 0.483111641145223, + "learning_rate": 6.939023293269492e-07, + "loss": 0.5314, + "step": 17129 + }, + { + "epoch": 4.5474578521173505, + "grad_norm": 0.49041160091805946, + "learning_rate": 6.936609478207459e-07, + "loss": 0.5061, + "step": 17130 + }, + { + "epoch": 4.547723350590734, + "grad_norm": 0.46582950451098615, + "learning_rate": 6.934196015425784e-07, + "loss": 0.5455, + "step": 17131 + }, + { + "epoch": 4.547988849064118, + "grad_norm": 0.4791485881221155, + "learning_rate": 6.931782904971504e-07, + "loss": 0.5526, + "step": 17132 + }, + { + "epoch": 4.548254347537502, + "grad_norm": 0.4882908665359969, + "learning_rate": 6.929370146891703e-07, + "loss": 0.5401, + "step": 17133 + }, + { + "epoch": 4.548519846010885, + "grad_norm": 0.4786581692068963, + "learning_rate": 6.926957741233419e-07, + "loss": 0.5199, + "step": 17134 + }, + { + "epoch": 4.548785344484269, + "grad_norm": 0.4903894802705, + "learning_rate": 6.924545688043721e-07, + "loss": 0.5448, + "step": 17135 + }, + { + "epoch": 4.549050842957653, + "grad_norm": 0.4859267182188451, + "learning_rate": 6.922133987369628e-07, + "loss": 0.5675, + "step": 17136 + }, + { + "epoch": 4.5493163414310365, + "grad_norm": 0.45543675142027945, + "learning_rate": 6.919722639258195e-07, + "loss": 0.5333, + "step": 17137 + }, + { + "epoch": 4.5495818399044206, + "grad_norm": 0.4870302225979478, + "learning_rate": 6.917311643756436e-07, + "loss": 0.5299, + "step": 17138 + }, + { + "epoch": 4.549847338377805, + "grad_norm": 0.48684877157998746, + "learning_rate": 6.914901000911367e-07, + "loss": 0.5197, + "step": 17139 + }, + { + "epoch": 4.550112836851188, + "grad_norm": 0.4757733777122875, + "learning_rate": 6.912490710770015e-07, + "loss": 0.5224, + "step": 17140 + }, + { + "epoch": 4.550378335324572, + "grad_norm": 0.4875327884697538, + "learning_rate": 6.910080773379374e-07, + "loss": 0.5312, + "step": 17141 + }, + { + "epoch": 4.550643833797956, + "grad_norm": 0.4790581017913816, + "learning_rate": 6.907671188786463e-07, + "loss": 0.5534, + "step": 17142 + }, + { + "epoch": 4.550909332271339, + "grad_norm": 0.48079907762830476, + "learning_rate": 6.90526195703825e-07, + "loss": 0.5699, + "step": 17143 + }, + { + "epoch": 4.551174830744723, + "grad_norm": 0.46956144029583297, + "learning_rate": 6.902853078181737e-07, + "loss": 0.4964, + "step": 17144 + }, + { + "epoch": 4.551440329218107, + "grad_norm": 0.48387744386522435, + "learning_rate": 6.900444552263891e-07, + "loss": 0.5134, + "step": 17145 + }, + { + "epoch": 4.551705827691491, + "grad_norm": 0.47149620046592317, + "learning_rate": 6.898036379331699e-07, + "loss": 0.5504, + "step": 17146 + }, + { + "epoch": 4.551971326164875, + "grad_norm": 0.4929477458935296, + "learning_rate": 6.895628559432113e-07, + "loss": 0.5555, + "step": 17147 + }, + { + "epoch": 4.552236824638259, + "grad_norm": 0.4899617239122328, + "learning_rate": 6.893221092612104e-07, + "loss": 0.5259, + "step": 17148 + }, + { + "epoch": 4.552502323111642, + "grad_norm": 0.46162926133999677, + "learning_rate": 6.890813978918617e-07, + "loss": 0.5035, + "step": 17149 + }, + { + "epoch": 4.552767821585026, + "grad_norm": 0.485307788365755, + "learning_rate": 6.888407218398588e-07, + "loss": 0.5616, + "step": 17150 + }, + { + "epoch": 4.553033320058409, + "grad_norm": 0.4773210174683122, + "learning_rate": 6.88600081109897e-07, + "loss": 0.532, + "step": 17151 + }, + { + "epoch": 4.553298818531793, + "grad_norm": 0.48455310720350026, + "learning_rate": 6.88359475706668e-07, + "loss": 0.5554, + "step": 17152 + }, + { + "epoch": 4.553564317005177, + "grad_norm": 0.48360112787341364, + "learning_rate": 6.881189056348658e-07, + "loss": 0.5647, + "step": 17153 + }, + { + "epoch": 4.5538298154785615, + "grad_norm": 0.4778923441340996, + "learning_rate": 6.87878370899181e-07, + "loss": 0.5382, + "step": 17154 + }, + { + "epoch": 4.554095313951945, + "grad_norm": 0.4792811632668715, + "learning_rate": 6.876378715043042e-07, + "loss": 0.5378, + "step": 17155 + }, + { + "epoch": 4.554360812425329, + "grad_norm": 0.4806240874642617, + "learning_rate": 6.87397407454927e-07, + "loss": 0.5803, + "step": 17156 + }, + { + "epoch": 4.554626310898712, + "grad_norm": 0.4776272868560844, + "learning_rate": 6.871569787557375e-07, + "loss": 0.5322, + "step": 17157 + }, + { + "epoch": 4.554891809372096, + "grad_norm": 0.49056719442931385, + "learning_rate": 6.869165854114262e-07, + "loss": 0.5451, + "step": 17158 + }, + { + "epoch": 4.55515730784548, + "grad_norm": 0.46707632864129345, + "learning_rate": 6.866762274266802e-07, + "loss": 0.5107, + "step": 17159 + }, + { + "epoch": 4.555422806318863, + "grad_norm": 0.4951714741395873, + "learning_rate": 6.864359048061891e-07, + "loss": 0.5536, + "step": 17160 + }, + { + "epoch": 4.555688304792247, + "grad_norm": 0.49489527732218297, + "learning_rate": 6.861956175546366e-07, + "loss": 0.5205, + "step": 17161 + }, + { + "epoch": 4.5559538032656315, + "grad_norm": 0.4920662916684385, + "learning_rate": 6.859553656767112e-07, + "loss": 0.5466, + "step": 17162 + }, + { + "epoch": 4.556219301739015, + "grad_norm": 0.4882950732383972, + "learning_rate": 6.85715149177097e-07, + "loss": 0.5187, + "step": 17163 + }, + { + "epoch": 4.556484800212399, + "grad_norm": 0.4772643547634087, + "learning_rate": 6.854749680604802e-07, + "loss": 0.5604, + "step": 17164 + }, + { + "epoch": 4.556750298685783, + "grad_norm": 0.47708757447293043, + "learning_rate": 6.852348223315435e-07, + "loss": 0.5099, + "step": 17165 + }, + { + "epoch": 4.557015797159166, + "grad_norm": 0.5020379710488587, + "learning_rate": 6.849947119949721e-07, + "loss": 0.5669, + "step": 17166 + }, + { + "epoch": 4.55728129563255, + "grad_norm": 0.489642298055821, + "learning_rate": 6.847546370554475e-07, + "loss": 0.5308, + "step": 17167 + }, + { + "epoch": 4.557546794105934, + "grad_norm": 0.475728110611278, + "learning_rate": 6.845145975176512e-07, + "loss": 0.5442, + "step": 17168 + }, + { + "epoch": 4.557812292579317, + "grad_norm": 0.47753854951884805, + "learning_rate": 6.842745933862663e-07, + "loss": 0.5527, + "step": 17169 + }, + { + "epoch": 4.5580777910527015, + "grad_norm": 0.49509551128834955, + "learning_rate": 6.840346246659716e-07, + "loss": 0.5646, + "step": 17170 + }, + { + "epoch": 4.558343289526086, + "grad_norm": 0.46540404747565056, + "learning_rate": 6.837946913614493e-07, + "loss": 0.4778, + "step": 17171 + }, + { + "epoch": 4.558608787999469, + "grad_norm": 0.49200152924063617, + "learning_rate": 6.835547934773759e-07, + "loss": 0.5237, + "step": 17172 + }, + { + "epoch": 4.558874286472853, + "grad_norm": 0.48048909654545585, + "learning_rate": 6.833149310184322e-07, + "loss": 0.5028, + "step": 17173 + }, + { + "epoch": 4.559139784946236, + "grad_norm": 0.48079049309936467, + "learning_rate": 6.83075103989295e-07, + "loss": 0.5282, + "step": 17174 + }, + { + "epoch": 4.55940528341962, + "grad_norm": 0.4980994638297322, + "learning_rate": 6.828353123946421e-07, + "loss": 0.5544, + "step": 17175 + }, + { + "epoch": 4.559670781893004, + "grad_norm": 0.48429804693509954, + "learning_rate": 6.825955562391493e-07, + "loss": 0.5268, + "step": 17176 + }, + { + "epoch": 4.559936280366388, + "grad_norm": 0.4834066983432881, + "learning_rate": 6.823558355274937e-07, + "loss": 0.5279, + "step": 17177 + }, + { + "epoch": 4.5602017788397715, + "grad_norm": 0.49291989688726173, + "learning_rate": 6.821161502643495e-07, + "loss": 0.5168, + "step": 17178 + }, + { + "epoch": 4.560467277313156, + "grad_norm": 0.4770239503189813, + "learning_rate": 6.818765004543906e-07, + "loss": 0.5342, + "step": 17179 + }, + { + "epoch": 4.560732775786539, + "grad_norm": 0.47066236248491955, + "learning_rate": 6.816368861022923e-07, + "loss": 0.5558, + "step": 17180 + }, + { + "epoch": 4.560998274259923, + "grad_norm": 0.469710280253976, + "learning_rate": 6.813973072127261e-07, + "loss": 0.5305, + "step": 17181 + }, + { + "epoch": 4.561263772733307, + "grad_norm": 0.4759236660418632, + "learning_rate": 6.81157763790366e-07, + "loss": 0.5329, + "step": 17182 + }, + { + "epoch": 4.561529271206691, + "grad_norm": 0.4821425532973299, + "learning_rate": 6.809182558398828e-07, + "loss": 0.5425, + "step": 17183 + }, + { + "epoch": 4.561794769680074, + "grad_norm": 0.476824950258736, + "learning_rate": 6.806787833659464e-07, + "loss": 0.5053, + "step": 17184 + }, + { + "epoch": 4.562060268153458, + "grad_norm": 0.47586108360500484, + "learning_rate": 6.804393463732293e-07, + "loss": 0.5445, + "step": 17185 + }, + { + "epoch": 4.5623257666268415, + "grad_norm": 0.4878188895390884, + "learning_rate": 6.801999448663991e-07, + "loss": 0.5805, + "step": 17186 + }, + { + "epoch": 4.562591265100226, + "grad_norm": 0.46892668516340896, + "learning_rate": 6.799605788501262e-07, + "loss": 0.5261, + "step": 17187 + }, + { + "epoch": 4.56285676357361, + "grad_norm": 0.4784872825847286, + "learning_rate": 6.797212483290777e-07, + "loss": 0.5451, + "step": 17188 + }, + { + "epoch": 4.563122262046993, + "grad_norm": 0.4940402384855071, + "learning_rate": 6.79481953307923e-07, + "loss": 0.5622, + "step": 17189 + }, + { + "epoch": 4.563387760520377, + "grad_norm": 0.4982410041945445, + "learning_rate": 6.792426937913263e-07, + "loss": 0.5183, + "step": 17190 + }, + { + "epoch": 4.563653258993761, + "grad_norm": 0.48748470306447933, + "learning_rate": 6.790034697839556e-07, + "loss": 0.5485, + "step": 17191 + }, + { + "epoch": 4.563918757467144, + "grad_norm": 0.48581950585643846, + "learning_rate": 6.787642812904752e-07, + "loss": 0.5714, + "step": 17192 + }, + { + "epoch": 4.564184255940528, + "grad_norm": 0.4974989678766032, + "learning_rate": 6.785251283155511e-07, + "loss": 0.5569, + "step": 17193 + }, + { + "epoch": 4.564449754413912, + "grad_norm": 0.4854672056003726, + "learning_rate": 6.78286010863847e-07, + "loss": 0.5555, + "step": 17194 + }, + { + "epoch": 4.564715252887296, + "grad_norm": 0.4798503326164381, + "learning_rate": 6.780469289400249e-07, + "loss": 0.5337, + "step": 17195 + }, + { + "epoch": 4.56498075136068, + "grad_norm": 0.48818149924633414, + "learning_rate": 6.778078825487497e-07, + "loss": 0.5463, + "step": 17196 + }, + { + "epoch": 4.565246249834064, + "grad_norm": 0.48280400619828245, + "learning_rate": 6.775688716946813e-07, + "loss": 0.5456, + "step": 17197 + }, + { + "epoch": 4.565511748307447, + "grad_norm": 0.476226632930575, + "learning_rate": 6.773298963824829e-07, + "loss": 0.5359, + "step": 17198 + }, + { + "epoch": 4.565777246780831, + "grad_norm": 0.4911757587209145, + "learning_rate": 6.770909566168133e-07, + "loss": 0.515, + "step": 17199 + }, + { + "epoch": 4.566042745254215, + "grad_norm": 0.48265399089289346, + "learning_rate": 6.768520524023348e-07, + "loss": 0.5246, + "step": 17200 + }, + { + "epoch": 4.566308243727598, + "grad_norm": 0.47824632329777716, + "learning_rate": 6.766131837437037e-07, + "loss": 0.5128, + "step": 17201 + }, + { + "epoch": 4.5665737422009824, + "grad_norm": 0.48136169839817466, + "learning_rate": 6.763743506455797e-07, + "loss": 0.5002, + "step": 17202 + }, + { + "epoch": 4.566839240674366, + "grad_norm": 0.47642750138075096, + "learning_rate": 6.761355531126221e-07, + "loss": 0.5354, + "step": 17203 + }, + { + "epoch": 4.56710473914775, + "grad_norm": 0.4792102793522238, + "learning_rate": 6.758967911494857e-07, + "loss": 0.5131, + "step": 17204 + }, + { + "epoch": 4.567370237621134, + "grad_norm": 0.4696769252991467, + "learning_rate": 6.756580647608288e-07, + "loss": 0.5304, + "step": 17205 + }, + { + "epoch": 4.567635736094518, + "grad_norm": 0.4695128355675209, + "learning_rate": 6.754193739513065e-07, + "loss": 0.5215, + "step": 17206 + }, + { + "epoch": 4.567901234567901, + "grad_norm": 0.488938932940753, + "learning_rate": 6.751807187255738e-07, + "loss": 0.5202, + "step": 17207 + }, + { + "epoch": 4.568166733041285, + "grad_norm": 0.4969564664600508, + "learning_rate": 6.74942099088284e-07, + "loss": 0.5336, + "step": 17208 + }, + { + "epoch": 4.568432231514668, + "grad_norm": 0.46748869405619303, + "learning_rate": 6.747035150440928e-07, + "loss": 0.5387, + "step": 17209 + }, + { + "epoch": 4.5686977299880525, + "grad_norm": 0.47279814089595606, + "learning_rate": 6.744649665976513e-07, + "loss": 0.5513, + "step": 17210 + }, + { + "epoch": 4.5689632284614365, + "grad_norm": 0.47653588690922505, + "learning_rate": 6.742264537536133e-07, + "loss": 0.5028, + "step": 17211 + }, + { + "epoch": 4.56922872693482, + "grad_norm": 0.47406683259452637, + "learning_rate": 6.739879765166299e-07, + "loss": 0.5234, + "step": 17212 + }, + { + "epoch": 4.569494225408204, + "grad_norm": 0.48962323254468404, + "learning_rate": 6.737495348913509e-07, + "loss": 0.5264, + "step": 17213 + }, + { + "epoch": 4.569759723881588, + "grad_norm": 0.4843586101828353, + "learning_rate": 6.735111288824281e-07, + "loss": 0.5477, + "step": 17214 + }, + { + "epoch": 4.570025222354971, + "grad_norm": 0.47734474171207525, + "learning_rate": 6.732727584945098e-07, + "loss": 0.5233, + "step": 17215 + }, + { + "epoch": 4.570290720828355, + "grad_norm": 0.48214337715167, + "learning_rate": 6.730344237322459e-07, + "loss": 0.5396, + "step": 17216 + }, + { + "epoch": 4.570556219301739, + "grad_norm": 0.5021900943159227, + "learning_rate": 6.72796124600284e-07, + "loss": 0.5167, + "step": 17217 + }, + { + "epoch": 4.5708217177751225, + "grad_norm": 0.48158096792201477, + "learning_rate": 6.725578611032713e-07, + "loss": 0.5285, + "step": 17218 + }, + { + "epoch": 4.571087216248507, + "grad_norm": 0.4724053983274434, + "learning_rate": 6.723196332458539e-07, + "loss": 0.5405, + "step": 17219 + }, + { + "epoch": 4.571352714721891, + "grad_norm": 0.48787416209390355, + "learning_rate": 6.720814410326795e-07, + "loss": 0.5425, + "step": 17220 + }, + { + "epoch": 4.571618213195274, + "grad_norm": 0.4641381056761872, + "learning_rate": 6.718432844683917e-07, + "loss": 0.5223, + "step": 17221 + }, + { + "epoch": 4.571883711668658, + "grad_norm": 0.48918285892535657, + "learning_rate": 6.716051635576359e-07, + "loss": 0.547, + "step": 17222 + }, + { + "epoch": 4.572149210142042, + "grad_norm": 0.4727924425579615, + "learning_rate": 6.713670783050577e-07, + "loss": 0.5502, + "step": 17223 + }, + { + "epoch": 4.572414708615425, + "grad_norm": 0.47997722969102286, + "learning_rate": 6.711290287152972e-07, + "loss": 0.534, + "step": 17224 + }, + { + "epoch": 4.572680207088809, + "grad_norm": 0.4791669935808119, + "learning_rate": 6.708910147929993e-07, + "loss": 0.5237, + "step": 17225 + }, + { + "epoch": 4.572945705562193, + "grad_norm": 0.48252595732700376, + "learning_rate": 6.70653036542804e-07, + "loss": 0.5501, + "step": 17226 + }, + { + "epoch": 4.573211204035577, + "grad_norm": 0.4980275857003703, + "learning_rate": 6.704150939693546e-07, + "loss": 0.5489, + "step": 17227 + }, + { + "epoch": 4.573476702508961, + "grad_norm": 0.491811048352953, + "learning_rate": 6.701771870772894e-07, + "loss": 0.5256, + "step": 17228 + }, + { + "epoch": 4.573742200982345, + "grad_norm": 0.47983251729763715, + "learning_rate": 6.699393158712503e-07, + "loss": 0.581, + "step": 17229 + }, + { + "epoch": 4.574007699455728, + "grad_norm": 0.4890747790824589, + "learning_rate": 6.697014803558752e-07, + "loss": 0.5765, + "step": 17230 + }, + { + "epoch": 4.574273197929112, + "grad_norm": 0.47633728958575655, + "learning_rate": 6.694636805358016e-07, + "loss": 0.5458, + "step": 17231 + }, + { + "epoch": 4.574538696402495, + "grad_norm": 0.47726055600972683, + "learning_rate": 6.692259164156692e-07, + "loss": 0.5028, + "step": 17232 + }, + { + "epoch": 4.574804194875879, + "grad_norm": 0.4740314086061241, + "learning_rate": 6.689881880001131e-07, + "loss": 0.5448, + "step": 17233 + }, + { + "epoch": 4.575069693349263, + "grad_norm": 0.48065267457395444, + "learning_rate": 6.68750495293771e-07, + "loss": 0.5214, + "step": 17234 + }, + { + "epoch": 4.5753351918226475, + "grad_norm": 0.47090751322597807, + "learning_rate": 6.685128383012782e-07, + "loss": 0.5404, + "step": 17235 + }, + { + "epoch": 4.575600690296031, + "grad_norm": 0.47849260927998805, + "learning_rate": 6.682752170272691e-07, + "loss": 0.5468, + "step": 17236 + }, + { + "epoch": 4.575866188769415, + "grad_norm": 0.5016774457200815, + "learning_rate": 6.680376314763773e-07, + "loss": 0.5396, + "step": 17237 + }, + { + "epoch": 4.576131687242798, + "grad_norm": 0.47640312142178937, + "learning_rate": 6.678000816532381e-07, + "loss": 0.5244, + "step": 17238 + }, + { + "epoch": 4.576397185716182, + "grad_norm": 0.469484645253419, + "learning_rate": 6.675625675624825e-07, + "loss": 0.5296, + "step": 17239 + }, + { + "epoch": 4.576662684189566, + "grad_norm": 0.4802714649381744, + "learning_rate": 6.673250892087441e-07, + "loss": 0.5119, + "step": 17240 + }, + { + "epoch": 4.576928182662949, + "grad_norm": 0.484651396021477, + "learning_rate": 6.670876465966536e-07, + "loss": 0.5356, + "step": 17241 + }, + { + "epoch": 4.577193681136333, + "grad_norm": 0.474452397273157, + "learning_rate": 6.66850239730841e-07, + "loss": 0.5159, + "step": 17242 + }, + { + "epoch": 4.5774591796097175, + "grad_norm": 0.48715717450839036, + "learning_rate": 6.666128686159376e-07, + "loss": 0.5124, + "step": 17243 + }, + { + "epoch": 4.577724678083101, + "grad_norm": 0.4847015381863419, + "learning_rate": 6.663755332565716e-07, + "loss": 0.548, + "step": 17244 + }, + { + "epoch": 4.577990176556485, + "grad_norm": 0.4620611511065152, + "learning_rate": 6.66138233657373e-07, + "loss": 0.4913, + "step": 17245 + }, + { + "epoch": 4.578255675029869, + "grad_norm": 0.4708939455040296, + "learning_rate": 6.65900969822969e-07, + "loss": 0.5596, + "step": 17246 + }, + { + "epoch": 4.578521173503252, + "grad_norm": 0.4918910957013495, + "learning_rate": 6.656637417579859e-07, + "loss": 0.5236, + "step": 17247 + }, + { + "epoch": 4.578786671976636, + "grad_norm": 0.4931152916154499, + "learning_rate": 6.654265494670517e-07, + "loss": 0.5262, + "step": 17248 + }, + { + "epoch": 4.57905217045002, + "grad_norm": 0.4707283422372326, + "learning_rate": 6.65189392954791e-07, + "loss": 0.5443, + "step": 17249 + }, + { + "epoch": 4.579317668923403, + "grad_norm": 0.48242748625921567, + "learning_rate": 6.649522722258301e-07, + "loss": 0.5632, + "step": 17250 + }, + { + "epoch": 4.5795831673967875, + "grad_norm": 0.4786895828320936, + "learning_rate": 6.647151872847921e-07, + "loss": 0.4868, + "step": 17251 + }, + { + "epoch": 4.579848665870172, + "grad_norm": 0.4693656573111654, + "learning_rate": 6.64478138136303e-07, + "loss": 0.5279, + "step": 17252 + }, + { + "epoch": 4.580114164343555, + "grad_norm": 0.48385856252776116, + "learning_rate": 6.64241124784983e-07, + "loss": 0.5094, + "step": 17253 + }, + { + "epoch": 4.580379662816939, + "grad_norm": 0.506119600576592, + "learning_rate": 6.640041472354567e-07, + "loss": 0.5558, + "step": 17254 + }, + { + "epoch": 4.580645161290323, + "grad_norm": 0.4915980632534202, + "learning_rate": 6.637672054923436e-07, + "loss": 0.5483, + "step": 17255 + }, + { + "epoch": 4.580910659763706, + "grad_norm": 0.46420933717999996, + "learning_rate": 6.635302995602669e-07, + "loss": 0.5418, + "step": 17256 + }, + { + "epoch": 4.58117615823709, + "grad_norm": 0.48700086116791647, + "learning_rate": 6.632934294438451e-07, + "loss": 0.5278, + "step": 17257 + }, + { + "epoch": 4.581441656710474, + "grad_norm": 0.49054521279438595, + "learning_rate": 6.630565951476992e-07, + "loss": 0.5319, + "step": 17258 + }, + { + "epoch": 4.5817071551838575, + "grad_norm": 0.47721463339418885, + "learning_rate": 6.628197966764471e-07, + "loss": 0.56, + "step": 17259 + }, + { + "epoch": 4.581972653657242, + "grad_norm": 0.47392959116876726, + "learning_rate": 6.625830340347067e-07, + "loss": 0.5258, + "step": 17260 + }, + { + "epoch": 4.582238152130625, + "grad_norm": 0.4862036638340638, + "learning_rate": 6.623463072270964e-07, + "loss": 0.559, + "step": 17261 + }, + { + "epoch": 4.582503650604009, + "grad_norm": 0.4720959035807394, + "learning_rate": 6.621096162582319e-07, + "loss": 0.5418, + "step": 17262 + }, + { + "epoch": 4.582769149077393, + "grad_norm": 0.4718342250451868, + "learning_rate": 6.618729611327313e-07, + "loss": 0.5203, + "step": 17263 + }, + { + "epoch": 4.583034647550777, + "grad_norm": 0.48642522012845996, + "learning_rate": 6.616363418552069e-07, + "loss": 0.5039, + "step": 17264 + }, + { + "epoch": 4.58330014602416, + "grad_norm": 0.4959256711853409, + "learning_rate": 6.613997584302756e-07, + "loss": 0.5538, + "step": 17265 + }, + { + "epoch": 4.583565644497544, + "grad_norm": 0.46902689892776284, + "learning_rate": 6.611632108625498e-07, + "loss": 0.5227, + "step": 17266 + }, + { + "epoch": 4.5838311429709275, + "grad_norm": 0.4773103957908522, + "learning_rate": 6.60926699156644e-07, + "loss": 0.5257, + "step": 17267 + }, + { + "epoch": 4.584096641444312, + "grad_norm": 0.47231769178954197, + "learning_rate": 6.60690223317171e-07, + "loss": 0.5147, + "step": 17268 + }, + { + "epoch": 4.584362139917696, + "grad_norm": 0.49492769996366154, + "learning_rate": 6.60453783348742e-07, + "loss": 0.5471, + "step": 17269 + }, + { + "epoch": 4.584627638391079, + "grad_norm": 0.4868640983641583, + "learning_rate": 6.602173792559683e-07, + "loss": 0.5608, + "step": 17270 + }, + { + "epoch": 4.584893136864463, + "grad_norm": 0.48078771009235843, + "learning_rate": 6.599810110434593e-07, + "loss": 0.5589, + "step": 17271 + }, + { + "epoch": 4.585158635337847, + "grad_norm": 0.48171629121128, + "learning_rate": 6.597446787158268e-07, + "loss": 0.5667, + "step": 17272 + }, + { + "epoch": 4.58542413381123, + "grad_norm": 0.4795312123798523, + "learning_rate": 6.59508382277678e-07, + "loss": 0.5357, + "step": 17273 + }, + { + "epoch": 4.585689632284614, + "grad_norm": 0.4826982129620007, + "learning_rate": 6.592721217336226e-07, + "loss": 0.5011, + "step": 17274 + }, + { + "epoch": 4.585955130757998, + "grad_norm": 0.49521054719034063, + "learning_rate": 6.590358970882677e-07, + "loss": 0.5479, + "step": 17275 + }, + { + "epoch": 4.586220629231382, + "grad_norm": 0.47797629804790837, + "learning_rate": 6.587997083462197e-07, + "loss": 0.5431, + "step": 17276 + }, + { + "epoch": 4.586486127704766, + "grad_norm": 0.4961224433377459, + "learning_rate": 6.585635555120859e-07, + "loss": 0.5472, + "step": 17277 + }, + { + "epoch": 4.58675162617815, + "grad_norm": 0.4961438713426632, + "learning_rate": 6.583274385904709e-07, + "loss": 0.5558, + "step": 17278 + }, + { + "epoch": 4.587017124651533, + "grad_norm": 0.4788743574320205, + "learning_rate": 6.580913575859807e-07, + "loss": 0.5429, + "step": 17279 + }, + { + "epoch": 4.587282623124917, + "grad_norm": 0.4779864084731545, + "learning_rate": 6.57855312503218e-07, + "loss": 0.5132, + "step": 17280 + }, + { + "epoch": 4.587548121598301, + "grad_norm": 0.46735563780690403, + "learning_rate": 6.576193033467887e-07, + "loss": 0.4697, + "step": 17281 + }, + { + "epoch": 4.587813620071684, + "grad_norm": 0.47583677429816956, + "learning_rate": 6.573833301212923e-07, + "loss": 0.5369, + "step": 17282 + }, + { + "epoch": 4.5880791185450684, + "grad_norm": 0.4725832828491452, + "learning_rate": 6.571473928313332e-07, + "loss": 0.5733, + "step": 17283 + }, + { + "epoch": 4.5883446170184525, + "grad_norm": 0.4964485779259073, + "learning_rate": 6.569114914815111e-07, + "loss": 0.5249, + "step": 17284 + }, + { + "epoch": 4.588610115491836, + "grad_norm": 0.4642813379363941, + "learning_rate": 6.566756260764287e-07, + "loss": 0.5364, + "step": 17285 + }, + { + "epoch": 4.58887561396522, + "grad_norm": 0.48672996767569293, + "learning_rate": 6.564397966206836e-07, + "loss": 0.5557, + "step": 17286 + }, + { + "epoch": 4.589141112438604, + "grad_norm": 0.48797537442724864, + "learning_rate": 6.562040031188773e-07, + "loss": 0.5737, + "step": 17287 + }, + { + "epoch": 4.589406610911987, + "grad_norm": 0.47369139712473873, + "learning_rate": 6.559682455756075e-07, + "loss": 0.5257, + "step": 17288 + }, + { + "epoch": 4.589672109385371, + "grad_norm": 0.4800866137753457, + "learning_rate": 6.55732523995471e-07, + "loss": 0.5493, + "step": 17289 + }, + { + "epoch": 4.589937607858754, + "grad_norm": 0.4790438024171878, + "learning_rate": 6.554968383830665e-07, + "loss": 0.5484, + "step": 17290 + }, + { + "epoch": 4.5902031063321385, + "grad_norm": 0.4926238312728841, + "learning_rate": 6.552611887429891e-07, + "loss": 0.5345, + "step": 17291 + }, + { + "epoch": 4.5904686048055225, + "grad_norm": 0.48702839882089777, + "learning_rate": 6.550255750798359e-07, + "loss": 0.5344, + "step": 17292 + }, + { + "epoch": 4.590734103278907, + "grad_norm": 0.47881925044588713, + "learning_rate": 6.547899973982014e-07, + "loss": 0.4999, + "step": 17293 + }, + { + "epoch": 4.59099960175229, + "grad_norm": 0.4831463568891131, + "learning_rate": 6.54554455702679e-07, + "loss": 0.5359, + "step": 17294 + }, + { + "epoch": 4.591265100225674, + "grad_norm": 0.49014652527471375, + "learning_rate": 6.543189499978638e-07, + "loss": 0.5213, + "step": 17295 + }, + { + "epoch": 4.591530598699057, + "grad_norm": 0.48506302121277745, + "learning_rate": 6.540834802883475e-07, + "loss": 0.555, + "step": 17296 + }, + { + "epoch": 4.591796097172441, + "grad_norm": 0.47845680846038047, + "learning_rate": 6.538480465787236e-07, + "loss": 0.5512, + "step": 17297 + }, + { + "epoch": 4.592061595645825, + "grad_norm": 0.4814313943474611, + "learning_rate": 6.536126488735828e-07, + "loss": 0.5227, + "step": 17298 + }, + { + "epoch": 4.5923270941192085, + "grad_norm": 0.470977617637183, + "learning_rate": 6.533772871775162e-07, + "loss": 0.5061, + "step": 17299 + }, + { + "epoch": 4.592592592592593, + "grad_norm": 0.4894117386603172, + "learning_rate": 6.531419614951132e-07, + "loss": 0.535, + "step": 17300 + }, + { + "epoch": 4.592858091065977, + "grad_norm": 0.4887360920426576, + "learning_rate": 6.529066718309643e-07, + "loss": 0.5095, + "step": 17301 + }, + { + "epoch": 4.59312358953936, + "grad_norm": 0.4869726230841305, + "learning_rate": 6.526714181896573e-07, + "loss": 0.5433, + "step": 17302 + }, + { + "epoch": 4.593389088012744, + "grad_norm": 0.47769366664393115, + "learning_rate": 6.524362005757811e-07, + "loss": 0.527, + "step": 17303 + }, + { + "epoch": 4.593654586486128, + "grad_norm": 0.4870302974104966, + "learning_rate": 6.522010189939229e-07, + "loss": 0.5449, + "step": 17304 + }, + { + "epoch": 4.593920084959511, + "grad_norm": 0.5024733537923884, + "learning_rate": 6.519658734486681e-07, + "loss": 0.5718, + "step": 17305 + }, + { + "epoch": 4.594185583432895, + "grad_norm": 0.4811477113574121, + "learning_rate": 6.517307639446044e-07, + "loss": 0.561, + "step": 17306 + }, + { + "epoch": 4.594451081906279, + "grad_norm": 0.468422224431071, + "learning_rate": 6.514956904863154e-07, + "loss": 0.5239, + "step": 17307 + }, + { + "epoch": 4.594716580379663, + "grad_norm": 0.4722604426185916, + "learning_rate": 6.512606530783872e-07, + "loss": 0.5588, + "step": 17308 + }, + { + "epoch": 4.594982078853047, + "grad_norm": 0.494579951202955, + "learning_rate": 6.510256517254018e-07, + "loss": 0.549, + "step": 17309 + }, + { + "epoch": 4.595247577326431, + "grad_norm": 0.47287098344723394, + "learning_rate": 6.507906864319446e-07, + "loss": 0.5334, + "step": 17310 + }, + { + "epoch": 4.595513075799814, + "grad_norm": 0.466100249257651, + "learning_rate": 6.505557572025955e-07, + "loss": 0.4973, + "step": 17311 + }, + { + "epoch": 4.595778574273198, + "grad_norm": 0.47361141344074614, + "learning_rate": 6.503208640419381e-07, + "loss": 0.5384, + "step": 17312 + }, + { + "epoch": 4.596044072746581, + "grad_norm": 0.4817522343316887, + "learning_rate": 6.500860069545517e-07, + "loss": 0.5496, + "step": 17313 + }, + { + "epoch": 4.596309571219965, + "grad_norm": 0.4872812452313049, + "learning_rate": 6.498511859450177e-07, + "loss": 0.538, + "step": 17314 + }, + { + "epoch": 4.596575069693349, + "grad_norm": 0.48697952867984573, + "learning_rate": 6.496164010179167e-07, + "loss": 0.5386, + "step": 17315 + }, + { + "epoch": 4.5968405681667335, + "grad_norm": 0.475896187012161, + "learning_rate": 6.493816521778254e-07, + "loss": 0.5393, + "step": 17316 + }, + { + "epoch": 4.597106066640117, + "grad_norm": 0.4734418847840232, + "learning_rate": 6.491469394293235e-07, + "loss": 0.512, + "step": 17317 + }, + { + "epoch": 4.597371565113501, + "grad_norm": 0.487635026913717, + "learning_rate": 6.48912262776987e-07, + "loss": 0.5337, + "step": 17318 + }, + { + "epoch": 4.597637063586884, + "grad_norm": 0.4828028027958323, + "learning_rate": 6.486776222253946e-07, + "loss": 0.5107, + "step": 17319 + }, + { + "epoch": 4.597902562060268, + "grad_norm": 0.4698875026378944, + "learning_rate": 6.484430177791207e-07, + "loss": 0.525, + "step": 17320 + }, + { + "epoch": 4.598168060533652, + "grad_norm": 0.4808326233541619, + "learning_rate": 6.482084494427418e-07, + "loss": 0.5515, + "step": 17321 + }, + { + "epoch": 4.598433559007036, + "grad_norm": 0.4858764275157084, + "learning_rate": 6.479739172208324e-07, + "loss": 0.5185, + "step": 17322 + }, + { + "epoch": 4.598699057480419, + "grad_norm": 0.5107945480221684, + "learning_rate": 6.477394211179652e-07, + "loss": 0.5338, + "step": 17323 + }, + { + "epoch": 4.5989645559538035, + "grad_norm": 0.49059586222909546, + "learning_rate": 6.475049611387152e-07, + "loss": 0.5416, + "step": 17324 + }, + { + "epoch": 4.599230054427187, + "grad_norm": 0.47927003748091607, + "learning_rate": 6.472705372876533e-07, + "loss": 0.5237, + "step": 17325 + }, + { + "epoch": 4.599495552900571, + "grad_norm": 0.4674364960090774, + "learning_rate": 6.470361495693534e-07, + "loss": 0.4785, + "step": 17326 + }, + { + "epoch": 4.599761051373955, + "grad_norm": 0.4757338269593397, + "learning_rate": 6.46801797988385e-07, + "loss": 0.5208, + "step": 17327 + }, + { + "epoch": 4.600026549847338, + "grad_norm": 0.4851689833037894, + "learning_rate": 6.46567482549319e-07, + "loss": 0.5725, + "step": 17328 + }, + { + "epoch": 4.600292048320722, + "grad_norm": 0.49755397757279407, + "learning_rate": 6.463332032567244e-07, + "loss": 0.5332, + "step": 17329 + }, + { + "epoch": 4.600557546794106, + "grad_norm": 0.4804176864232585, + "learning_rate": 6.460989601151716e-07, + "loss": 0.5792, + "step": 17330 + }, + { + "epoch": 4.600823045267489, + "grad_norm": 0.4799247096087188, + "learning_rate": 6.458647531292276e-07, + "loss": 0.5782, + "step": 17331 + }, + { + "epoch": 4.6010885437408735, + "grad_norm": 0.4678981572949605, + "learning_rate": 6.456305823034612e-07, + "loss": 0.5216, + "step": 17332 + }, + { + "epoch": 4.601354042214258, + "grad_norm": 0.4959774940702711, + "learning_rate": 6.453964476424387e-07, + "loss": 0.5194, + "step": 17333 + }, + { + "epoch": 4.601619540687641, + "grad_norm": 0.47432003621312463, + "learning_rate": 6.451623491507256e-07, + "loss": 0.5559, + "step": 17334 + }, + { + "epoch": 4.601885039161025, + "grad_norm": 0.48193639714086517, + "learning_rate": 6.449282868328891e-07, + "loss": 0.5384, + "step": 17335 + }, + { + "epoch": 4.602150537634409, + "grad_norm": 0.4951029700848952, + "learning_rate": 6.446942606934919e-07, + "loss": 0.5146, + "step": 17336 + }, + { + "epoch": 4.602416036107792, + "grad_norm": 0.49995458114157404, + "learning_rate": 6.444602707371e-07, + "loss": 0.5782, + "step": 17337 + }, + { + "epoch": 4.602681534581176, + "grad_norm": 0.46960299613210243, + "learning_rate": 6.44226316968276e-07, + "loss": 0.521, + "step": 17338 + }, + { + "epoch": 4.60294703305456, + "grad_norm": 0.4840516926504308, + "learning_rate": 6.43992399391582e-07, + "loss": 0.5278, + "step": 17339 + }, + { + "epoch": 4.6032125315279435, + "grad_norm": 0.4852931836358397, + "learning_rate": 6.43758518011581e-07, + "loss": 0.5178, + "step": 17340 + }, + { + "epoch": 4.603478030001328, + "grad_norm": 0.4688824781239619, + "learning_rate": 6.435246728328332e-07, + "loss": 0.5425, + "step": 17341 + }, + { + "epoch": 4.603743528474711, + "grad_norm": 0.48388384248791577, + "learning_rate": 6.432908638599003e-07, + "loss": 0.5414, + "step": 17342 + }, + { + "epoch": 4.604009026948095, + "grad_norm": 0.4732440774880686, + "learning_rate": 6.43057091097341e-07, + "loss": 0.5036, + "step": 17343 + }, + { + "epoch": 4.604274525421479, + "grad_norm": 0.4784110938977977, + "learning_rate": 6.428233545497165e-07, + "loss": 0.53, + "step": 17344 + }, + { + "epoch": 4.604540023894863, + "grad_norm": 0.4766481092781776, + "learning_rate": 6.425896542215821e-07, + "loss": 0.5566, + "step": 17345 + }, + { + "epoch": 4.604805522368246, + "grad_norm": 0.4900214150419999, + "learning_rate": 6.423559901174981e-07, + "loss": 0.5586, + "step": 17346 + }, + { + "epoch": 4.60507102084163, + "grad_norm": 0.48743031055794483, + "learning_rate": 6.421223622420194e-07, + "loss": 0.5359, + "step": 17347 + }, + { + "epoch": 4.6053365193150135, + "grad_norm": 0.48164934780152713, + "learning_rate": 6.418887705997046e-07, + "loss": 0.5355, + "step": 17348 + }, + { + "epoch": 4.605602017788398, + "grad_norm": 0.4830949060834166, + "learning_rate": 6.416552151951075e-07, + "loss": 0.5683, + "step": 17349 + }, + { + "epoch": 4.605867516261782, + "grad_norm": 0.4836943040325858, + "learning_rate": 6.414216960327844e-07, + "loss": 0.5631, + "step": 17350 + }, + { + "epoch": 4.606133014735165, + "grad_norm": 0.47832669251993887, + "learning_rate": 6.411882131172887e-07, + "loss": 0.5528, + "step": 17351 + }, + { + "epoch": 4.606398513208549, + "grad_norm": 0.47680765733040154, + "learning_rate": 6.409547664531734e-07, + "loss": 0.5348, + "step": 17352 + }, + { + "epoch": 4.606664011681933, + "grad_norm": 0.4804330388960981, + "learning_rate": 6.407213560449926e-07, + "loss": 0.5887, + "step": 17353 + }, + { + "epoch": 4.606929510155316, + "grad_norm": 0.4839095738969038, + "learning_rate": 6.40487981897297e-07, + "loss": 0.5231, + "step": 17354 + }, + { + "epoch": 4.6071950086287, + "grad_norm": 0.49127087479993997, + "learning_rate": 6.40254644014639e-07, + "loss": 0.5592, + "step": 17355 + }, + { + "epoch": 4.607460507102084, + "grad_norm": 0.4876203261880093, + "learning_rate": 6.400213424015691e-07, + "loss": 0.5591, + "step": 17356 + }, + { + "epoch": 4.607726005575468, + "grad_norm": 0.471595103459599, + "learning_rate": 6.397880770626372e-07, + "loss": 0.5157, + "step": 17357 + }, + { + "epoch": 4.607991504048852, + "grad_norm": 0.4847718370299789, + "learning_rate": 6.395548480023917e-07, + "loss": 0.5516, + "step": 17358 + }, + { + "epoch": 4.608257002522236, + "grad_norm": 0.4820107632343782, + "learning_rate": 6.393216552253814e-07, + "loss": 0.58, + "step": 17359 + }, + { + "epoch": 4.608522500995619, + "grad_norm": 0.47716768056181674, + "learning_rate": 6.390884987361556e-07, + "loss": 0.528, + "step": 17360 + }, + { + "epoch": 4.608787999469003, + "grad_norm": 0.486268430343264, + "learning_rate": 6.388553785392604e-07, + "loss": 0.5023, + "step": 17361 + }, + { + "epoch": 4.609053497942387, + "grad_norm": 0.4804585050846759, + "learning_rate": 6.38622294639242e-07, + "loss": 0.5558, + "step": 17362 + }, + { + "epoch": 4.60931899641577, + "grad_norm": 0.47449720872194084, + "learning_rate": 6.383892470406456e-07, + "loss": 0.5282, + "step": 17363 + }, + { + "epoch": 4.6095844948891544, + "grad_norm": 0.46913942144166004, + "learning_rate": 6.381562357480176e-07, + "loss": 0.523, + "step": 17364 + }, + { + "epoch": 4.6098499933625385, + "grad_norm": 0.48453931208963535, + "learning_rate": 6.379232607659008e-07, + "loss": 0.5327, + "step": 17365 + }, + { + "epoch": 4.610115491835922, + "grad_norm": 0.4923217881141727, + "learning_rate": 6.376903220988406e-07, + "loss": 0.5433, + "step": 17366 + }, + { + "epoch": 4.610380990309306, + "grad_norm": 0.48994490808075924, + "learning_rate": 6.374574197513786e-07, + "loss": 0.5417, + "step": 17367 + }, + { + "epoch": 4.61064648878269, + "grad_norm": 0.48034568735882427, + "learning_rate": 6.372245537280566e-07, + "loss": 0.5509, + "step": 17368 + }, + { + "epoch": 4.610911987256073, + "grad_norm": 0.49149677299488925, + "learning_rate": 6.369917240334173e-07, + "loss": 0.5294, + "step": 17369 + }, + { + "epoch": 4.611177485729457, + "grad_norm": 0.4788799625973278, + "learning_rate": 6.367589306720001e-07, + "loss": 0.5272, + "step": 17370 + }, + { + "epoch": 4.61144298420284, + "grad_norm": 0.487062945198114, + "learning_rate": 6.365261736483464e-07, + "loss": 0.5384, + "step": 17371 + }, + { + "epoch": 4.6117084826762245, + "grad_norm": 0.49938578574907366, + "learning_rate": 6.362934529669942e-07, + "loss": 0.5655, + "step": 17372 + }, + { + "epoch": 4.6119739811496085, + "grad_norm": 0.47647180187905896, + "learning_rate": 6.360607686324843e-07, + "loss": 0.5174, + "step": 17373 + }, + { + "epoch": 4.612239479622993, + "grad_norm": 0.48676080379822495, + "learning_rate": 6.358281206493516e-07, + "loss": 0.5675, + "step": 17374 + }, + { + "epoch": 4.612504978096376, + "grad_norm": 0.4935539926240541, + "learning_rate": 6.355955090221358e-07, + "loss": 0.5271, + "step": 17375 + }, + { + "epoch": 4.61277047656976, + "grad_norm": 0.47258890903380474, + "learning_rate": 6.353629337553716e-07, + "loss": 0.5137, + "step": 17376 + }, + { + "epoch": 4.613035975043143, + "grad_norm": 0.4885127829573645, + "learning_rate": 6.351303948535961e-07, + "loss": 0.5436, + "step": 17377 + }, + { + "epoch": 4.613301473516527, + "grad_norm": 0.4943823281167259, + "learning_rate": 6.348978923213434e-07, + "loss": 0.575, + "step": 17378 + }, + { + "epoch": 4.613566971989911, + "grad_norm": 0.4662464502894001, + "learning_rate": 6.346654261631488e-07, + "loss": 0.5433, + "step": 17379 + }, + { + "epoch": 4.6138324704632945, + "grad_norm": 0.4794442306213752, + "learning_rate": 6.344329963835458e-07, + "loss": 0.5635, + "step": 17380 + }, + { + "epoch": 4.614097968936679, + "grad_norm": 0.4715141785878349, + "learning_rate": 6.342006029870662e-07, + "loss": 0.505, + "step": 17381 + }, + { + "epoch": 4.614363467410063, + "grad_norm": 0.48262661391284667, + "learning_rate": 6.33968245978244e-07, + "loss": 0.5249, + "step": 17382 + }, + { + "epoch": 4.614628965883446, + "grad_norm": 0.4734583988441969, + "learning_rate": 6.33735925361609e-07, + "loss": 0.524, + "step": 17383 + }, + { + "epoch": 4.61489446435683, + "grad_norm": 0.4800495271630748, + "learning_rate": 6.335036411416937e-07, + "loss": 0.5558, + "step": 17384 + }, + { + "epoch": 4.615159962830214, + "grad_norm": 0.47829998197458673, + "learning_rate": 6.332713933230273e-07, + "loss": 0.536, + "step": 17385 + }, + { + "epoch": 4.615425461303597, + "grad_norm": 0.478371725356284, + "learning_rate": 6.330391819101383e-07, + "loss": 0.563, + "step": 17386 + }, + { + "epoch": 4.615690959776981, + "grad_norm": 0.4890444885993846, + "learning_rate": 6.328070069075573e-07, + "loss": 0.568, + "step": 17387 + }, + { + "epoch": 4.615956458250365, + "grad_norm": 0.47233389043203333, + "learning_rate": 6.325748683198105e-07, + "loss": 0.5428, + "step": 17388 + }, + { + "epoch": 4.616221956723749, + "grad_norm": 0.4953522092553053, + "learning_rate": 6.32342766151427e-07, + "loss": 0.5258, + "step": 17389 + }, + { + "epoch": 4.616487455197133, + "grad_norm": 0.47744473881326605, + "learning_rate": 6.321107004069321e-07, + "loss": 0.4984, + "step": 17390 + }, + { + "epoch": 4.616752953670517, + "grad_norm": 0.4678089315618051, + "learning_rate": 6.31878671090852e-07, + "loss": 0.5021, + "step": 17391 + }, + { + "epoch": 4.6170184521439, + "grad_norm": 0.4812693702339518, + "learning_rate": 6.31646678207711e-07, + "loss": 0.5154, + "step": 17392 + }, + { + "epoch": 4.617283950617284, + "grad_norm": 0.48341641424434456, + "learning_rate": 6.314147217620353e-07, + "loss": 0.523, + "step": 17393 + }, + { + "epoch": 4.617549449090668, + "grad_norm": 0.4760495685496969, + "learning_rate": 6.311828017583465e-07, + "loss": 0.5374, + "step": 17394 + }, + { + "epoch": 4.617814947564051, + "grad_norm": 0.4710700820505262, + "learning_rate": 6.309509182011697e-07, + "loss": 0.5217, + "step": 17395 + }, + { + "epoch": 4.618080446037435, + "grad_norm": 0.4706078408148169, + "learning_rate": 6.307190710950262e-07, + "loss": 0.5, + "step": 17396 + }, + { + "epoch": 4.6183459445108195, + "grad_norm": 0.4761084428167169, + "learning_rate": 6.304872604444368e-07, + "loss": 0.5032, + "step": 17397 + }, + { + "epoch": 4.618611442984203, + "grad_norm": 0.4809530096122509, + "learning_rate": 6.30255486253924e-07, + "loss": 0.5053, + "step": 17398 + }, + { + "epoch": 4.618876941457587, + "grad_norm": 0.48062221779797143, + "learning_rate": 6.300237485280061e-07, + "loss": 0.5306, + "step": 17399 + }, + { + "epoch": 4.61914243993097, + "grad_norm": 0.47712197772264653, + "learning_rate": 6.297920472712046e-07, + "loss": 0.5115, + "step": 17400 + }, + { + "epoch": 4.619407938404354, + "grad_norm": 0.4766162981676967, + "learning_rate": 6.295603824880364e-07, + "loss": 0.5269, + "step": 17401 + }, + { + "epoch": 4.619673436877738, + "grad_norm": 0.480338787414521, + "learning_rate": 6.293287541830221e-07, + "loss": 0.5034, + "step": 17402 + }, + { + "epoch": 4.619938935351122, + "grad_norm": 0.49319928129128415, + "learning_rate": 6.290971623606756e-07, + "loss": 0.5282, + "step": 17403 + }, + { + "epoch": 4.620204433824505, + "grad_norm": 0.49239175771260724, + "learning_rate": 6.288656070255151e-07, + "loss": 0.556, + "step": 17404 + }, + { + "epoch": 4.6204699322978895, + "grad_norm": 0.4860605010334084, + "learning_rate": 6.286340881820574e-07, + "loss": 0.54, + "step": 17405 + }, + { + "epoch": 4.620735430771273, + "grad_norm": 0.48913162410872707, + "learning_rate": 6.284026058348161e-07, + "loss": 0.5188, + "step": 17406 + }, + { + "epoch": 4.621000929244657, + "grad_norm": 0.5068455248672303, + "learning_rate": 6.281711599883075e-07, + "loss": 0.5666, + "step": 17407 + }, + { + "epoch": 4.621266427718041, + "grad_norm": 0.4877095348112145, + "learning_rate": 6.279397506470439e-07, + "loss": 0.5421, + "step": 17408 + }, + { + "epoch": 4.621531926191424, + "grad_norm": 0.47402155753709624, + "learning_rate": 6.27708377815539e-07, + "loss": 0.5685, + "step": 17409 + }, + { + "epoch": 4.621797424664808, + "grad_norm": 0.48390233346666245, + "learning_rate": 6.274770414983039e-07, + "loss": 0.5185, + "step": 17410 + }, + { + "epoch": 4.622062923138192, + "grad_norm": 0.48774895120884687, + "learning_rate": 6.272457416998523e-07, + "loss": 0.5138, + "step": 17411 + }, + { + "epoch": 4.622328421611575, + "grad_norm": 0.4851430959718481, + "learning_rate": 6.270144784246931e-07, + "loss": 0.5521, + "step": 17412 + }, + { + "epoch": 4.6225939200849595, + "grad_norm": 0.48529250735105295, + "learning_rate": 6.267832516773381e-07, + "loss": 0.5275, + "step": 17413 + }, + { + "epoch": 4.622859418558344, + "grad_norm": 0.476380753878276, + "learning_rate": 6.265520614622963e-07, + "loss": 0.5014, + "step": 17414 + }, + { + "epoch": 4.623124917031727, + "grad_norm": 0.4794223010468181, + "learning_rate": 6.263209077840754e-07, + "loss": 0.5295, + "step": 17415 + }, + { + "epoch": 4.623390415505111, + "grad_norm": 0.48075648986178093, + "learning_rate": 6.260897906471852e-07, + "loss": 0.5582, + "step": 17416 + }, + { + "epoch": 4.623655913978495, + "grad_norm": 0.469727595806367, + "learning_rate": 6.258587100561314e-07, + "loss": 0.515, + "step": 17417 + }, + { + "epoch": 4.623921412451878, + "grad_norm": 0.4619622467732009, + "learning_rate": 6.256276660154223e-07, + "loss": 0.521, + "step": 17418 + }, + { + "epoch": 4.624186910925262, + "grad_norm": 0.48091931716641556, + "learning_rate": 6.253966585295629e-07, + "loss": 0.5242, + "step": 17419 + }, + { + "epoch": 4.624452409398646, + "grad_norm": 0.46125218228401604, + "learning_rate": 6.251656876030585e-07, + "loss": 0.5219, + "step": 17420 + }, + { + "epoch": 4.6247179078720295, + "grad_norm": 0.4817523300192529, + "learning_rate": 6.249347532404126e-07, + "loss": 0.5312, + "step": 17421 + }, + { + "epoch": 4.624983406345414, + "grad_norm": 0.4762771483367275, + "learning_rate": 6.24703855446131e-07, + "loss": 0.5079, + "step": 17422 + }, + { + "epoch": 4.625248904818797, + "grad_norm": 0.44350279705194995, + "learning_rate": 6.244729942247149e-07, + "loss": 0.4568, + "step": 17423 + }, + { + "epoch": 4.625514403292181, + "grad_norm": 0.4818949817427838, + "learning_rate": 6.24242169580668e-07, + "loss": 0.5562, + "step": 17424 + }, + { + "epoch": 4.625779901765565, + "grad_norm": 0.452745423213986, + "learning_rate": 6.240113815184917e-07, + "loss": 0.542, + "step": 17425 + }, + { + "epoch": 4.626045400238949, + "grad_norm": 0.4824915490620668, + "learning_rate": 6.23780630042686e-07, + "loss": 0.5394, + "step": 17426 + }, + { + "epoch": 4.626310898712332, + "grad_norm": 0.48891760215921237, + "learning_rate": 6.235499151577523e-07, + "loss": 0.4947, + "step": 17427 + }, + { + "epoch": 4.626576397185716, + "grad_norm": 0.4944088870161946, + "learning_rate": 6.23319236868189e-07, + "loss": 0.5138, + "step": 17428 + }, + { + "epoch": 4.6268418956590995, + "grad_norm": 0.48849470466160577, + "learning_rate": 6.230885951784962e-07, + "loss": 0.5355, + "step": 17429 + }, + { + "epoch": 4.627107394132484, + "grad_norm": 0.4898893242228827, + "learning_rate": 6.228579900931705e-07, + "loss": 0.5185, + "step": 17430 + }, + { + "epoch": 4.627372892605868, + "grad_norm": 0.48601509631922596, + "learning_rate": 6.226274216167108e-07, + "loss": 0.5363, + "step": 17431 + }, + { + "epoch": 4.627638391079252, + "grad_norm": 0.49998110713082267, + "learning_rate": 6.223968897536132e-07, + "loss": 0.5733, + "step": 17432 + }, + { + "epoch": 4.627903889552635, + "grad_norm": 0.4759075321254341, + "learning_rate": 6.221663945083722e-07, + "loss": 0.5518, + "step": 17433 + }, + { + "epoch": 4.628169388026019, + "grad_norm": 0.460289053919721, + "learning_rate": 6.219359358854854e-07, + "loss": 0.5181, + "step": 17434 + }, + { + "epoch": 4.628434886499402, + "grad_norm": 0.4746934852634388, + "learning_rate": 6.217055138894452e-07, + "loss": 0.5158, + "step": 17435 + }, + { + "epoch": 4.628700384972786, + "grad_norm": 0.4862767813331973, + "learning_rate": 6.214751285247478e-07, + "loss": 0.5499, + "step": 17436 + }, + { + "epoch": 4.62896588344617, + "grad_norm": 0.4934686294367308, + "learning_rate": 6.212447797958835e-07, + "loss": 0.5463, + "step": 17437 + }, + { + "epoch": 4.629231381919554, + "grad_norm": 0.47940395996941554, + "learning_rate": 6.210144677073466e-07, + "loss": 0.5395, + "step": 17438 + }, + { + "epoch": 4.629496880392938, + "grad_norm": 0.47877706630815836, + "learning_rate": 6.207841922636273e-07, + "loss": 0.5907, + "step": 17439 + }, + { + "epoch": 4.629762378866322, + "grad_norm": 0.4773088501757365, + "learning_rate": 6.205539534692184e-07, + "loss": 0.5017, + "step": 17440 + }, + { + "epoch": 4.630027877339705, + "grad_norm": 0.48735265289989993, + "learning_rate": 6.203237513286081e-07, + "loss": 0.5603, + "step": 17441 + }, + { + "epoch": 4.630293375813089, + "grad_norm": 0.4860241075083344, + "learning_rate": 6.200935858462876e-07, + "loss": 0.5646, + "step": 17442 + }, + { + "epoch": 4.630558874286473, + "grad_norm": 0.47325123215489806, + "learning_rate": 6.19863457026745e-07, + "loss": 0.5362, + "step": 17443 + }, + { + "epoch": 4.630824372759856, + "grad_norm": 0.48808316553876985, + "learning_rate": 6.196333648744679e-07, + "loss": 0.5678, + "step": 17444 + }, + { + "epoch": 4.6310898712332405, + "grad_norm": 0.489241009603056, + "learning_rate": 6.194033093939444e-07, + "loss": 0.5231, + "step": 17445 + }, + { + "epoch": 4.6313553697066245, + "grad_norm": 0.48524170259881627, + "learning_rate": 6.191732905896605e-07, + "loss": 0.545, + "step": 17446 + }, + { + "epoch": 4.631620868180008, + "grad_norm": 0.48817508160181505, + "learning_rate": 6.189433084661032e-07, + "loss": 0.5665, + "step": 17447 + }, + { + "epoch": 4.631886366653392, + "grad_norm": 0.4853968632618056, + "learning_rate": 6.187133630277567e-07, + "loss": 0.5283, + "step": 17448 + }, + { + "epoch": 4.632151865126776, + "grad_norm": 0.49036848600591465, + "learning_rate": 6.184834542791052e-07, + "loss": 0.5415, + "step": 17449 + }, + { + "epoch": 4.632417363600159, + "grad_norm": 0.502976468837631, + "learning_rate": 6.182535822246341e-07, + "loss": 0.5731, + "step": 17450 + }, + { + "epoch": 4.632682862073543, + "grad_norm": 0.48508347639929883, + "learning_rate": 6.180237468688244e-07, + "loss": 0.5412, + "step": 17451 + }, + { + "epoch": 4.632948360546926, + "grad_norm": 0.4816568495650498, + "learning_rate": 6.177939482161605e-07, + "loss": 0.5283, + "step": 17452 + }, + { + "epoch": 4.6332138590203105, + "grad_norm": 0.46404991222252395, + "learning_rate": 6.17564186271122e-07, + "loss": 0.5133, + "step": 17453 + }, + { + "epoch": 4.6334793574936946, + "grad_norm": 0.47244615237967086, + "learning_rate": 6.173344610381924e-07, + "loss": 0.502, + "step": 17454 + }, + { + "epoch": 4.633744855967079, + "grad_norm": 0.47972845434142186, + "learning_rate": 6.17104772521849e-07, + "loss": 0.5492, + "step": 17455 + }, + { + "epoch": 4.634010354440462, + "grad_norm": 0.49119879894851753, + "learning_rate": 6.168751207265736e-07, + "loss": 0.562, + "step": 17456 + }, + { + "epoch": 4.634275852913846, + "grad_norm": 0.47486576763100935, + "learning_rate": 6.16645505656843e-07, + "loss": 0.5259, + "step": 17457 + }, + { + "epoch": 4.634541351387229, + "grad_norm": 0.4877641438497815, + "learning_rate": 6.164159273171369e-07, + "loss": 0.5323, + "step": 17458 + }, + { + "epoch": 4.634806849860613, + "grad_norm": 0.4961935495312679, + "learning_rate": 6.161863857119324e-07, + "loss": 0.5311, + "step": 17459 + }, + { + "epoch": 4.635072348333997, + "grad_norm": 0.481101578654917, + "learning_rate": 6.159568808457048e-07, + "loss": 0.5003, + "step": 17460 + }, + { + "epoch": 4.6353378468073805, + "grad_norm": 0.4839613214708843, + "learning_rate": 6.157274127229318e-07, + "loss": 0.555, + "step": 17461 + }, + { + "epoch": 4.635603345280765, + "grad_norm": 0.4812460667614524, + "learning_rate": 6.154979813480869e-07, + "loss": 0.5484, + "step": 17462 + }, + { + "epoch": 4.635868843754149, + "grad_norm": 0.4794861512602386, + "learning_rate": 6.152685867256461e-07, + "loss": 0.5363, + "step": 17463 + }, + { + "epoch": 4.636134342227532, + "grad_norm": 0.4834313531542578, + "learning_rate": 6.150392288600821e-07, + "loss": 0.516, + "step": 17464 + }, + { + "epoch": 4.636399840700916, + "grad_norm": 0.47850102674842715, + "learning_rate": 6.148099077558695e-07, + "loss": 0.5573, + "step": 17465 + }, + { + "epoch": 4.6366653391743, + "grad_norm": 0.47814185892697764, + "learning_rate": 6.145806234174778e-07, + "loss": 0.5377, + "step": 17466 + }, + { + "epoch": 4.636930837647683, + "grad_norm": 0.4851323906919248, + "learning_rate": 6.143513758493813e-07, + "loss": 0.5475, + "step": 17467 + }, + { + "epoch": 4.637196336121067, + "grad_norm": 0.470426608263812, + "learning_rate": 6.141221650560489e-07, + "loss": 0.5094, + "step": 17468 + }, + { + "epoch": 4.637461834594451, + "grad_norm": 0.4838404876221839, + "learning_rate": 6.138929910419525e-07, + "loss": 0.526, + "step": 17469 + }, + { + "epoch": 4.637727333067835, + "grad_norm": 0.4907255134779473, + "learning_rate": 6.136638538115603e-07, + "loss": 0.5207, + "step": 17470 + }, + { + "epoch": 4.637992831541219, + "grad_norm": 0.4741284719927159, + "learning_rate": 6.134347533693419e-07, + "loss": 0.5308, + "step": 17471 + }, + { + "epoch": 4.638258330014603, + "grad_norm": 0.4996884731539871, + "learning_rate": 6.13205689719765e-07, + "loss": 0.5452, + "step": 17472 + }, + { + "epoch": 4.638523828487986, + "grad_norm": 0.4956992712917525, + "learning_rate": 6.129766628672959e-07, + "loss": 0.5545, + "step": 17473 + }, + { + "epoch": 4.63878932696137, + "grad_norm": 0.4794383293491906, + "learning_rate": 6.127476728164031e-07, + "loss": 0.5232, + "step": 17474 + }, + { + "epoch": 4.639054825434754, + "grad_norm": 0.48137553056288507, + "learning_rate": 6.125187195715507e-07, + "loss": 0.5121, + "step": 17475 + }, + { + "epoch": 4.639320323908137, + "grad_norm": 0.4772370957135108, + "learning_rate": 6.122898031372052e-07, + "loss": 0.5319, + "step": 17476 + }, + { + "epoch": 4.639585822381521, + "grad_norm": 0.48624608121262863, + "learning_rate": 6.120609235178307e-07, + "loss": 0.5427, + "step": 17477 + }, + { + "epoch": 4.6398513208549055, + "grad_norm": 0.48644441560115587, + "learning_rate": 6.118320807178899e-07, + "loss": 0.5517, + "step": 17478 + }, + { + "epoch": 4.640116819328289, + "grad_norm": 0.4844592690779896, + "learning_rate": 6.116032747418471e-07, + "loss": 0.5303, + "step": 17479 + }, + { + "epoch": 4.640382317801673, + "grad_norm": 0.4701282192906114, + "learning_rate": 6.113745055941634e-07, + "loss": 0.5403, + "step": 17480 + }, + { + "epoch": 4.640647816275056, + "grad_norm": 0.4779649823021076, + "learning_rate": 6.111457732793017e-07, + "loss": 0.5288, + "step": 17481 + }, + { + "epoch": 4.64091331474844, + "grad_norm": 0.4885092674320545, + "learning_rate": 6.10917077801722e-07, + "loss": 0.5502, + "step": 17482 + }, + { + "epoch": 4.641178813221824, + "grad_norm": 0.4975695693618214, + "learning_rate": 6.106884191658846e-07, + "loss": 0.5174, + "step": 17483 + }, + { + "epoch": 4.641444311695208, + "grad_norm": 0.4849399329062805, + "learning_rate": 6.104597973762483e-07, + "loss": 0.5424, + "step": 17484 + }, + { + "epoch": 4.641709810168591, + "grad_norm": 0.4826121018508602, + "learning_rate": 6.102312124372731e-07, + "loss": 0.5203, + "step": 17485 + }, + { + "epoch": 4.6419753086419755, + "grad_norm": 0.4932130839163423, + "learning_rate": 6.100026643534154e-07, + "loss": 0.5564, + "step": 17486 + }, + { + "epoch": 4.642240807115359, + "grad_norm": 0.4724307312405337, + "learning_rate": 6.09774153129134e-07, + "loss": 0.5029, + "step": 17487 + }, + { + "epoch": 4.642506305588743, + "grad_norm": 0.47417809804209765, + "learning_rate": 6.095456787688847e-07, + "loss": 0.5157, + "step": 17488 + }, + { + "epoch": 4.642771804062127, + "grad_norm": 0.49435612741861995, + "learning_rate": 6.093172412771223e-07, + "loss": 0.5528, + "step": 17489 + }, + { + "epoch": 4.64303730253551, + "grad_norm": 0.4842382068137344, + "learning_rate": 6.090888406583039e-07, + "loss": 0.5226, + "step": 17490 + }, + { + "epoch": 4.643302801008894, + "grad_norm": 0.4918516301146902, + "learning_rate": 6.088604769168821e-07, + "loss": 0.5585, + "step": 17491 + }, + { + "epoch": 4.643568299482278, + "grad_norm": 0.4801535752771731, + "learning_rate": 6.08632150057312e-07, + "loss": 0.5415, + "step": 17492 + }, + { + "epoch": 4.643833797955661, + "grad_norm": 0.48523799197822826, + "learning_rate": 6.08403860084045e-07, + "loss": 0.52, + "step": 17493 + }, + { + "epoch": 4.6440992964290455, + "grad_norm": 0.4632014856315313, + "learning_rate": 6.081756070015358e-07, + "loss": 0.5296, + "step": 17494 + }, + { + "epoch": 4.64436479490243, + "grad_norm": 0.4837804512705638, + "learning_rate": 6.079473908142328e-07, + "loss": 0.5391, + "step": 17495 + }, + { + "epoch": 4.644630293375813, + "grad_norm": 0.4789590340781613, + "learning_rate": 6.077192115265881e-07, + "loss": 0.5223, + "step": 17496 + }, + { + "epoch": 4.644895791849197, + "grad_norm": 0.4735435715764255, + "learning_rate": 6.074910691430528e-07, + "loss": 0.5224, + "step": 17497 + }, + { + "epoch": 4.645161290322581, + "grad_norm": 0.48915196614027545, + "learning_rate": 6.072629636680744e-07, + "loss": 0.5418, + "step": 17498 + }, + { + "epoch": 4.645426788795964, + "grad_norm": 0.46584572037357247, + "learning_rate": 6.070348951061031e-07, + "loss": 0.5428, + "step": 17499 + }, + { + "epoch": 4.645692287269348, + "grad_norm": 0.48995319752052197, + "learning_rate": 6.068068634615862e-07, + "loss": 0.5644, + "step": 17500 + }, + { + "epoch": 4.645957785742732, + "grad_norm": 0.5090369309397824, + "learning_rate": 6.065788687389706e-07, + "loss": 0.5739, + "step": 17501 + }, + { + "epoch": 4.6462232842161155, + "grad_norm": 0.48019478309496105, + "learning_rate": 6.063509109427026e-07, + "loss": 0.5028, + "step": 17502 + }, + { + "epoch": 4.6464887826895, + "grad_norm": 0.4939007719504089, + "learning_rate": 6.061229900772286e-07, + "loss": 0.5528, + "step": 17503 + }, + { + "epoch": 4.646754281162884, + "grad_norm": 0.480556049844145, + "learning_rate": 6.058951061469931e-07, + "loss": 0.5177, + "step": 17504 + }, + { + "epoch": 4.647019779636267, + "grad_norm": 0.482218024270548, + "learning_rate": 6.056672591564409e-07, + "loss": 0.5476, + "step": 17505 + }, + { + "epoch": 4.647285278109651, + "grad_norm": 0.4849464296642132, + "learning_rate": 6.054394491100154e-07, + "loss": 0.5715, + "step": 17506 + }, + { + "epoch": 4.647550776583035, + "grad_norm": 0.4753320244116484, + "learning_rate": 6.052116760121588e-07, + "loss": 0.5483, + "step": 17507 + }, + { + "epoch": 4.647816275056418, + "grad_norm": 0.47972073926217157, + "learning_rate": 6.049839398673141e-07, + "loss": 0.5705, + "step": 17508 + }, + { + "epoch": 4.648081773529802, + "grad_norm": 0.4762911679638246, + "learning_rate": 6.04756240679922e-07, + "loss": 0.5069, + "step": 17509 + }, + { + "epoch": 4.6483472720031855, + "grad_norm": 0.4792631696037987, + "learning_rate": 6.045285784544242e-07, + "loss": 0.5519, + "step": 17510 + }, + { + "epoch": 4.64861277047657, + "grad_norm": 0.4641960891860845, + "learning_rate": 6.043009531952599e-07, + "loss": 0.5268, + "step": 17511 + }, + { + "epoch": 4.648878268949954, + "grad_norm": 0.4771457169299319, + "learning_rate": 6.040733649068686e-07, + "loss": 0.5365, + "step": 17512 + }, + { + "epoch": 4.649143767423338, + "grad_norm": 0.48621309220103204, + "learning_rate": 6.038458135936878e-07, + "loss": 0.5445, + "step": 17513 + }, + { + "epoch": 4.649409265896721, + "grad_norm": 0.49196159482123314, + "learning_rate": 6.036182992601572e-07, + "loss": 0.549, + "step": 17514 + }, + { + "epoch": 4.649674764370105, + "grad_norm": 0.4869467828458674, + "learning_rate": 6.033908219107121e-07, + "loss": 0.5067, + "step": 17515 + }, + { + "epoch": 4.649940262843488, + "grad_norm": 0.4963452041268275, + "learning_rate": 6.031633815497894e-07, + "loss": 0.5486, + "step": 17516 + }, + { + "epoch": 4.650205761316872, + "grad_norm": 0.47970259841092555, + "learning_rate": 6.02935978181827e-07, + "loss": 0.5442, + "step": 17517 + }, + { + "epoch": 4.650471259790256, + "grad_norm": 0.47584059691761377, + "learning_rate": 6.02708611811256e-07, + "loss": 0.5265, + "step": 17518 + }, + { + "epoch": 4.65073675826364, + "grad_norm": 0.48528805935740743, + "learning_rate": 6.024812824425133e-07, + "loss": 0.5506, + "step": 17519 + }, + { + "epoch": 4.651002256737024, + "grad_norm": 0.4871816146120627, + "learning_rate": 6.022539900800306e-07, + "loss": 0.5518, + "step": 17520 + }, + { + "epoch": 4.651267755210408, + "grad_norm": 0.48405214628733373, + "learning_rate": 6.020267347282427e-07, + "loss": 0.5479, + "step": 17521 + }, + { + "epoch": 4.651533253683791, + "grad_norm": 0.47814266671981204, + "learning_rate": 6.017995163915794e-07, + "loss": 0.5528, + "step": 17522 + }, + { + "epoch": 4.651798752157175, + "grad_norm": 0.4774122536340053, + "learning_rate": 6.01572335074474e-07, + "loss": 0.5079, + "step": 17523 + }, + { + "epoch": 4.652064250630559, + "grad_norm": 0.47598598565462225, + "learning_rate": 6.013451907813561e-07, + "loss": 0.5323, + "step": 17524 + }, + { + "epoch": 4.652329749103942, + "grad_norm": 0.47714727082369907, + "learning_rate": 6.011180835166553e-07, + "loss": 0.5472, + "step": 17525 + }, + { + "epoch": 4.6525952475773265, + "grad_norm": 0.4914648203897081, + "learning_rate": 6.008910132848018e-07, + "loss": 0.5788, + "step": 17526 + }, + { + "epoch": 4.6528607460507105, + "grad_norm": 0.48297420672682345, + "learning_rate": 6.006639800902223e-07, + "loss": 0.5333, + "step": 17527 + }, + { + "epoch": 4.653126244524094, + "grad_norm": 0.5029774032039211, + "learning_rate": 6.004369839373464e-07, + "loss": 0.5255, + "step": 17528 + }, + { + "epoch": 4.653391742997478, + "grad_norm": 0.4906322123643664, + "learning_rate": 6.002100248306003e-07, + "loss": 0.5043, + "step": 17529 + }, + { + "epoch": 4.653657241470862, + "grad_norm": 0.47776693799916914, + "learning_rate": 5.999831027744102e-07, + "loss": 0.5192, + "step": 17530 + }, + { + "epoch": 4.653922739944245, + "grad_norm": 0.4761885990384588, + "learning_rate": 5.997562177732011e-07, + "loss": 0.5277, + "step": 17531 + }, + { + "epoch": 4.654188238417629, + "grad_norm": 0.48960847557217146, + "learning_rate": 5.995293698313989e-07, + "loss": 0.541, + "step": 17532 + }, + { + "epoch": 4.654453736891012, + "grad_norm": 0.46827458238421205, + "learning_rate": 5.993025589534263e-07, + "loss": 0.5272, + "step": 17533 + }, + { + "epoch": 4.6547192353643965, + "grad_norm": 0.48098516453084234, + "learning_rate": 5.990757851437084e-07, + "loss": 0.5146, + "step": 17534 + }, + { + "epoch": 4.6549847338377806, + "grad_norm": 0.48522723591487577, + "learning_rate": 5.98849048406667e-07, + "loss": 0.5346, + "step": 17535 + }, + { + "epoch": 4.655250232311165, + "grad_norm": 0.4885720496346273, + "learning_rate": 5.98622348746723e-07, + "loss": 0.6009, + "step": 17536 + }, + { + "epoch": 4.655515730784548, + "grad_norm": 0.4724206205717385, + "learning_rate": 5.983956861682996e-07, + "loss": 0.521, + "step": 17537 + }, + { + "epoch": 4.655781229257932, + "grad_norm": 0.48080486246124465, + "learning_rate": 5.981690606758153e-07, + "loss": 0.5242, + "step": 17538 + }, + { + "epoch": 4.656046727731315, + "grad_norm": 0.48382632734128944, + "learning_rate": 5.979424722736917e-07, + "loss": 0.561, + "step": 17539 + }, + { + "epoch": 4.656312226204699, + "grad_norm": 0.47459040019245186, + "learning_rate": 5.977159209663469e-07, + "loss": 0.522, + "step": 17540 + }, + { + "epoch": 4.656577724678083, + "grad_norm": 0.4960771640876124, + "learning_rate": 5.974894067581985e-07, + "loss": 0.5528, + "step": 17541 + }, + { + "epoch": 4.656843223151467, + "grad_norm": 0.4946943687706496, + "learning_rate": 5.972629296536656e-07, + "loss": 0.5504, + "step": 17542 + }, + { + "epoch": 4.657108721624851, + "grad_norm": 0.47413147783826876, + "learning_rate": 5.970364896571632e-07, + "loss": 0.5345, + "step": 17543 + }, + { + "epoch": 4.657374220098235, + "grad_norm": 0.48884385933972324, + "learning_rate": 5.968100867731097e-07, + "loss": 0.5576, + "step": 17544 + }, + { + "epoch": 4.657639718571618, + "grad_norm": 0.4723688093824866, + "learning_rate": 5.965837210059183e-07, + "loss": 0.5052, + "step": 17545 + }, + { + "epoch": 4.657905217045002, + "grad_norm": 0.5053242426606632, + "learning_rate": 5.963573923600061e-07, + "loss": 0.5286, + "step": 17546 + }, + { + "epoch": 4.658170715518386, + "grad_norm": 0.49443720557402704, + "learning_rate": 5.961311008397844e-07, + "loss": 0.5357, + "step": 17547 + }, + { + "epoch": 4.658436213991769, + "grad_norm": 0.4882227340649235, + "learning_rate": 5.959048464496684e-07, + "loss": 0.5503, + "step": 17548 + }, + { + "epoch": 4.658701712465153, + "grad_norm": 0.48024454238455566, + "learning_rate": 5.956786291940692e-07, + "loss": 0.5528, + "step": 17549 + }, + { + "epoch": 4.658967210938537, + "grad_norm": 0.4870678964927204, + "learning_rate": 5.954524490774e-07, + "loss": 0.5456, + "step": 17550 + }, + { + "epoch": 4.659232709411921, + "grad_norm": 0.4905170549489484, + "learning_rate": 5.952263061040706e-07, + "loss": 0.5258, + "step": 17551 + }, + { + "epoch": 4.659498207885305, + "grad_norm": 0.4659922391824595, + "learning_rate": 5.950002002784925e-07, + "loss": 0.5226, + "step": 17552 + }, + { + "epoch": 4.659763706358689, + "grad_norm": 0.49058509734621847, + "learning_rate": 5.947741316050748e-07, + "loss": 0.5563, + "step": 17553 + }, + { + "epoch": 4.660029204832072, + "grad_norm": 0.49105648665873314, + "learning_rate": 5.945481000882256e-07, + "loss": 0.485, + "step": 17554 + }, + { + "epoch": 4.660294703305456, + "grad_norm": 0.46140717033832296, + "learning_rate": 5.943221057323545e-07, + "loss": 0.5257, + "step": 17555 + }, + { + "epoch": 4.66056020177884, + "grad_norm": 0.47002253033801006, + "learning_rate": 5.940961485418676e-07, + "loss": 0.5219, + "step": 17556 + }, + { + "epoch": 4.660825700252223, + "grad_norm": 0.4683450032792175, + "learning_rate": 5.938702285211734e-07, + "loss": 0.5507, + "step": 17557 + }, + { + "epoch": 4.661091198725607, + "grad_norm": 0.4821275873845253, + "learning_rate": 5.936443456746757e-07, + "loss": 0.5077, + "step": 17558 + }, + { + "epoch": 4.6613566971989915, + "grad_norm": 0.47015878379510917, + "learning_rate": 5.934185000067813e-07, + "loss": 0.5359, + "step": 17559 + }, + { + "epoch": 4.661622195672375, + "grad_norm": 0.4819174436246306, + "learning_rate": 5.931926915218936e-07, + "loss": 0.5516, + "step": 17560 + }, + { + "epoch": 4.661887694145759, + "grad_norm": 0.49564587563414625, + "learning_rate": 5.929669202244171e-07, + "loss": 0.5257, + "step": 17561 + }, + { + "epoch": 4.662153192619142, + "grad_norm": 0.4722876064451484, + "learning_rate": 5.927411861187557e-07, + "loss": 0.5582, + "step": 17562 + }, + { + "epoch": 4.662418691092526, + "grad_norm": 0.4713146127236207, + "learning_rate": 5.925154892093107e-07, + "loss": 0.5088, + "step": 17563 + }, + { + "epoch": 4.66268418956591, + "grad_norm": 0.4978120518295372, + "learning_rate": 5.922898295004842e-07, + "loss": 0.5377, + "step": 17564 + }, + { + "epoch": 4.662949688039294, + "grad_norm": 0.4840002431160575, + "learning_rate": 5.920642069966761e-07, + "loss": 0.5594, + "step": 17565 + }, + { + "epoch": 4.663215186512677, + "grad_norm": 0.48583530444246537, + "learning_rate": 5.918386217022882e-07, + "loss": 0.5554, + "step": 17566 + }, + { + "epoch": 4.6634806849860615, + "grad_norm": 0.49066548228280116, + "learning_rate": 5.916130736217184e-07, + "loss": 0.5542, + "step": 17567 + }, + { + "epoch": 4.663746183459445, + "grad_norm": 0.48258149995797395, + "learning_rate": 5.913875627593668e-07, + "loss": 0.5564, + "step": 17568 + }, + { + "epoch": 4.664011681932829, + "grad_norm": 0.47329799252740956, + "learning_rate": 5.91162089119631e-07, + "loss": 0.5197, + "step": 17569 + }, + { + "epoch": 4.664277180406213, + "grad_norm": 0.4713087173524675, + "learning_rate": 5.909366527069074e-07, + "loss": 0.5379, + "step": 17570 + }, + { + "epoch": 4.664542678879596, + "grad_norm": 0.5033998052036267, + "learning_rate": 5.907112535255938e-07, + "loss": 0.5525, + "step": 17571 + }, + { + "epoch": 4.66480817735298, + "grad_norm": 0.4879632228198571, + "learning_rate": 5.904858915800849e-07, + "loss": 0.5625, + "step": 17572 + }, + { + "epoch": 4.665073675826364, + "grad_norm": 0.4731921613301923, + "learning_rate": 5.902605668747771e-07, + "loss": 0.5282, + "step": 17573 + }, + { + "epoch": 4.665339174299747, + "grad_norm": 0.504304310238166, + "learning_rate": 5.900352794140635e-07, + "loss": 0.5786, + "step": 17574 + }, + { + "epoch": 4.6656046727731315, + "grad_norm": 0.48854622053171703, + "learning_rate": 5.898100292023396e-07, + "loss": 0.5413, + "step": 17575 + }, + { + "epoch": 4.665870171246516, + "grad_norm": 0.46770579152568525, + "learning_rate": 5.895848162439957e-07, + "loss": 0.5137, + "step": 17576 + }, + { + "epoch": 4.666135669719899, + "grad_norm": 0.4809217873879484, + "learning_rate": 5.89359640543426e-07, + "loss": 0.5576, + "step": 17577 + }, + { + "epoch": 4.666401168193283, + "grad_norm": 0.4846342404448908, + "learning_rate": 5.89134502105021e-07, + "loss": 0.5294, + "step": 17578 + }, + { + "epoch": 4.666666666666667, + "grad_norm": 0.46090536410155997, + "learning_rate": 5.889094009331723e-07, + "loss": 0.5338, + "step": 17579 + }, + { + "epoch": 4.66693216514005, + "grad_norm": 0.4896591115933786, + "learning_rate": 5.886843370322692e-07, + "loss": 0.5715, + "step": 17580 + }, + { + "epoch": 4.667197663613434, + "grad_norm": 0.48910987785018056, + "learning_rate": 5.884593104067007e-07, + "loss": 0.532, + "step": 17581 + }, + { + "epoch": 4.667463162086818, + "grad_norm": 0.4765665283913625, + "learning_rate": 5.882343210608568e-07, + "loss": 0.5524, + "step": 17582 + }, + { + "epoch": 4.6677286605602015, + "grad_norm": 0.48390681919362444, + "learning_rate": 5.880093689991237e-07, + "loss": 0.5548, + "step": 17583 + }, + { + "epoch": 4.667994159033586, + "grad_norm": 0.47642255429498725, + "learning_rate": 5.877844542258898e-07, + "loss": 0.5641, + "step": 17584 + }, + { + "epoch": 4.66825965750697, + "grad_norm": 0.48236550095023334, + "learning_rate": 5.875595767455403e-07, + "loss": 0.5566, + "step": 17585 + }, + { + "epoch": 4.668525155980353, + "grad_norm": 0.46751842960521256, + "learning_rate": 5.873347365624624e-07, + "loss": 0.53, + "step": 17586 + }, + { + "epoch": 4.668790654453737, + "grad_norm": 0.4742034038382578, + "learning_rate": 5.871099336810399e-07, + "loss": 0.5416, + "step": 17587 + }, + { + "epoch": 4.669056152927121, + "grad_norm": 0.4814675753361654, + "learning_rate": 5.868851681056567e-07, + "loss": 0.5396, + "step": 17588 + }, + { + "epoch": 4.669321651400504, + "grad_norm": 0.4977385163735499, + "learning_rate": 5.866604398406974e-07, + "loss": 0.5444, + "step": 17589 + }, + { + "epoch": 4.669587149873888, + "grad_norm": 0.4851284381527935, + "learning_rate": 5.864357488905436e-07, + "loss": 0.5472, + "step": 17590 + }, + { + "epoch": 4.6698526483472715, + "grad_norm": 0.48440975359483257, + "learning_rate": 5.862110952595784e-07, + "loss": 0.5273, + "step": 17591 + }, + { + "epoch": 4.670118146820656, + "grad_norm": 0.4836313881015596, + "learning_rate": 5.859864789521829e-07, + "loss": 0.5526, + "step": 17592 + }, + { + "epoch": 4.67038364529404, + "grad_norm": 0.48868472492330045, + "learning_rate": 5.857618999727374e-07, + "loss": 0.5491, + "step": 17593 + }, + { + "epoch": 4.670649143767424, + "grad_norm": 0.4733038777661721, + "learning_rate": 5.855373583256208e-07, + "loss": 0.5218, + "step": 17594 + }, + { + "epoch": 4.670914642240807, + "grad_norm": 0.49228433959135315, + "learning_rate": 5.85312854015214e-07, + "loss": 0.5149, + "step": 17595 + }, + { + "epoch": 4.671180140714191, + "grad_norm": 0.4849395865814369, + "learning_rate": 5.850883870458938e-07, + "loss": 0.5666, + "step": 17596 + }, + { + "epoch": 4.671445639187574, + "grad_norm": 0.49142280099286195, + "learning_rate": 5.848639574220388e-07, + "loss": 0.5419, + "step": 17597 + }, + { + "epoch": 4.671711137660958, + "grad_norm": 0.47582298862092975, + "learning_rate": 5.846395651480263e-07, + "loss": 0.549, + "step": 17598 + }, + { + "epoch": 4.671976636134342, + "grad_norm": 0.4969085833713176, + "learning_rate": 5.844152102282308e-07, + "loss": 0.5531, + "step": 17599 + }, + { + "epoch": 4.672242134607726, + "grad_norm": 0.48560548166621376, + "learning_rate": 5.841908926670298e-07, + "loss": 0.5412, + "step": 17600 + }, + { + "epoch": 4.67250763308111, + "grad_norm": 0.4881200512521312, + "learning_rate": 5.839666124687963e-07, + "loss": 0.5385, + "step": 17601 + }, + { + "epoch": 4.672773131554494, + "grad_norm": 0.4848123636599459, + "learning_rate": 5.837423696379058e-07, + "loss": 0.5515, + "step": 17602 + }, + { + "epoch": 4.673038630027877, + "grad_norm": 0.48706127766612134, + "learning_rate": 5.83518164178731e-07, + "loss": 0.5335, + "step": 17603 + }, + { + "epoch": 4.673304128501261, + "grad_norm": 0.47707888814198224, + "learning_rate": 5.832939960956443e-07, + "loss": 0.5456, + "step": 17604 + }, + { + "epoch": 4.673569626974645, + "grad_norm": 0.48954553381221605, + "learning_rate": 5.83069865393017e-07, + "loss": 0.5707, + "step": 17605 + }, + { + "epoch": 4.673835125448028, + "grad_norm": 0.4723074869203257, + "learning_rate": 5.828457720752212e-07, + "loss": 0.525, + "step": 17606 + }, + { + "epoch": 4.6741006239214125, + "grad_norm": 0.47291595883586074, + "learning_rate": 5.826217161466263e-07, + "loss": 0.5376, + "step": 17607 + }, + { + "epoch": 4.6743661223947965, + "grad_norm": 0.49306722599386016, + "learning_rate": 5.823976976116025e-07, + "loss": 0.529, + "step": 17608 + }, + { + "epoch": 4.67463162086818, + "grad_norm": 0.4845012409211947, + "learning_rate": 5.8217371647452e-07, + "loss": 0.5562, + "step": 17609 + }, + { + "epoch": 4.674897119341564, + "grad_norm": 0.48527092002464906, + "learning_rate": 5.819497727397444e-07, + "loss": 0.5487, + "step": 17610 + }, + { + "epoch": 4.675162617814948, + "grad_norm": 0.48576677735608037, + "learning_rate": 5.817258664116451e-07, + "loss": 0.5898, + "step": 17611 + }, + { + "epoch": 4.675428116288331, + "grad_norm": 0.47639349612336684, + "learning_rate": 5.815019974945873e-07, + "loss": 0.5193, + "step": 17612 + }, + { + "epoch": 4.675693614761715, + "grad_norm": 0.4838384117617082, + "learning_rate": 5.81278165992939e-07, + "loss": 0.5582, + "step": 17613 + }, + { + "epoch": 4.675959113235099, + "grad_norm": 0.4623622340569345, + "learning_rate": 5.810543719110631e-07, + "loss": 0.5408, + "step": 17614 + }, + { + "epoch": 4.6762246117084825, + "grad_norm": 0.4900859380467966, + "learning_rate": 5.808306152533264e-07, + "loss": 0.5425, + "step": 17615 + }, + { + "epoch": 4.6764901101818666, + "grad_norm": 0.4803258612960572, + "learning_rate": 5.806068960240912e-07, + "loss": 0.5525, + "step": 17616 + }, + { + "epoch": 4.676755608655251, + "grad_norm": 0.4684219371412779, + "learning_rate": 5.803832142277208e-07, + "loss": 0.4709, + "step": 17617 + }, + { + "epoch": 4.677021107128634, + "grad_norm": 0.4851800992748377, + "learning_rate": 5.80159569868578e-07, + "loss": 0.5329, + "step": 17618 + }, + { + "epoch": 4.677286605602018, + "grad_norm": 0.4838513057364973, + "learning_rate": 5.799359629510238e-07, + "loss": 0.5374, + "step": 17619 + }, + { + "epoch": 4.677552104075401, + "grad_norm": 0.4764960125443222, + "learning_rate": 5.797123934794199e-07, + "loss": 0.549, + "step": 17620 + }, + { + "epoch": 4.677817602548785, + "grad_norm": 0.47855145978086844, + "learning_rate": 5.794888614581262e-07, + "loss": 0.5184, + "step": 17621 + }, + { + "epoch": 4.678083101022169, + "grad_norm": 0.4755579249056458, + "learning_rate": 5.792653668915019e-07, + "loss": 0.5589, + "step": 17622 + }, + { + "epoch": 4.678348599495553, + "grad_norm": 0.4753189801687665, + "learning_rate": 5.79041909783905e-07, + "loss": 0.5257, + "step": 17623 + }, + { + "epoch": 4.678614097968937, + "grad_norm": 0.48193284402381986, + "learning_rate": 5.788184901396948e-07, + "loss": 0.5382, + "step": 17624 + }, + { + "epoch": 4.678879596442321, + "grad_norm": 0.4890202180691973, + "learning_rate": 5.785951079632273e-07, + "loss": 0.5439, + "step": 17625 + }, + { + "epoch": 4.679145094915704, + "grad_norm": 0.48442205472478506, + "learning_rate": 5.783717632588604e-07, + "loss": 0.5433, + "step": 17626 + }, + { + "epoch": 4.679410593389088, + "grad_norm": 0.4760361706132955, + "learning_rate": 5.781484560309489e-07, + "loss": 0.512, + "step": 17627 + }, + { + "epoch": 4.679676091862472, + "grad_norm": 0.46979925260422123, + "learning_rate": 5.779251862838473e-07, + "loss": 0.5184, + "step": 17628 + }, + { + "epoch": 4.679941590335855, + "grad_norm": 0.48876903818091666, + "learning_rate": 5.777019540219115e-07, + "loss": 0.5271, + "step": 17629 + }, + { + "epoch": 4.680207088809239, + "grad_norm": 0.4919130465922979, + "learning_rate": 5.774787592494932e-07, + "loss": 0.5367, + "step": 17630 + }, + { + "epoch": 4.680472587282623, + "grad_norm": 0.4912653966397032, + "learning_rate": 5.772556019709471e-07, + "loss": 0.5471, + "step": 17631 + }, + { + "epoch": 4.680738085756007, + "grad_norm": 0.4677955252629279, + "learning_rate": 5.770324821906245e-07, + "loss": 0.5156, + "step": 17632 + }, + { + "epoch": 4.681003584229391, + "grad_norm": 0.4782225828547463, + "learning_rate": 5.76809399912876e-07, + "loss": 0.548, + "step": 17633 + }, + { + "epoch": 4.681269082702775, + "grad_norm": 0.5007467722677937, + "learning_rate": 5.765863551420536e-07, + "loss": 0.5408, + "step": 17634 + }, + { + "epoch": 4.681534581176158, + "grad_norm": 0.4791252201571909, + "learning_rate": 5.763633478825062e-07, + "loss": 0.5242, + "step": 17635 + }, + { + "epoch": 4.681800079649542, + "grad_norm": 0.48141369199550105, + "learning_rate": 5.761403781385838e-07, + "loss": 0.5128, + "step": 17636 + }, + { + "epoch": 4.682065578122926, + "grad_norm": 0.4851972073438632, + "learning_rate": 5.759174459146338e-07, + "loss": 0.5166, + "step": 17637 + }, + { + "epoch": 4.682331076596309, + "grad_norm": 0.4688306452182194, + "learning_rate": 5.756945512150062e-07, + "loss": 0.4894, + "step": 17638 + }, + { + "epoch": 4.682596575069693, + "grad_norm": 0.4763806003390963, + "learning_rate": 5.754716940440444e-07, + "loss": 0.5412, + "step": 17639 + }, + { + "epoch": 4.6828620735430775, + "grad_norm": 0.467762019792464, + "learning_rate": 5.752488744060977e-07, + "loss": 0.5163, + "step": 17640 + }, + { + "epoch": 4.683127572016461, + "grad_norm": 0.48043048877006794, + "learning_rate": 5.750260923055098e-07, + "loss": 0.5481, + "step": 17641 + }, + { + "epoch": 4.683393070489845, + "grad_norm": 0.4782161327375079, + "learning_rate": 5.748033477466272e-07, + "loss": 0.534, + "step": 17642 + }, + { + "epoch": 4.683658568963228, + "grad_norm": 0.48181523038820157, + "learning_rate": 5.74580640733792e-07, + "loss": 0.4998, + "step": 17643 + }, + { + "epoch": 4.683924067436612, + "grad_norm": 0.48400657392850793, + "learning_rate": 5.743579712713493e-07, + "loss": 0.534, + "step": 17644 + }, + { + "epoch": 4.684189565909996, + "grad_norm": 0.48203313466833236, + "learning_rate": 5.741353393636409e-07, + "loss": 0.5444, + "step": 17645 + }, + { + "epoch": 4.68445506438338, + "grad_norm": 0.46303026024735244, + "learning_rate": 5.739127450150081e-07, + "loss": 0.5056, + "step": 17646 + }, + { + "epoch": 4.684720562856763, + "grad_norm": 0.49002623863633915, + "learning_rate": 5.736901882297935e-07, + "loss": 0.5391, + "step": 17647 + }, + { + "epoch": 4.6849860613301475, + "grad_norm": 0.48493375159790675, + "learning_rate": 5.734676690123359e-07, + "loss": 0.5643, + "step": 17648 + }, + { + "epoch": 4.685251559803531, + "grad_norm": 0.47225961223539814, + "learning_rate": 5.732451873669762e-07, + "loss": 0.588, + "step": 17649 + }, + { + "epoch": 4.685517058276915, + "grad_norm": 0.4825259799409351, + "learning_rate": 5.730227432980529e-07, + "loss": 0.5431, + "step": 17650 + }, + { + "epoch": 4.685782556750299, + "grad_norm": 0.47259817650436947, + "learning_rate": 5.728003368099042e-07, + "loss": 0.5239, + "step": 17651 + }, + { + "epoch": 4.686048055223683, + "grad_norm": 0.48974897267404094, + "learning_rate": 5.725779679068669e-07, + "loss": 0.5031, + "step": 17652 + }, + { + "epoch": 4.686313553697066, + "grad_norm": 0.4779099885894124, + "learning_rate": 5.723556365932783e-07, + "loss": 0.5467, + "step": 17653 + }, + { + "epoch": 4.68657905217045, + "grad_norm": 0.4725783187954063, + "learning_rate": 5.721333428734751e-07, + "loss": 0.5717, + "step": 17654 + }, + { + "epoch": 4.686844550643833, + "grad_norm": 0.4718272669169045, + "learning_rate": 5.719110867517921e-07, + "loss": 0.4784, + "step": 17655 + }, + { + "epoch": 4.6871100491172175, + "grad_norm": 0.4787208466826315, + "learning_rate": 5.716888682325636e-07, + "loss": 0.5243, + "step": 17656 + }, + { + "epoch": 4.687375547590602, + "grad_norm": 0.48185092113592476, + "learning_rate": 5.714666873201227e-07, + "loss": 0.5194, + "step": 17657 + }, + { + "epoch": 4.687641046063985, + "grad_norm": 0.46988242520776813, + "learning_rate": 5.712445440188042e-07, + "loss": 0.5385, + "step": 17658 + }, + { + "epoch": 4.687906544537369, + "grad_norm": 0.4992162803197245, + "learning_rate": 5.710224383329388e-07, + "loss": 0.5161, + "step": 17659 + }, + { + "epoch": 4.688172043010753, + "grad_norm": 0.4853268441272868, + "learning_rate": 5.708003702668594e-07, + "loss": 0.5066, + "step": 17660 + }, + { + "epoch": 4.688437541484136, + "grad_norm": 0.49216437185841266, + "learning_rate": 5.705783398248962e-07, + "loss": 0.5309, + "step": 17661 + }, + { + "epoch": 4.68870303995752, + "grad_norm": 0.48741835680902423, + "learning_rate": 5.703563470113787e-07, + "loss": 0.5506, + "step": 17662 + }, + { + "epoch": 4.688968538430904, + "grad_norm": 0.49334566346511916, + "learning_rate": 5.701343918306379e-07, + "loss": 0.5507, + "step": 17663 + }, + { + "epoch": 4.6892340369042875, + "grad_norm": 0.4863879418295176, + "learning_rate": 5.699124742870008e-07, + "loss": 0.5418, + "step": 17664 + }, + { + "epoch": 4.689499535377672, + "grad_norm": 0.4850993576671298, + "learning_rate": 5.696905943847966e-07, + "loss": 0.5512, + "step": 17665 + }, + { + "epoch": 4.689765033851056, + "grad_norm": 0.4745206470230527, + "learning_rate": 5.694687521283515e-07, + "loss": 0.5532, + "step": 17666 + }, + { + "epoch": 4.690030532324439, + "grad_norm": 0.48718978757060233, + "learning_rate": 5.692469475219939e-07, + "loss": 0.5715, + "step": 17667 + }, + { + "epoch": 4.690296030797823, + "grad_norm": 0.4935803637370086, + "learning_rate": 5.690251805700467e-07, + "loss": 0.5268, + "step": 17668 + }, + { + "epoch": 4.690561529271207, + "grad_norm": 0.49716031661918836, + "learning_rate": 5.68803451276837e-07, + "loss": 0.5605, + "step": 17669 + }, + { + "epoch": 4.69082702774459, + "grad_norm": 0.47096071214419916, + "learning_rate": 5.685817596466878e-07, + "loss": 0.542, + "step": 17670 + }, + { + "epoch": 4.691092526217974, + "grad_norm": 0.4782796887933522, + "learning_rate": 5.683601056839239e-07, + "loss": 0.4998, + "step": 17671 + }, + { + "epoch": 4.6913580246913575, + "grad_norm": 0.47083572006107266, + "learning_rate": 5.681384893928666e-07, + "loss": 0.5197, + "step": 17672 + }, + { + "epoch": 4.691623523164742, + "grad_norm": 0.4687143729760963, + "learning_rate": 5.679169107778396e-07, + "loss": 0.531, + "step": 17673 + }, + { + "epoch": 4.691889021638126, + "grad_norm": 0.48754782802963226, + "learning_rate": 5.676953698431631e-07, + "loss": 0.5154, + "step": 17674 + }, + { + "epoch": 4.69215452011151, + "grad_norm": 0.4826093035524664, + "learning_rate": 5.674738665931575e-07, + "loss": 0.5472, + "step": 17675 + }, + { + "epoch": 4.692420018584893, + "grad_norm": 0.4725475949278952, + "learning_rate": 5.672524010321439e-07, + "loss": 0.5239, + "step": 17676 + }, + { + "epoch": 4.692685517058277, + "grad_norm": 0.4802382598068678, + "learning_rate": 5.6703097316444e-07, + "loss": 0.5338, + "step": 17677 + }, + { + "epoch": 4.69295101553166, + "grad_norm": 0.47618119521432156, + "learning_rate": 5.668095829943654e-07, + "loss": 0.5257, + "step": 17678 + }, + { + "epoch": 4.693216514005044, + "grad_norm": 0.4829501546062373, + "learning_rate": 5.665882305262371e-07, + "loss": 0.5273, + "step": 17679 + }, + { + "epoch": 4.6934820124784284, + "grad_norm": 0.497360091971884, + "learning_rate": 5.663669157643717e-07, + "loss": 0.5423, + "step": 17680 + }, + { + "epoch": 4.693747510951812, + "grad_norm": 0.48470220784625606, + "learning_rate": 5.661456387130865e-07, + "loss": 0.5227, + "step": 17681 + }, + { + "epoch": 4.694013009425196, + "grad_norm": 0.48083656551025106, + "learning_rate": 5.659243993766953e-07, + "loss": 0.5273, + "step": 17682 + }, + { + "epoch": 4.69427850789858, + "grad_norm": 0.46991881641280187, + "learning_rate": 5.657031977595145e-07, + "loss": 0.5206, + "step": 17683 + }, + { + "epoch": 4.694544006371963, + "grad_norm": 0.47428118763504223, + "learning_rate": 5.654820338658573e-07, + "loss": 0.5159, + "step": 17684 + }, + { + "epoch": 4.694809504845347, + "grad_norm": 0.48141874363334025, + "learning_rate": 5.65260907700037e-07, + "loss": 0.529, + "step": 17685 + }, + { + "epoch": 4.695075003318731, + "grad_norm": 0.4807693940853336, + "learning_rate": 5.650398192663653e-07, + "loss": 0.5187, + "step": 17686 + }, + { + "epoch": 4.695340501792114, + "grad_norm": 0.47242237498816697, + "learning_rate": 5.648187685691556e-07, + "loss": 0.5219, + "step": 17687 + }, + { + "epoch": 4.6956060002654985, + "grad_norm": 0.4660428159676864, + "learning_rate": 5.645977556127171e-07, + "loss": 0.5459, + "step": 17688 + }, + { + "epoch": 4.6958714987388825, + "grad_norm": 0.47842588385539825, + "learning_rate": 5.64376780401362e-07, + "loss": 0.5063, + "step": 17689 + }, + { + "epoch": 4.696136997212266, + "grad_norm": 0.49167894992186506, + "learning_rate": 5.64155842939399e-07, + "loss": 0.524, + "step": 17690 + }, + { + "epoch": 4.69640249568565, + "grad_norm": 0.4832866071866792, + "learning_rate": 5.639349432311358e-07, + "loss": 0.5551, + "step": 17691 + }, + { + "epoch": 4.696667994159034, + "grad_norm": 0.4719939196201807, + "learning_rate": 5.637140812808822e-07, + "loss": 0.5306, + "step": 17692 + }, + { + "epoch": 4.696933492632417, + "grad_norm": 0.4825039411499423, + "learning_rate": 5.634932570929444e-07, + "loss": 0.5563, + "step": 17693 + }, + { + "epoch": 4.697198991105801, + "grad_norm": 0.46505992805491425, + "learning_rate": 5.6327247067163e-07, + "loss": 0.5036, + "step": 17694 + }, + { + "epoch": 4.697464489579185, + "grad_norm": 0.48955913827148284, + "learning_rate": 5.630517220212436e-07, + "loss": 0.5732, + "step": 17695 + }, + { + "epoch": 4.6977299880525685, + "grad_norm": 0.4790493167250704, + "learning_rate": 5.628310111460927e-07, + "loss": 0.5124, + "step": 17696 + }, + { + "epoch": 4.697995486525953, + "grad_norm": 0.47657144084990877, + "learning_rate": 5.626103380504785e-07, + "loss": 0.5246, + "step": 17697 + }, + { + "epoch": 4.698260984999337, + "grad_norm": 0.47579745159895026, + "learning_rate": 5.623897027387066e-07, + "loss": 0.5492, + "step": 17698 + }, + { + "epoch": 4.69852648347272, + "grad_norm": 0.49439699695857486, + "learning_rate": 5.621691052150799e-07, + "loss": 0.5499, + "step": 17699 + }, + { + "epoch": 4.698791981946104, + "grad_norm": 0.4856148155891063, + "learning_rate": 5.619485454838996e-07, + "loss": 0.5149, + "step": 17700 + }, + { + "epoch": 4.699057480419487, + "grad_norm": 0.4711075584651752, + "learning_rate": 5.617280235494696e-07, + "loss": 0.5551, + "step": 17701 + }, + { + "epoch": 4.699322978892871, + "grad_norm": 0.4888132691610238, + "learning_rate": 5.615075394160871e-07, + "loss": 0.5263, + "step": 17702 + }, + { + "epoch": 4.699588477366255, + "grad_norm": 0.48477841209254213, + "learning_rate": 5.612870930880546e-07, + "loss": 0.5275, + "step": 17703 + }, + { + "epoch": 4.699853975839639, + "grad_norm": 0.500526299657142, + "learning_rate": 5.6106668456967e-07, + "loss": 0.5413, + "step": 17704 + }, + { + "epoch": 4.700119474313023, + "grad_norm": 0.4891374348174846, + "learning_rate": 5.608463138652329e-07, + "loss": 0.538, + "step": 17705 + }, + { + "epoch": 4.700384972786407, + "grad_norm": 0.4928144589744349, + "learning_rate": 5.606259809790398e-07, + "loss": 0.5328, + "step": 17706 + }, + { + "epoch": 4.70065047125979, + "grad_norm": 0.46935866802592463, + "learning_rate": 5.604056859153895e-07, + "loss": 0.5232, + "step": 17707 + }, + { + "epoch": 4.700915969733174, + "grad_norm": 0.4775826680745534, + "learning_rate": 5.601854286785769e-07, + "loss": 0.5004, + "step": 17708 + }, + { + "epoch": 4.701181468206558, + "grad_norm": 0.47012894265697863, + "learning_rate": 5.599652092728974e-07, + "loss": 0.5242, + "step": 17709 + }, + { + "epoch": 4.701446966679941, + "grad_norm": 0.482002968086078, + "learning_rate": 5.59745027702647e-07, + "loss": 0.5216, + "step": 17710 + }, + { + "epoch": 4.701712465153325, + "grad_norm": 0.48505688665332697, + "learning_rate": 5.595248839721185e-07, + "loss": 0.5591, + "step": 17711 + }, + { + "epoch": 4.701977963626709, + "grad_norm": 0.48418410369329307, + "learning_rate": 5.593047780856065e-07, + "loss": 0.5401, + "step": 17712 + }, + { + "epoch": 4.702243462100093, + "grad_norm": 0.4705404933901663, + "learning_rate": 5.590847100474031e-07, + "loss": 0.5462, + "step": 17713 + }, + { + "epoch": 4.702508960573477, + "grad_norm": 0.47372703100535973, + "learning_rate": 5.588646798618e-07, + "loss": 0.5053, + "step": 17714 + }, + { + "epoch": 4.702774459046861, + "grad_norm": 0.5028926597338859, + "learning_rate": 5.586446875330875e-07, + "loss": 0.5546, + "step": 17715 + }, + { + "epoch": 4.703039957520244, + "grad_norm": 0.4951186410103098, + "learning_rate": 5.584247330655577e-07, + "loss": 0.5375, + "step": 17716 + }, + { + "epoch": 4.703305455993628, + "grad_norm": 0.4742472055461465, + "learning_rate": 5.582048164634984e-07, + "loss": 0.582, + "step": 17717 + }, + { + "epoch": 4.703570954467012, + "grad_norm": 0.4692746048232161, + "learning_rate": 5.579849377312005e-07, + "loss": 0.5112, + "step": 17718 + }, + { + "epoch": 4.703836452940395, + "grad_norm": 0.48377818946665885, + "learning_rate": 5.577650968729512e-07, + "loss": 0.5491, + "step": 17719 + }, + { + "epoch": 4.704101951413779, + "grad_norm": 0.4886841292710266, + "learning_rate": 5.575452938930373e-07, + "loss": 0.5322, + "step": 17720 + }, + { + "epoch": 4.7043674498871635, + "grad_norm": 0.4736128380926772, + "learning_rate": 5.573255287957466e-07, + "loss": 0.5256, + "step": 17721 + }, + { + "epoch": 4.704632948360547, + "grad_norm": 0.5512949986442008, + "learning_rate": 5.571058015853639e-07, + "loss": 0.548, + "step": 17722 + }, + { + "epoch": 4.704898446833931, + "grad_norm": 0.48024427215948706, + "learning_rate": 5.568861122661759e-07, + "loss": 0.5531, + "step": 17723 + }, + { + "epoch": 4.705163945307315, + "grad_norm": 0.4790075735255522, + "learning_rate": 5.566664608424663e-07, + "loss": 0.5321, + "step": 17724 + }, + { + "epoch": 4.705429443780698, + "grad_norm": 0.4773187956376124, + "learning_rate": 5.56446847318518e-07, + "loss": 0.5383, + "step": 17725 + }, + { + "epoch": 4.705694942254082, + "grad_norm": 0.47673629409431423, + "learning_rate": 5.562272716986155e-07, + "loss": 0.5098, + "step": 17726 + }, + { + "epoch": 4.705960440727466, + "grad_norm": 0.4690923263898515, + "learning_rate": 5.560077339870398e-07, + "loss": 0.5068, + "step": 17727 + }, + { + "epoch": 4.706225939200849, + "grad_norm": 0.4777111756311077, + "learning_rate": 5.557882341880735e-07, + "loss": 0.5127, + "step": 17728 + }, + { + "epoch": 4.7064914376742335, + "grad_norm": 0.4769588377876074, + "learning_rate": 5.555687723059963e-07, + "loss": 0.5436, + "step": 17729 + }, + { + "epoch": 4.706756936147617, + "grad_norm": 0.4838361199272956, + "learning_rate": 5.553493483450903e-07, + "loss": 0.5429, + "step": 17730 + }, + { + "epoch": 4.707022434621001, + "grad_norm": 0.48380839188958163, + "learning_rate": 5.551299623096318e-07, + "loss": 0.5523, + "step": 17731 + }, + { + "epoch": 4.707287933094385, + "grad_norm": 0.4704060853857825, + "learning_rate": 5.549106142039018e-07, + "loss": 0.5343, + "step": 17732 + }, + { + "epoch": 4.707553431567769, + "grad_norm": 0.49027113289736296, + "learning_rate": 5.546913040321763e-07, + "loss": 0.4773, + "step": 17733 + }, + { + "epoch": 4.707818930041152, + "grad_norm": 0.4808552880080877, + "learning_rate": 5.544720317987343e-07, + "loss": 0.5647, + "step": 17734 + }, + { + "epoch": 4.708084428514536, + "grad_norm": 0.47396429879709573, + "learning_rate": 5.542527975078502e-07, + "loss": 0.5153, + "step": 17735 + }, + { + "epoch": 4.708349926987919, + "grad_norm": 0.46603318972425906, + "learning_rate": 5.540336011638012e-07, + "loss": 0.505, + "step": 17736 + }, + { + "epoch": 4.7086154254613035, + "grad_norm": 0.48374807699342665, + "learning_rate": 5.538144427708617e-07, + "loss": 0.5579, + "step": 17737 + }, + { + "epoch": 4.708880923934688, + "grad_norm": 0.4839115586845375, + "learning_rate": 5.535953223333049e-07, + "loss": 0.523, + "step": 17738 + }, + { + "epoch": 4.709146422408071, + "grad_norm": 0.4651048068699564, + "learning_rate": 5.533762398554057e-07, + "loss": 0.5244, + "step": 17739 + }, + { + "epoch": 4.709411920881455, + "grad_norm": 0.4781862500942812, + "learning_rate": 5.531571953414353e-07, + "loss": 0.5055, + "step": 17740 + }, + { + "epoch": 4.709677419354839, + "grad_norm": 0.4818592551247389, + "learning_rate": 5.529381887956667e-07, + "loss": 0.5417, + "step": 17741 + }, + { + "epoch": 4.709942917828222, + "grad_norm": 0.47474156603044654, + "learning_rate": 5.527192202223708e-07, + "loss": 0.5122, + "step": 17742 + }, + { + "epoch": 4.710208416301606, + "grad_norm": 0.46940917969442003, + "learning_rate": 5.52500289625818e-07, + "loss": 0.5162, + "step": 17743 + }, + { + "epoch": 4.71047391477499, + "grad_norm": 0.4720338835065068, + "learning_rate": 5.522813970102769e-07, + "loss": 0.506, + "step": 17744 + }, + { + "epoch": 4.7107394132483735, + "grad_norm": 0.48044926928744863, + "learning_rate": 5.520625423800177e-07, + "loss": 0.5672, + "step": 17745 + }, + { + "epoch": 4.711004911721758, + "grad_norm": 0.48480859747769245, + "learning_rate": 5.518437257393088e-07, + "loss": 0.5384, + "step": 17746 + }, + { + "epoch": 4.711270410195142, + "grad_norm": 0.4765361350458473, + "learning_rate": 5.516249470924165e-07, + "loss": 0.5426, + "step": 17747 + }, + { + "epoch": 4.711535908668525, + "grad_norm": 0.48704799483949296, + "learning_rate": 5.514062064436096e-07, + "loss": 0.5326, + "step": 17748 + }, + { + "epoch": 4.711801407141909, + "grad_norm": 0.4713546300070442, + "learning_rate": 5.511875037971512e-07, + "loss": 0.5312, + "step": 17749 + }, + { + "epoch": 4.712066905615293, + "grad_norm": 0.4696325781583951, + "learning_rate": 5.509688391573087e-07, + "loss": 0.5604, + "step": 17750 + }, + { + "epoch": 4.712332404088676, + "grad_norm": 0.4797799861549737, + "learning_rate": 5.507502125283454e-07, + "loss": 0.5668, + "step": 17751 + }, + { + "epoch": 4.71259790256206, + "grad_norm": 0.48349585165362075, + "learning_rate": 5.505316239145261e-07, + "loss": 0.5257, + "step": 17752 + }, + { + "epoch": 4.712863401035444, + "grad_norm": 0.4810519624117421, + "learning_rate": 5.503130733201132e-07, + "loss": 0.5166, + "step": 17753 + }, + { + "epoch": 4.713128899508828, + "grad_norm": 0.4785625845715155, + "learning_rate": 5.500945607493682e-07, + "loss": 0.5432, + "step": 17754 + }, + { + "epoch": 4.713394397982212, + "grad_norm": 0.49089174685542164, + "learning_rate": 5.498760862065544e-07, + "loss": 0.4986, + "step": 17755 + }, + { + "epoch": 4.713659896455596, + "grad_norm": 0.4976340495275194, + "learning_rate": 5.496576496959307e-07, + "loss": 0.5134, + "step": 17756 + }, + { + "epoch": 4.713925394928979, + "grad_norm": 0.4749378062246209, + "learning_rate": 5.494392512217592e-07, + "loss": 0.5364, + "step": 17757 + }, + { + "epoch": 4.714190893402363, + "grad_norm": 0.48884412787440684, + "learning_rate": 5.492208907882971e-07, + "loss": 0.5663, + "step": 17758 + }, + { + "epoch": 4.714456391875746, + "grad_norm": 0.4829621297573895, + "learning_rate": 5.490025683998054e-07, + "loss": 0.5563, + "step": 17759 + }, + { + "epoch": 4.71472189034913, + "grad_norm": 0.47666660247528025, + "learning_rate": 5.487842840605389e-07, + "loss": 0.5531, + "step": 17760 + }, + { + "epoch": 4.7149873888225144, + "grad_norm": 0.4831572082266196, + "learning_rate": 5.485660377747573e-07, + "loss": 0.5508, + "step": 17761 + }, + { + "epoch": 4.7152528872958985, + "grad_norm": 0.493285742808171, + "learning_rate": 5.483478295467152e-07, + "loss": 0.5773, + "step": 17762 + }, + { + "epoch": 4.715518385769282, + "grad_norm": 0.46995877336638525, + "learning_rate": 5.481296593806692e-07, + "loss": 0.5004, + "step": 17763 + }, + { + "epoch": 4.715783884242666, + "grad_norm": 0.47369676905795444, + "learning_rate": 5.479115272808736e-07, + "loss": 0.5135, + "step": 17764 + }, + { + "epoch": 4.716049382716049, + "grad_norm": 0.4766463206880944, + "learning_rate": 5.476934332515831e-07, + "loss": 0.523, + "step": 17765 + }, + { + "epoch": 4.716314881189433, + "grad_norm": 0.479706849844324, + "learning_rate": 5.47475377297051e-07, + "loss": 0.5604, + "step": 17766 + }, + { + "epoch": 4.716580379662817, + "grad_norm": 0.4685820976551383, + "learning_rate": 5.472573594215285e-07, + "loss": 0.5251, + "step": 17767 + }, + { + "epoch": 4.7168458781362, + "grad_norm": 0.4857829123935467, + "learning_rate": 5.470393796292697e-07, + "loss": 0.5744, + "step": 17768 + }, + { + "epoch": 4.7171113766095845, + "grad_norm": 0.477079145233189, + "learning_rate": 5.46821437924524e-07, + "loss": 0.5172, + "step": 17769 + }, + { + "epoch": 4.7173768750829685, + "grad_norm": 0.47500316034050705, + "learning_rate": 5.46603534311543e-07, + "loss": 0.5361, + "step": 17770 + }, + { + "epoch": 4.717642373556352, + "grad_norm": 0.47286348970400893, + "learning_rate": 5.46385668794576e-07, + "loss": 0.5388, + "step": 17771 + }, + { + "epoch": 4.717907872029736, + "grad_norm": 0.4923784286365227, + "learning_rate": 5.461678413778709e-07, + "loss": 0.5214, + "step": 17772 + }, + { + "epoch": 4.71817337050312, + "grad_norm": 0.4665442428930693, + "learning_rate": 5.459500520656774e-07, + "loss": 0.5391, + "step": 17773 + }, + { + "epoch": 4.718438868976503, + "grad_norm": 0.47962067629857, + "learning_rate": 5.457323008622414e-07, + "loss": 0.5351, + "step": 17774 + }, + { + "epoch": 4.718704367449887, + "grad_norm": 0.45918478839971744, + "learning_rate": 5.455145877718112e-07, + "loss": 0.4914, + "step": 17775 + }, + { + "epoch": 4.718969865923271, + "grad_norm": 0.48571893380996733, + "learning_rate": 5.452969127986321e-07, + "loss": 0.4886, + "step": 17776 + }, + { + "epoch": 4.7192353643966545, + "grad_norm": 0.486727468431806, + "learning_rate": 5.450792759469492e-07, + "loss": 0.5146, + "step": 17777 + }, + { + "epoch": 4.719500862870039, + "grad_norm": 0.47601929007289173, + "learning_rate": 5.448616772210058e-07, + "loss": 0.5303, + "step": 17778 + }, + { + "epoch": 4.719766361343423, + "grad_norm": 0.48436796609739574, + "learning_rate": 5.446441166250477e-07, + "loss": 0.5056, + "step": 17779 + }, + { + "epoch": 4.720031859816806, + "grad_norm": 0.48107112526008555, + "learning_rate": 5.444265941633161e-07, + "loss": 0.5272, + "step": 17780 + }, + { + "epoch": 4.72029735829019, + "grad_norm": 0.4843916083388342, + "learning_rate": 5.442091098400548e-07, + "loss": 0.5341, + "step": 17781 + }, + { + "epoch": 4.720562856763573, + "grad_norm": 0.47748784773561725, + "learning_rate": 5.439916636595041e-07, + "loss": 0.5292, + "step": 17782 + }, + { + "epoch": 4.720828355236957, + "grad_norm": 0.4814756067568349, + "learning_rate": 5.437742556259048e-07, + "loss": 0.5404, + "step": 17783 + }, + { + "epoch": 4.721093853710341, + "grad_norm": 0.476424819551458, + "learning_rate": 5.435568857434975e-07, + "loss": 0.5592, + "step": 17784 + }, + { + "epoch": 4.721359352183725, + "grad_norm": 0.4690113284547608, + "learning_rate": 5.433395540165207e-07, + "loss": 0.5043, + "step": 17785 + }, + { + "epoch": 4.721624850657109, + "grad_norm": 0.4968395550800811, + "learning_rate": 5.43122260449214e-07, + "loss": 0.5509, + "step": 17786 + }, + { + "epoch": 4.721890349130493, + "grad_norm": 0.47507585521685436, + "learning_rate": 5.429050050458138e-07, + "loss": 0.5479, + "step": 17787 + }, + { + "epoch": 4.722155847603876, + "grad_norm": 0.4878486108722755, + "learning_rate": 5.42687787810559e-07, + "loss": 0.5391, + "step": 17788 + }, + { + "epoch": 4.72242134607726, + "grad_norm": 0.48011797870712347, + "learning_rate": 5.424706087476836e-07, + "loss": 0.56, + "step": 17789 + }, + { + "epoch": 4.722686844550644, + "grad_norm": 0.47253407556489935, + "learning_rate": 5.422534678614239e-07, + "loss": 0.5375, + "step": 17790 + }, + { + "epoch": 4.722952343024028, + "grad_norm": 0.4789269619830572, + "learning_rate": 5.420363651560157e-07, + "loss": 0.5411, + "step": 17791 + }, + { + "epoch": 4.723217841497411, + "grad_norm": 0.49035580299854287, + "learning_rate": 5.418193006356917e-07, + "loss": 0.528, + "step": 17792 + }, + { + "epoch": 4.723483339970795, + "grad_norm": 0.47868331057485014, + "learning_rate": 5.416022743046865e-07, + "loss": 0.5016, + "step": 17793 + }, + { + "epoch": 4.723748838444179, + "grad_norm": 0.4936565249839819, + "learning_rate": 5.413852861672317e-07, + "loss": 0.5465, + "step": 17794 + }, + { + "epoch": 4.724014336917563, + "grad_norm": 0.4857426582701661, + "learning_rate": 5.411683362275597e-07, + "loss": 0.5569, + "step": 17795 + }, + { + "epoch": 4.724279835390947, + "grad_norm": 0.4938315807209008, + "learning_rate": 5.409514244899003e-07, + "loss": 0.5191, + "step": 17796 + }, + { + "epoch": 4.72454533386433, + "grad_norm": 0.48188217638278147, + "learning_rate": 5.407345509584852e-07, + "loss": 0.5523, + "step": 17797 + }, + { + "epoch": 4.724810832337714, + "grad_norm": 0.46958625126377557, + "learning_rate": 5.405177156375432e-07, + "loss": 0.5203, + "step": 17798 + }, + { + "epoch": 4.725076330811098, + "grad_norm": 0.46543687305075016, + "learning_rate": 5.403009185313038e-07, + "loss": 0.5705, + "step": 17799 + }, + { + "epoch": 4.725341829284481, + "grad_norm": 0.4716544000804069, + "learning_rate": 5.400841596439945e-07, + "loss": 0.559, + "step": 17800 + }, + { + "epoch": 4.725607327757865, + "grad_norm": 0.4845506043626805, + "learning_rate": 5.398674389798425e-07, + "loss": 0.5217, + "step": 17801 + }, + { + "epoch": 4.7258728262312495, + "grad_norm": 0.4928690571091297, + "learning_rate": 5.39650756543075e-07, + "loss": 0.539, + "step": 17802 + }, + { + "epoch": 4.726138324704633, + "grad_norm": 0.49063428928526887, + "learning_rate": 5.394341123379171e-07, + "loss": 0.5832, + "step": 17803 + }, + { + "epoch": 4.726403823178017, + "grad_norm": 0.4725043437707859, + "learning_rate": 5.39217506368595e-07, + "loss": 0.5202, + "step": 17804 + }, + { + "epoch": 4.726669321651401, + "grad_norm": 0.493025484427121, + "learning_rate": 5.390009386393322e-07, + "loss": 0.5703, + "step": 17805 + }, + { + "epoch": 4.726934820124784, + "grad_norm": 0.47783903156507096, + "learning_rate": 5.387844091543526e-07, + "loss": 0.5286, + "step": 17806 + }, + { + "epoch": 4.727200318598168, + "grad_norm": 0.4769408680306065, + "learning_rate": 5.385679179178779e-07, + "loss": 0.5522, + "step": 17807 + }, + { + "epoch": 4.727465817071552, + "grad_norm": 0.48960243540671167, + "learning_rate": 5.383514649341321e-07, + "loss": 0.5426, + "step": 17808 + }, + { + "epoch": 4.727731315544935, + "grad_norm": 0.491492774226586, + "learning_rate": 5.381350502073351e-07, + "loss": 0.5855, + "step": 17809 + }, + { + "epoch": 4.7279968140183195, + "grad_norm": 0.48396600345049673, + "learning_rate": 5.37918673741708e-07, + "loss": 0.5389, + "step": 17810 + }, + { + "epoch": 4.728262312491703, + "grad_norm": 0.4748045389359221, + "learning_rate": 5.37702335541472e-07, + "loss": 0.5152, + "step": 17811 + }, + { + "epoch": 4.728527810965087, + "grad_norm": 0.517192239055643, + "learning_rate": 5.374860356108438e-07, + "loss": 0.5714, + "step": 17812 + }, + { + "epoch": 4.728793309438471, + "grad_norm": 0.4709989980691845, + "learning_rate": 5.372697739540437e-07, + "loss": 0.507, + "step": 17813 + }, + { + "epoch": 4.729058807911855, + "grad_norm": 0.49378919968521184, + "learning_rate": 5.370535505752877e-07, + "loss": 0.5643, + "step": 17814 + }, + { + "epoch": 4.729324306385238, + "grad_norm": 0.4988201036825255, + "learning_rate": 5.368373654787945e-07, + "loss": 0.5399, + "step": 17815 + }, + { + "epoch": 4.729589804858622, + "grad_norm": 0.4900107737672029, + "learning_rate": 5.366212186687783e-07, + "loss": 0.5465, + "step": 17816 + }, + { + "epoch": 4.729855303332005, + "grad_norm": 0.4832615405479388, + "learning_rate": 5.364051101494566e-07, + "loss": 0.5223, + "step": 17817 + }, + { + "epoch": 4.7301208018053895, + "grad_norm": 0.49560682784076504, + "learning_rate": 5.361890399250428e-07, + "loss": 0.499, + "step": 17818 + }, + { + "epoch": 4.730386300278774, + "grad_norm": 0.48039025017433135, + "learning_rate": 5.359730079997502e-07, + "loss": 0.4923, + "step": 17819 + }, + { + "epoch": 4.730651798752157, + "grad_norm": 0.49508421605896696, + "learning_rate": 5.357570143777935e-07, + "loss": 0.5417, + "step": 17820 + }, + { + "epoch": 4.730917297225541, + "grad_norm": 0.48283838790412154, + "learning_rate": 5.355410590633837e-07, + "loss": 0.5635, + "step": 17821 + }, + { + "epoch": 4.731182795698925, + "grad_norm": 0.48369023724818666, + "learning_rate": 5.353251420607342e-07, + "loss": 0.555, + "step": 17822 + }, + { + "epoch": 4.731448294172308, + "grad_norm": 0.47993500126706407, + "learning_rate": 5.351092633740535e-07, + "loss": 0.4962, + "step": 17823 + }, + { + "epoch": 4.731713792645692, + "grad_norm": 0.4876845465463659, + "learning_rate": 5.34893423007554e-07, + "loss": 0.6008, + "step": 17824 + }, + { + "epoch": 4.731979291119076, + "grad_norm": 0.4995686420789376, + "learning_rate": 5.34677620965443e-07, + "loss": 0.5504, + "step": 17825 + }, + { + "epoch": 4.7322447895924595, + "grad_norm": 0.47472462982823826, + "learning_rate": 5.344618572519314e-07, + "loss": 0.4944, + "step": 17826 + }, + { + "epoch": 4.732510288065844, + "grad_norm": 0.47977697904143607, + "learning_rate": 5.342461318712252e-07, + "loss": 0.5267, + "step": 17827 + }, + { + "epoch": 4.732775786539228, + "grad_norm": 0.4891405012587452, + "learning_rate": 5.34030444827533e-07, + "loss": 0.5325, + "step": 17828 + }, + { + "epoch": 4.733041285012611, + "grad_norm": 0.48416261435235364, + "learning_rate": 5.338147961250609e-07, + "loss": 0.5726, + "step": 17829 + }, + { + "epoch": 4.733306783485995, + "grad_norm": 0.4822392410141068, + "learning_rate": 5.335991857680134e-07, + "loss": 0.5511, + "step": 17830 + }, + { + "epoch": 4.733572281959379, + "grad_norm": 0.49142617049579085, + "learning_rate": 5.333836137605972e-07, + "loss": 0.5821, + "step": 17831 + }, + { + "epoch": 4.733837780432762, + "grad_norm": 0.49579224423848245, + "learning_rate": 5.33168080107015e-07, + "loss": 0.5409, + "step": 17832 + }, + { + "epoch": 4.734103278906146, + "grad_norm": 0.4784210969400949, + "learning_rate": 5.329525848114714e-07, + "loss": 0.5127, + "step": 17833 + }, + { + "epoch": 4.73436877737953, + "grad_norm": 0.4821335956651352, + "learning_rate": 5.327371278781687e-07, + "loss": 0.5141, + "step": 17834 + }, + { + "epoch": 4.734634275852914, + "grad_norm": 0.6219567450843645, + "learning_rate": 5.325217093113081e-07, + "loss": 0.5731, + "step": 17835 + }, + { + "epoch": 4.734899774326298, + "grad_norm": 0.48768407851887385, + "learning_rate": 5.323063291150918e-07, + "loss": 0.5426, + "step": 17836 + }, + { + "epoch": 4.735165272799682, + "grad_norm": 0.48931731965500336, + "learning_rate": 5.320909872937193e-07, + "loss": 0.5429, + "step": 17837 + }, + { + "epoch": 4.735430771273065, + "grad_norm": 0.47214192820631357, + "learning_rate": 5.318756838513914e-07, + "loss": 0.521, + "step": 17838 + }, + { + "epoch": 4.735696269746449, + "grad_norm": 0.4755773624768587, + "learning_rate": 5.316604187923061e-07, + "loss": 0.5318, + "step": 17839 + }, + { + "epoch": 4.735961768219832, + "grad_norm": 0.4871698385010478, + "learning_rate": 5.314451921206628e-07, + "loss": 0.5373, + "step": 17840 + }, + { + "epoch": 4.736227266693216, + "grad_norm": 0.48149180897571026, + "learning_rate": 5.31230003840657e-07, + "loss": 0.5333, + "step": 17841 + }, + { + "epoch": 4.7364927651666004, + "grad_norm": 0.4946073261261152, + "learning_rate": 5.310148539564875e-07, + "loss": 0.5285, + "step": 17842 + }, + { + "epoch": 4.7367582636399845, + "grad_norm": 0.4821929696750198, + "learning_rate": 5.307997424723482e-07, + "loss": 0.5535, + "step": 17843 + }, + { + "epoch": 4.737023762113368, + "grad_norm": 0.46948646453571613, + "learning_rate": 5.30584669392436e-07, + "loss": 0.516, + "step": 17844 + }, + { + "epoch": 4.737289260586752, + "grad_norm": 0.47290285940746773, + "learning_rate": 5.30369634720945e-07, + "loss": 0.5437, + "step": 17845 + }, + { + "epoch": 4.737554759060135, + "grad_norm": 0.49611573199031583, + "learning_rate": 5.301546384620676e-07, + "loss": 0.5525, + "step": 17846 + }, + { + "epoch": 4.737820257533519, + "grad_norm": 0.4781772101680418, + "learning_rate": 5.299396806199985e-07, + "loss": 0.5749, + "step": 17847 + }, + { + "epoch": 4.738085756006903, + "grad_norm": 0.4785786865365697, + "learning_rate": 5.297247611989287e-07, + "loss": 0.4968, + "step": 17848 + }, + { + "epoch": 4.738351254480286, + "grad_norm": 0.4839685349661115, + "learning_rate": 5.295098802030508e-07, + "loss": 0.528, + "step": 17849 + }, + { + "epoch": 4.7386167529536705, + "grad_norm": 0.4992330422642702, + "learning_rate": 5.292950376365538e-07, + "loss": 0.5844, + "step": 17850 + }, + { + "epoch": 4.7388822514270545, + "grad_norm": 0.47452235010199206, + "learning_rate": 5.290802335036305e-07, + "loss": 0.5581, + "step": 17851 + }, + { + "epoch": 4.739147749900438, + "grad_norm": 0.5012832851396919, + "learning_rate": 5.288654678084667e-07, + "loss": 0.5165, + "step": 17852 + }, + { + "epoch": 4.739413248373822, + "grad_norm": 0.4900019888244732, + "learning_rate": 5.286507405552529e-07, + "loss": 0.5535, + "step": 17853 + }, + { + "epoch": 4.739678746847206, + "grad_norm": 0.48990715308594884, + "learning_rate": 5.28436051748176e-07, + "loss": 0.5499, + "step": 17854 + }, + { + "epoch": 4.739944245320589, + "grad_norm": 0.47780676195317995, + "learning_rate": 5.282214013914239e-07, + "loss": 0.5378, + "step": 17855 + }, + { + "epoch": 4.740209743793973, + "grad_norm": 0.48041583958724254, + "learning_rate": 5.280067894891816e-07, + "loss": 0.577, + "step": 17856 + }, + { + "epoch": 4.740475242267357, + "grad_norm": 0.48920429200985316, + "learning_rate": 5.277922160456359e-07, + "loss": 0.5383, + "step": 17857 + }, + { + "epoch": 4.7407407407407405, + "grad_norm": 0.5036382516933882, + "learning_rate": 5.275776810649705e-07, + "loss": 0.5378, + "step": 17858 + }, + { + "epoch": 4.741006239214125, + "grad_norm": 0.4817731354875225, + "learning_rate": 5.273631845513694e-07, + "loss": 0.5443, + "step": 17859 + }, + { + "epoch": 4.741271737687509, + "grad_norm": 0.4855748358380614, + "learning_rate": 5.271487265090163e-07, + "loss": 0.5129, + "step": 17860 + }, + { + "epoch": 4.741537236160892, + "grad_norm": 0.4873658075620909, + "learning_rate": 5.269343069420929e-07, + "loss": 0.522, + "step": 17861 + }, + { + "epoch": 4.741802734634276, + "grad_norm": 0.49449433311161883, + "learning_rate": 5.267199258547823e-07, + "loss": 0.5392, + "step": 17862 + }, + { + "epoch": 4.74206823310766, + "grad_norm": 0.4897701980181507, + "learning_rate": 5.265055832512642e-07, + "loss": 0.532, + "step": 17863 + }, + { + "epoch": 4.742333731581043, + "grad_norm": 0.48009249532659065, + "learning_rate": 5.262912791357186e-07, + "loss": 0.5152, + "step": 17864 + }, + { + "epoch": 4.742599230054427, + "grad_norm": 0.49299664221327266, + "learning_rate": 5.260770135123264e-07, + "loss": 0.5607, + "step": 17865 + }, + { + "epoch": 4.742864728527811, + "grad_norm": 0.48056243567319673, + "learning_rate": 5.258627863852645e-07, + "loss": 0.5243, + "step": 17866 + }, + { + "epoch": 4.743130227001195, + "grad_norm": 0.48029086009398775, + "learning_rate": 5.256485977587125e-07, + "loss": 0.5286, + "step": 17867 + }, + { + "epoch": 4.743395725474579, + "grad_norm": 0.48929674048932, + "learning_rate": 5.254344476368464e-07, + "loss": 0.5217, + "step": 17868 + }, + { + "epoch": 4.743661223947962, + "grad_norm": 0.4858147899327403, + "learning_rate": 5.252203360238444e-07, + "loss": 0.5682, + "step": 17869 + }, + { + "epoch": 4.743926722421346, + "grad_norm": 0.48458490904277157, + "learning_rate": 5.250062629238795e-07, + "loss": 0.509, + "step": 17870 + }, + { + "epoch": 4.74419222089473, + "grad_norm": 0.4731475084205726, + "learning_rate": 5.247922283411289e-07, + "loss": 0.5156, + "step": 17871 + }, + { + "epoch": 4.744457719368114, + "grad_norm": 0.4787421572516064, + "learning_rate": 5.245782322797652e-07, + "loss": 0.5484, + "step": 17872 + }, + { + "epoch": 4.744723217841497, + "grad_norm": 0.4874932232981817, + "learning_rate": 5.243642747439634e-07, + "loss": 0.541, + "step": 17873 + }, + { + "epoch": 4.744988716314881, + "grad_norm": 0.48364104917233885, + "learning_rate": 5.241503557378955e-07, + "loss": 0.5073, + "step": 17874 + }, + { + "epoch": 4.745254214788265, + "grad_norm": 0.4658280581470388, + "learning_rate": 5.239364752657325e-07, + "loss": 0.5369, + "step": 17875 + }, + { + "epoch": 4.745519713261649, + "grad_norm": 0.48171165780406, + "learning_rate": 5.237226333316473e-07, + "loss": 0.5575, + "step": 17876 + }, + { + "epoch": 4.745785211735033, + "grad_norm": 0.48208409461387125, + "learning_rate": 5.235088299398089e-07, + "loss": 0.5318, + "step": 17877 + }, + { + "epoch": 4.746050710208416, + "grad_norm": 0.48627812758945066, + "learning_rate": 5.23295065094388e-07, + "loss": 0.5159, + "step": 17878 + }, + { + "epoch": 4.7463162086818, + "grad_norm": 0.47543509111298726, + "learning_rate": 5.230813387995526e-07, + "loss": 0.5214, + "step": 17879 + }, + { + "epoch": 4.746581707155184, + "grad_norm": 0.4900454127980515, + "learning_rate": 5.228676510594729e-07, + "loss": 0.5547, + "step": 17880 + }, + { + "epoch": 4.746847205628567, + "grad_norm": 0.4872030856030643, + "learning_rate": 5.226540018783135e-07, + "loss": 0.4989, + "step": 17881 + }, + { + "epoch": 4.747112704101951, + "grad_norm": 0.48074368310599463, + "learning_rate": 5.224403912602421e-07, + "loss": 0.5458, + "step": 17882 + }, + { + "epoch": 4.7473782025753355, + "grad_norm": 0.47451362721899926, + "learning_rate": 5.22226819209426e-07, + "loss": 0.5228, + "step": 17883 + }, + { + "epoch": 4.747643701048719, + "grad_norm": 0.47336968920937994, + "learning_rate": 5.220132857300286e-07, + "loss": 0.5181, + "step": 17884 + }, + { + "epoch": 4.747909199522103, + "grad_norm": 0.488278870143053, + "learning_rate": 5.217997908262156e-07, + "loss": 0.5774, + "step": 17885 + }, + { + "epoch": 4.748174697995487, + "grad_norm": 0.48861088360431393, + "learning_rate": 5.215863345021504e-07, + "loss": 0.5394, + "step": 17886 + }, + { + "epoch": 4.74844019646887, + "grad_norm": 0.48481392186195105, + "learning_rate": 5.213729167619955e-07, + "loss": 0.5623, + "step": 17887 + }, + { + "epoch": 4.748705694942254, + "grad_norm": 0.48985043599973693, + "learning_rate": 5.211595376099127e-07, + "loss": 0.5523, + "step": 17888 + }, + { + "epoch": 4.748971193415638, + "grad_norm": 0.4855192542574241, + "learning_rate": 5.209461970500646e-07, + "loss": 0.532, + "step": 17889 + }, + { + "epoch": 4.749236691889021, + "grad_norm": 0.48531222067919866, + "learning_rate": 5.207328950866103e-07, + "loss": 0.5532, + "step": 17890 + }, + { + "epoch": 4.7495021903624055, + "grad_norm": 0.48986217041125296, + "learning_rate": 5.205196317237116e-07, + "loss": 0.5203, + "step": 17891 + }, + { + "epoch": 4.749767688835789, + "grad_norm": 0.4845208375274857, + "learning_rate": 5.203064069655267e-07, + "loss": 0.5108, + "step": 17892 + }, + { + "epoch": 4.750033187309173, + "grad_norm": 0.47293316343275177, + "learning_rate": 5.20093220816213e-07, + "loss": 0.5232, + "step": 17893 + }, + { + "epoch": 4.750298685782557, + "grad_norm": 0.4685382210727578, + "learning_rate": 5.1988007327993e-07, + "loss": 0.5087, + "step": 17894 + }, + { + "epoch": 4.750564184255941, + "grad_norm": 0.48188921501061877, + "learning_rate": 5.196669643608332e-07, + "loss": 0.5495, + "step": 17895 + }, + { + "epoch": 4.750829682729324, + "grad_norm": 0.48190939784719933, + "learning_rate": 5.194538940630797e-07, + "loss": 0.5408, + "step": 17896 + }, + { + "epoch": 4.751095181202708, + "grad_norm": 0.4917213646389072, + "learning_rate": 5.192408623908246e-07, + "loss": 0.5307, + "step": 17897 + }, + { + "epoch": 4.751360679676091, + "grad_norm": 0.48580259334528425, + "learning_rate": 5.190278693482223e-07, + "loss": 0.5548, + "step": 17898 + }, + { + "epoch": 4.7516261781494755, + "grad_norm": 0.4834763923441684, + "learning_rate": 5.188149149394265e-07, + "loss": 0.5401, + "step": 17899 + }, + { + "epoch": 4.75189167662286, + "grad_norm": 0.5045726428295548, + "learning_rate": 5.186019991685908e-07, + "loss": 0.5429, + "step": 17900 + }, + { + "epoch": 4.752157175096244, + "grad_norm": 0.48783341637129474, + "learning_rate": 5.183891220398671e-07, + "loss": 0.5474, + "step": 17901 + }, + { + "epoch": 4.752422673569627, + "grad_norm": 0.4811210208871398, + "learning_rate": 5.181762835574072e-07, + "loss": 0.5225, + "step": 17902 + }, + { + "epoch": 4.752688172043011, + "grad_norm": 0.49374317360592773, + "learning_rate": 5.179634837253636e-07, + "loss": 0.5629, + "step": 17903 + }, + { + "epoch": 4.752953670516394, + "grad_norm": 0.48719439799501607, + "learning_rate": 5.177507225478834e-07, + "loss": 0.5543, + "step": 17904 + }, + { + "epoch": 4.753219168989778, + "grad_norm": 0.474326964396191, + "learning_rate": 5.175380000291183e-07, + "loss": 0.5086, + "step": 17905 + }, + { + "epoch": 4.753484667463162, + "grad_norm": 0.4827963566019136, + "learning_rate": 5.173253161732153e-07, + "loss": 0.5628, + "step": 17906 + }, + { + "epoch": 4.7537501659365455, + "grad_norm": 0.4927884638320415, + "learning_rate": 5.171126709843239e-07, + "loss": 0.5439, + "step": 17907 + }, + { + "epoch": 4.75401566440993, + "grad_norm": 0.4743941885771043, + "learning_rate": 5.169000644665895e-07, + "loss": 0.5254, + "step": 17908 + }, + { + "epoch": 4.754281162883314, + "grad_norm": 0.47208689482823535, + "learning_rate": 5.166874966241603e-07, + "loss": 0.4969, + "step": 17909 + }, + { + "epoch": 4.754546661356697, + "grad_norm": 0.47163712092322796, + "learning_rate": 5.164749674611805e-07, + "loss": 0.5217, + "step": 17910 + }, + { + "epoch": 4.754812159830081, + "grad_norm": 0.4847964076644604, + "learning_rate": 5.162624769817948e-07, + "loss": 0.5193, + "step": 17911 + }, + { + "epoch": 4.755077658303465, + "grad_norm": 0.48028106159435885, + "learning_rate": 5.160500251901487e-07, + "loss": 0.5359, + "step": 17912 + }, + { + "epoch": 4.755343156776848, + "grad_norm": 0.48025008009487075, + "learning_rate": 5.158376120903838e-07, + "loss": 0.5489, + "step": 17913 + }, + { + "epoch": 4.755608655250232, + "grad_norm": 0.4806269640401778, + "learning_rate": 5.156252376866441e-07, + "loss": 0.5338, + "step": 17914 + }, + { + "epoch": 4.755874153723616, + "grad_norm": 0.47999443219255883, + "learning_rate": 5.154129019830712e-07, + "loss": 0.5123, + "step": 17915 + }, + { + "epoch": 4.756139652197, + "grad_norm": 0.468921481054419, + "learning_rate": 5.152006049838054e-07, + "loss": 0.4917, + "step": 17916 + }, + { + "epoch": 4.756405150670384, + "grad_norm": 0.4835779455628183, + "learning_rate": 5.14988346692987e-07, + "loss": 0.513, + "step": 17917 + }, + { + "epoch": 4.756670649143768, + "grad_norm": 0.4777725284622087, + "learning_rate": 5.147761271147567e-07, + "loss": 0.5573, + "step": 17918 + }, + { + "epoch": 4.756936147617151, + "grad_norm": 0.48552926948335057, + "learning_rate": 5.145639462532517e-07, + "loss": 0.5583, + "step": 17919 + }, + { + "epoch": 4.757201646090535, + "grad_norm": 0.4803955917499256, + "learning_rate": 5.14351804112612e-07, + "loss": 0.5363, + "step": 17920 + }, + { + "epoch": 4.757467144563918, + "grad_norm": 0.49788221277017647, + "learning_rate": 5.141397006969734e-07, + "loss": 0.579, + "step": 17921 + }, + { + "epoch": 4.757732643037302, + "grad_norm": 0.4881799136032062, + "learning_rate": 5.139276360104726e-07, + "loss": 0.5635, + "step": 17922 + }, + { + "epoch": 4.7579981415106865, + "grad_norm": 0.501212464325793, + "learning_rate": 5.137156100572457e-07, + "loss": 0.6115, + "step": 17923 + }, + { + "epoch": 4.7582636399840705, + "grad_norm": 0.492331187666112, + "learning_rate": 5.135036228414275e-07, + "loss": 0.5231, + "step": 17924 + }, + { + "epoch": 4.758529138457454, + "grad_norm": 0.48549881108151655, + "learning_rate": 5.132916743671529e-07, + "loss": 0.5176, + "step": 17925 + }, + { + "epoch": 4.758794636930838, + "grad_norm": 0.4883286352545921, + "learning_rate": 5.130797646385551e-07, + "loss": 0.5486, + "step": 17926 + }, + { + "epoch": 4.759060135404221, + "grad_norm": 0.48783689371453287, + "learning_rate": 5.128678936597658e-07, + "loss": 0.5487, + "step": 17927 + }, + { + "epoch": 4.759325633877605, + "grad_norm": 0.48037008516744284, + "learning_rate": 5.126560614349189e-07, + "loss": 0.5559, + "step": 17928 + }, + { + "epoch": 4.759591132350989, + "grad_norm": 0.48421568964901096, + "learning_rate": 5.124442679681438e-07, + "loss": 0.5551, + "step": 17929 + }, + { + "epoch": 4.759856630824372, + "grad_norm": 0.47347003584851377, + "learning_rate": 5.122325132635724e-07, + "loss": 0.5428, + "step": 17930 + }, + { + "epoch": 4.7601221292977565, + "grad_norm": 0.4901966133754933, + "learning_rate": 5.120207973253336e-07, + "loss": 0.5613, + "step": 17931 + }, + { + "epoch": 4.7603876277711406, + "grad_norm": 0.4815469858212861, + "learning_rate": 5.118091201575578e-07, + "loss": 0.5341, + "step": 17932 + }, + { + "epoch": 4.760653126244524, + "grad_norm": 0.4799333996442336, + "learning_rate": 5.115974817643707e-07, + "loss": 0.5417, + "step": 17933 + }, + { + "epoch": 4.760918624717908, + "grad_norm": 0.47202794477798515, + "learning_rate": 5.113858821499018e-07, + "loss": 0.5338, + "step": 17934 + }, + { + "epoch": 4.761184123191292, + "grad_norm": 0.4854419638023756, + "learning_rate": 5.111743213182765e-07, + "loss": 0.5336, + "step": 17935 + }, + { + "epoch": 4.761449621664675, + "grad_norm": 0.47736216886216437, + "learning_rate": 5.109627992736221e-07, + "loss": 0.5623, + "step": 17936 + }, + { + "epoch": 4.761715120138059, + "grad_norm": 0.4848890508828434, + "learning_rate": 5.107513160200625e-07, + "loss": 0.5119, + "step": 17937 + }, + { + "epoch": 4.761980618611443, + "grad_norm": 0.4732577720970517, + "learning_rate": 5.105398715617235e-07, + "loss": 0.5323, + "step": 17938 + }, + { + "epoch": 4.7622461170848265, + "grad_norm": 0.4723088264776877, + "learning_rate": 5.10328465902728e-07, + "loss": 0.527, + "step": 17939 + }, + { + "epoch": 4.762511615558211, + "grad_norm": 0.4811508142267022, + "learning_rate": 5.101170990471982e-07, + "loss": 0.5315, + "step": 17940 + }, + { + "epoch": 4.762777114031595, + "grad_norm": 0.48242907486658976, + "learning_rate": 5.099057709992577e-07, + "loss": 0.5612, + "step": 17941 + }, + { + "epoch": 4.763042612504978, + "grad_norm": 0.4859614605068404, + "learning_rate": 5.096944817630267e-07, + "loss": 0.5199, + "step": 17942 + }, + { + "epoch": 4.763308110978362, + "grad_norm": 0.4749249901315378, + "learning_rate": 5.094832313426276e-07, + "loss": 0.5383, + "step": 17943 + }, + { + "epoch": 4.763573609451746, + "grad_norm": 0.5045171490318809, + "learning_rate": 5.09272019742178e-07, + "loss": 0.5452, + "step": 17944 + }, + { + "epoch": 4.763839107925129, + "grad_norm": 0.47788859236726694, + "learning_rate": 5.090608469657987e-07, + "loss": 0.5369, + "step": 17945 + }, + { + "epoch": 4.764104606398513, + "grad_norm": 0.46960584792995125, + "learning_rate": 5.088497130176068e-07, + "loss": 0.5102, + "step": 17946 + }, + { + "epoch": 4.764370104871897, + "grad_norm": 0.47424828535665375, + "learning_rate": 5.086386179017208e-07, + "loss": 0.5224, + "step": 17947 + }, + { + "epoch": 4.764635603345281, + "grad_norm": 0.49090557505809523, + "learning_rate": 5.084275616222581e-07, + "loss": 0.5196, + "step": 17948 + }, + { + "epoch": 4.764901101818665, + "grad_norm": 0.4739687791773348, + "learning_rate": 5.08216544183334e-07, + "loss": 0.4848, + "step": 17949 + }, + { + "epoch": 4.765166600292048, + "grad_norm": 0.4958530307022878, + "learning_rate": 5.080055655890642e-07, + "loss": 0.5962, + "step": 17950 + }, + { + "epoch": 4.765432098765432, + "grad_norm": 0.48015002213866975, + "learning_rate": 5.077946258435624e-07, + "loss": 0.5181, + "step": 17951 + }, + { + "epoch": 4.765697597238816, + "grad_norm": 0.4751311976269479, + "learning_rate": 5.075837249509438e-07, + "loss": 0.5365, + "step": 17952 + }, + { + "epoch": 4.7659630957122, + "grad_norm": 0.4895303183181923, + "learning_rate": 5.073728629153201e-07, + "loss": 0.5657, + "step": 17953 + }, + { + "epoch": 4.766228594185583, + "grad_norm": 0.4794424901830242, + "learning_rate": 5.071620397408053e-07, + "loss": 0.5614, + "step": 17954 + }, + { + "epoch": 4.766494092658967, + "grad_norm": 0.4815297694787281, + "learning_rate": 5.069512554315098e-07, + "loss": 0.5188, + "step": 17955 + }, + { + "epoch": 4.766759591132351, + "grad_norm": 0.49009559761974075, + "learning_rate": 5.06740509991544e-07, + "loss": 0.5427, + "step": 17956 + }, + { + "epoch": 4.767025089605735, + "grad_norm": 0.4650005958166525, + "learning_rate": 5.065298034250194e-07, + "loss": 0.5458, + "step": 17957 + }, + { + "epoch": 4.767290588079119, + "grad_norm": 0.4741141592963785, + "learning_rate": 5.063191357360437e-07, + "loss": 0.5598, + "step": 17958 + }, + { + "epoch": 4.767556086552502, + "grad_norm": 0.4920850418606148, + "learning_rate": 5.061085069287267e-07, + "loss": 0.553, + "step": 17959 + }, + { + "epoch": 4.767821585025886, + "grad_norm": 0.49322030854640186, + "learning_rate": 5.058979170071754e-07, + "loss": 0.4915, + "step": 17960 + }, + { + "epoch": 4.76808708349927, + "grad_norm": 0.48341125843990435, + "learning_rate": 5.056873659754982e-07, + "loss": 0.5876, + "step": 17961 + }, + { + "epoch": 4.768352581972653, + "grad_norm": 0.4776790773986275, + "learning_rate": 5.054768538377991e-07, + "loss": 0.5565, + "step": 17962 + }, + { + "epoch": 4.768618080446037, + "grad_norm": 0.485103299948917, + "learning_rate": 5.052663805981853e-07, + "loss": 0.5518, + "step": 17963 + }, + { + "epoch": 4.7688835789194215, + "grad_norm": 0.4754670898136354, + "learning_rate": 5.050559462607605e-07, + "loss": 0.5337, + "step": 17964 + }, + { + "epoch": 4.769149077392805, + "grad_norm": 0.47136126517508237, + "learning_rate": 5.048455508296299e-07, + "loss": 0.5122, + "step": 17965 + }, + { + "epoch": 4.769414575866189, + "grad_norm": 0.482062471416318, + "learning_rate": 5.04635194308896e-07, + "loss": 0.5131, + "step": 17966 + }, + { + "epoch": 4.769680074339573, + "grad_norm": 0.4796535054689214, + "learning_rate": 5.044248767026605e-07, + "loss": 0.5483, + "step": 17967 + }, + { + "epoch": 4.769945572812956, + "grad_norm": 0.4797795483510902, + "learning_rate": 5.042145980150268e-07, + "loss": 0.5826, + "step": 17968 + }, + { + "epoch": 4.77021107128634, + "grad_norm": 0.48733148488387107, + "learning_rate": 5.040043582500939e-07, + "loss": 0.5648, + "step": 17969 + }, + { + "epoch": 4.770476569759724, + "grad_norm": 0.49188546172471137, + "learning_rate": 5.037941574119642e-07, + "loss": 0.5266, + "step": 17970 + }, + { + "epoch": 4.770742068233107, + "grad_norm": 0.5203461590322918, + "learning_rate": 5.035839955047351e-07, + "loss": 0.5219, + "step": 17971 + }, + { + "epoch": 4.7710075667064915, + "grad_norm": 0.4825189718460147, + "learning_rate": 5.033738725325069e-07, + "loss": 0.5524, + "step": 17972 + }, + { + "epoch": 4.771273065179876, + "grad_norm": 0.47650800032065554, + "learning_rate": 5.031637884993768e-07, + "loss": 0.5275, + "step": 17973 + }, + { + "epoch": 4.771538563653259, + "grad_norm": 0.49027195208704977, + "learning_rate": 5.029537434094411e-07, + "loss": 0.5415, + "step": 17974 + }, + { + "epoch": 4.771804062126643, + "grad_norm": 0.47321546031407014, + "learning_rate": 5.027437372667976e-07, + "loss": 0.5426, + "step": 17975 + }, + { + "epoch": 4.772069560600027, + "grad_norm": 0.47694543927081745, + "learning_rate": 5.025337700755409e-07, + "loss": 0.5453, + "step": 17976 + }, + { + "epoch": 4.77233505907341, + "grad_norm": 0.4944207033297632, + "learning_rate": 5.023238418397669e-07, + "loss": 0.5184, + "step": 17977 + }, + { + "epoch": 4.772600557546794, + "grad_norm": 0.4935000509331629, + "learning_rate": 5.021139525635693e-07, + "loss": 0.5657, + "step": 17978 + }, + { + "epoch": 4.772866056020177, + "grad_norm": 0.46909103904099425, + "learning_rate": 5.019041022510413e-07, + "loss": 0.5307, + "step": 17979 + }, + { + "epoch": 4.7731315544935615, + "grad_norm": 0.4845856763415569, + "learning_rate": 5.016942909062744e-07, + "loss": 0.5372, + "step": 17980 + }, + { + "epoch": 4.773397052966946, + "grad_norm": 0.4969572340541802, + "learning_rate": 5.014845185333625e-07, + "loss": 0.5176, + "step": 17981 + }, + { + "epoch": 4.77366255144033, + "grad_norm": 0.4704764211053215, + "learning_rate": 5.012747851363949e-07, + "loss": 0.4902, + "step": 17982 + }, + { + "epoch": 4.773928049913713, + "grad_norm": 0.49391565351521594, + "learning_rate": 5.010650907194636e-07, + "loss": 0.5146, + "step": 17983 + }, + { + "epoch": 4.774193548387097, + "grad_norm": 0.48607878756327755, + "learning_rate": 5.008554352866574e-07, + "loss": 0.548, + "step": 17984 + }, + { + "epoch": 4.77445904686048, + "grad_norm": 0.4726870812051951, + "learning_rate": 5.00645818842064e-07, + "loss": 0.5405, + "step": 17985 + }, + { + "epoch": 4.774724545333864, + "grad_norm": 0.472521350403769, + "learning_rate": 5.004362413897732e-07, + "loss": 0.521, + "step": 17986 + }, + { + "epoch": 4.774990043807248, + "grad_norm": 0.4920755405573564, + "learning_rate": 5.002267029338706e-07, + "loss": 0.5647, + "step": 17987 + }, + { + "epoch": 4.7752555422806315, + "grad_norm": 0.4679958859970393, + "learning_rate": 5.000172034784442e-07, + "loss": 0.4773, + "step": 17988 + }, + { + "epoch": 4.775521040754016, + "grad_norm": 0.47033909448224076, + "learning_rate": 4.998077430275789e-07, + "loss": 0.5527, + "step": 17989 + }, + { + "epoch": 4.7757865392274, + "grad_norm": 0.49211092173185367, + "learning_rate": 4.995983215853609e-07, + "loss": 0.5383, + "step": 17990 + }, + { + "epoch": 4.776052037700783, + "grad_norm": 0.478128156626304, + "learning_rate": 4.993889391558721e-07, + "loss": 0.5451, + "step": 17991 + }, + { + "epoch": 4.776317536174167, + "grad_norm": 0.4907558027169939, + "learning_rate": 4.99179595743198e-07, + "loss": 0.5618, + "step": 17992 + }, + { + "epoch": 4.776583034647551, + "grad_norm": 0.48639768900763636, + "learning_rate": 4.989702913514202e-07, + "loss": 0.564, + "step": 17993 + }, + { + "epoch": 4.776848533120934, + "grad_norm": 0.4998120079017491, + "learning_rate": 4.987610259846209e-07, + "loss": 0.5238, + "step": 17994 + }, + { + "epoch": 4.777114031594318, + "grad_norm": 0.4697520678111949, + "learning_rate": 4.98551799646883e-07, + "loss": 0.5339, + "step": 17995 + }, + { + "epoch": 4.777379530067702, + "grad_norm": 0.4875955869912905, + "learning_rate": 4.983426123422838e-07, + "loss": 0.5385, + "step": 17996 + }, + { + "epoch": 4.777645028541086, + "grad_norm": 0.479003098587137, + "learning_rate": 4.981334640749056e-07, + "loss": 0.5599, + "step": 17997 + }, + { + "epoch": 4.77791052701447, + "grad_norm": 0.48278670573028626, + "learning_rate": 4.979243548488252e-07, + "loss": 0.5318, + "step": 17998 + }, + { + "epoch": 4.778176025487854, + "grad_norm": 0.48341103935871743, + "learning_rate": 4.977152846681227e-07, + "loss": 0.5272, + "step": 17999 + }, + { + "epoch": 4.778441523961237, + "grad_norm": 0.4756196890187936, + "learning_rate": 4.975062535368739e-07, + "loss": 0.5433, + "step": 18000 + }, + { + "epoch": 4.778707022434621, + "grad_norm": 0.48160775073401735, + "learning_rate": 4.972972614591568e-07, + "loss": 0.5452, + "step": 18001 + }, + { + "epoch": 4.778972520908004, + "grad_norm": 0.49010757349740214, + "learning_rate": 4.970883084390463e-07, + "loss": 0.5536, + "step": 18002 + }, + { + "epoch": 4.779238019381388, + "grad_norm": 0.4648630186166757, + "learning_rate": 4.968793944806171e-07, + "loss": 0.5223, + "step": 18003 + }, + { + "epoch": 4.7795035178547725, + "grad_norm": 0.49366987902299253, + "learning_rate": 4.966705195879454e-07, + "loss": 0.5901, + "step": 18004 + }, + { + "epoch": 4.7797690163281565, + "grad_norm": 0.48845875544002404, + "learning_rate": 4.964616837651024e-07, + "loss": 0.5119, + "step": 18005 + }, + { + "epoch": 4.78003451480154, + "grad_norm": 0.47370209877336367, + "learning_rate": 4.962528870161629e-07, + "loss": 0.5367, + "step": 18006 + }, + { + "epoch": 4.780300013274924, + "grad_norm": 0.47661059646892523, + "learning_rate": 4.960441293451981e-07, + "loss": 0.5134, + "step": 18007 + }, + { + "epoch": 4.780565511748307, + "grad_norm": 0.47190686787763464, + "learning_rate": 4.958354107562793e-07, + "loss": 0.4963, + "step": 18008 + }, + { + "epoch": 4.780831010221691, + "grad_norm": 0.48666748430530027, + "learning_rate": 4.956267312534762e-07, + "loss": 0.5306, + "step": 18009 + }, + { + "epoch": 4.781096508695075, + "grad_norm": 0.48077044730429325, + "learning_rate": 4.954180908408604e-07, + "loss": 0.5131, + "step": 18010 + }, + { + "epoch": 4.781362007168459, + "grad_norm": 0.4926430032018436, + "learning_rate": 4.95209489522499e-07, + "loss": 0.5455, + "step": 18011 + }, + { + "epoch": 4.7816275056418425, + "grad_norm": 0.47520193951153944, + "learning_rate": 4.950009273024619e-07, + "loss": 0.4892, + "step": 18012 + }, + { + "epoch": 4.7818930041152266, + "grad_norm": 0.47987384381434717, + "learning_rate": 4.947924041848157e-07, + "loss": 0.5358, + "step": 18013 + }, + { + "epoch": 4.78215850258861, + "grad_norm": 0.47823294223916746, + "learning_rate": 4.945839201736266e-07, + "loss": 0.553, + "step": 18014 + }, + { + "epoch": 4.782424001061994, + "grad_norm": 0.4812049530182868, + "learning_rate": 4.943754752729618e-07, + "loss": 0.5464, + "step": 18015 + }, + { + "epoch": 4.782689499535378, + "grad_norm": 0.46993030851701145, + "learning_rate": 4.941670694868853e-07, + "loss": 0.5216, + "step": 18016 + }, + { + "epoch": 4.782954998008761, + "grad_norm": 0.47952954602096765, + "learning_rate": 4.939587028194625e-07, + "loss": 0.5597, + "step": 18017 + }, + { + "epoch": 4.783220496482145, + "grad_norm": 0.47811568430812457, + "learning_rate": 4.937503752747569e-07, + "loss": 0.5307, + "step": 18018 + }, + { + "epoch": 4.783485994955529, + "grad_norm": 0.48591690381905783, + "learning_rate": 4.935420868568303e-07, + "loss": 0.5542, + "step": 18019 + }, + { + "epoch": 4.7837514934289125, + "grad_norm": 0.4806279745635947, + "learning_rate": 4.933338375697464e-07, + "loss": 0.5541, + "step": 18020 + }, + { + "epoch": 4.784016991902297, + "grad_norm": 0.48046943643012774, + "learning_rate": 4.931256274175652e-07, + "loss": 0.5778, + "step": 18021 + }, + { + "epoch": 4.784282490375681, + "grad_norm": 0.4849189602190248, + "learning_rate": 4.929174564043484e-07, + "loss": 0.5465, + "step": 18022 + }, + { + "epoch": 4.784547988849064, + "grad_norm": 0.47786896744000656, + "learning_rate": 4.92709324534155e-07, + "loss": 0.5326, + "step": 18023 + }, + { + "epoch": 4.784813487322448, + "grad_norm": 0.4760970245939848, + "learning_rate": 4.925012318110455e-07, + "loss": 0.517, + "step": 18024 + }, + { + "epoch": 4.785078985795832, + "grad_norm": 0.49509001498487903, + "learning_rate": 4.922931782390761e-07, + "loss": 0.5306, + "step": 18025 + }, + { + "epoch": 4.785344484269215, + "grad_norm": 0.48295788476680324, + "learning_rate": 4.920851638223062e-07, + "loss": 0.5618, + "step": 18026 + }, + { + "epoch": 4.785609982742599, + "grad_norm": 0.4710697182910148, + "learning_rate": 4.918771885647908e-07, + "loss": 0.5683, + "step": 18027 + }, + { + "epoch": 4.785875481215983, + "grad_norm": 0.4741252788130125, + "learning_rate": 4.916692524705879e-07, + "loss": 0.5287, + "step": 18028 + }, + { + "epoch": 4.786140979689367, + "grad_norm": 0.4707627291170045, + "learning_rate": 4.914613555437511e-07, + "loss": 0.5354, + "step": 18029 + }, + { + "epoch": 4.786406478162751, + "grad_norm": 0.478199321523249, + "learning_rate": 4.912534977883363e-07, + "loss": 0.5541, + "step": 18030 + }, + { + "epoch": 4.786671976636134, + "grad_norm": 0.4673310226376173, + "learning_rate": 4.910456792083965e-07, + "loss": 0.5102, + "step": 18031 + }, + { + "epoch": 4.786937475109518, + "grad_norm": 0.48598384448173665, + "learning_rate": 4.908378998079839e-07, + "loss": 0.5095, + "step": 18032 + }, + { + "epoch": 4.787202973582902, + "grad_norm": 0.46688879767938213, + "learning_rate": 4.906301595911523e-07, + "loss": 0.514, + "step": 18033 + }, + { + "epoch": 4.787468472056286, + "grad_norm": 0.4759717991044667, + "learning_rate": 4.904224585619519e-07, + "loss": 0.5368, + "step": 18034 + }, + { + "epoch": 4.787733970529669, + "grad_norm": 0.486806039995556, + "learning_rate": 4.902147967244342e-07, + "loss": 0.5312, + "step": 18035 + }, + { + "epoch": 4.787999469003053, + "grad_norm": 0.4751445623996909, + "learning_rate": 4.90007174082649e-07, + "loss": 0.5307, + "step": 18036 + }, + { + "epoch": 4.788264967476437, + "grad_norm": 0.46700504255277153, + "learning_rate": 4.897995906406453e-07, + "loss": 0.5372, + "step": 18037 + }, + { + "epoch": 4.788530465949821, + "grad_norm": 0.49595746518074574, + "learning_rate": 4.895920464024703e-07, + "loss": 0.5552, + "step": 18038 + }, + { + "epoch": 4.788795964423205, + "grad_norm": 0.48252636881136807, + "learning_rate": 4.893845413721732e-07, + "loss": 0.4963, + "step": 18039 + }, + { + "epoch": 4.789061462896588, + "grad_norm": 0.4817440671839129, + "learning_rate": 4.891770755538008e-07, + "loss": 0.5383, + "step": 18040 + }, + { + "epoch": 4.789326961369972, + "grad_norm": 0.48808918807872487, + "learning_rate": 4.889696489513987e-07, + "loss": 0.5328, + "step": 18041 + }, + { + "epoch": 4.789592459843356, + "grad_norm": 0.48080614391787435, + "learning_rate": 4.887622615690124e-07, + "loss": 0.5383, + "step": 18042 + }, + { + "epoch": 4.789857958316739, + "grad_norm": 0.48891648476418476, + "learning_rate": 4.885549134106859e-07, + "loss": 0.534, + "step": 18043 + }, + { + "epoch": 4.790123456790123, + "grad_norm": 0.48198921729106275, + "learning_rate": 4.883476044804638e-07, + "loss": 0.5147, + "step": 18044 + }, + { + "epoch": 4.7903889552635075, + "grad_norm": 0.4780619129473246, + "learning_rate": 4.881403347823885e-07, + "loss": 0.5287, + "step": 18045 + }, + { + "epoch": 4.790654453736891, + "grad_norm": 0.47193975189744247, + "learning_rate": 4.879331043205029e-07, + "loss": 0.5189, + "step": 18046 + }, + { + "epoch": 4.790919952210275, + "grad_norm": 0.49309846216577247, + "learning_rate": 4.877259130988482e-07, + "loss": 0.5391, + "step": 18047 + }, + { + "epoch": 4.791185450683659, + "grad_norm": 0.47887047798048377, + "learning_rate": 4.875187611214644e-07, + "loss": 0.5262, + "step": 18048 + }, + { + "epoch": 4.791450949157042, + "grad_norm": 0.49074135708434174, + "learning_rate": 4.873116483923929e-07, + "loss": 0.5474, + "step": 18049 + }, + { + "epoch": 4.791716447630426, + "grad_norm": 0.4752380108267149, + "learning_rate": 4.871045749156713e-07, + "loss": 0.5517, + "step": 18050 + }, + { + "epoch": 4.79198194610381, + "grad_norm": 0.4847861712212924, + "learning_rate": 4.8689754069534e-07, + "loss": 0.5216, + "step": 18051 + }, + { + "epoch": 4.792247444577193, + "grad_norm": 0.47164673399218426, + "learning_rate": 4.866905457354346e-07, + "loss": 0.4983, + "step": 18052 + }, + { + "epoch": 4.7925129430505775, + "grad_norm": 0.486669600177474, + "learning_rate": 4.864835900399945e-07, + "loss": 0.558, + "step": 18053 + }, + { + "epoch": 4.792778441523962, + "grad_norm": 0.4703553359373661, + "learning_rate": 4.862766736130528e-07, + "loss": 0.4913, + "step": 18054 + }, + { + "epoch": 4.793043939997345, + "grad_norm": 0.47885172037638307, + "learning_rate": 4.860697964586472e-07, + "loss": 0.5278, + "step": 18055 + }, + { + "epoch": 4.793309438470729, + "grad_norm": 0.47643238443238634, + "learning_rate": 4.858629585808109e-07, + "loss": 0.5497, + "step": 18056 + }, + { + "epoch": 4.793574936944113, + "grad_norm": 0.4805325425023738, + "learning_rate": 4.856561599835787e-07, + "loss": 0.5053, + "step": 18057 + }, + { + "epoch": 4.793840435417496, + "grad_norm": 0.493304177954982, + "learning_rate": 4.854494006709831e-07, + "loss": 0.5622, + "step": 18058 + }, + { + "epoch": 4.79410593389088, + "grad_norm": 0.48153951469383405, + "learning_rate": 4.852426806470573e-07, + "loss": 0.5379, + "step": 18059 + }, + { + "epoch": 4.794371432364263, + "grad_norm": 0.47106527057176684, + "learning_rate": 4.850359999158319e-07, + "loss": 0.5132, + "step": 18060 + }, + { + "epoch": 4.7946369308376475, + "grad_norm": 0.49692382849406064, + "learning_rate": 4.848293584813377e-07, + "loss": 0.561, + "step": 18061 + }, + { + "epoch": 4.794902429311032, + "grad_norm": 0.48447572619569007, + "learning_rate": 4.846227563476052e-07, + "loss": 0.5436, + "step": 18062 + }, + { + "epoch": 4.795167927784416, + "grad_norm": 0.4788628276328648, + "learning_rate": 4.844161935186631e-07, + "loss": 0.5347, + "step": 18063 + }, + { + "epoch": 4.795433426257799, + "grad_norm": 0.49826638870186746, + "learning_rate": 4.842096699985408e-07, + "loss": 0.5487, + "step": 18064 + }, + { + "epoch": 4.795698924731183, + "grad_norm": 0.4830054635995838, + "learning_rate": 4.840031857912655e-07, + "loss": 0.5362, + "step": 18065 + }, + { + "epoch": 4.795964423204566, + "grad_norm": 0.4803891413626469, + "learning_rate": 4.837967409008634e-07, + "loss": 0.5377, + "step": 18066 + }, + { + "epoch": 4.79622992167795, + "grad_norm": 0.4811342475177499, + "learning_rate": 4.83590335331362e-07, + "loss": 0.546, + "step": 18067 + }, + { + "epoch": 4.796495420151334, + "grad_norm": 0.4941977940912998, + "learning_rate": 4.833839690867853e-07, + "loss": 0.56, + "step": 18068 + }, + { + "epoch": 4.7967609186247175, + "grad_norm": 0.4926254723278097, + "learning_rate": 4.831776421711595e-07, + "loss": 0.5667, + "step": 18069 + }, + { + "epoch": 4.797026417098102, + "grad_norm": 0.4862035652341599, + "learning_rate": 4.829713545885078e-07, + "loss": 0.5446, + "step": 18070 + }, + { + "epoch": 4.797291915571486, + "grad_norm": 0.47240278887666426, + "learning_rate": 4.827651063428532e-07, + "loss": 0.5546, + "step": 18071 + }, + { + "epoch": 4.797557414044869, + "grad_norm": 0.47790974443906487, + "learning_rate": 4.825588974382175e-07, + "loss": 0.54, + "step": 18072 + }, + { + "epoch": 4.797822912518253, + "grad_norm": 0.4864142006364324, + "learning_rate": 4.823527278786233e-07, + "loss": 0.5436, + "step": 18073 + }, + { + "epoch": 4.798088410991637, + "grad_norm": 0.4857802115007357, + "learning_rate": 4.821465976680903e-07, + "loss": 0.5475, + "step": 18074 + }, + { + "epoch": 4.79835390946502, + "grad_norm": 0.47750188603211785, + "learning_rate": 4.8194050681064e-07, + "loss": 0.5019, + "step": 18075 + }, + { + "epoch": 4.798619407938404, + "grad_norm": 0.49657485060990497, + "learning_rate": 4.81734455310291e-07, + "loss": 0.5543, + "step": 18076 + }, + { + "epoch": 4.798884906411788, + "grad_norm": 0.4702482105271751, + "learning_rate": 4.815284431710606e-07, + "loss": 0.5354, + "step": 18077 + }, + { + "epoch": 4.799150404885172, + "grad_norm": 0.485621242238804, + "learning_rate": 4.813224703969685e-07, + "loss": 0.5384, + "step": 18078 + }, + { + "epoch": 4.799415903358556, + "grad_norm": 0.4777676597048149, + "learning_rate": 4.811165369920304e-07, + "loss": 0.507, + "step": 18079 + }, + { + "epoch": 4.79968140183194, + "grad_norm": 0.4695559184168845, + "learning_rate": 4.809106429602633e-07, + "loss": 0.5221, + "step": 18080 + }, + { + "epoch": 4.799946900305323, + "grad_norm": 0.480461962549017, + "learning_rate": 4.807047883056815e-07, + "loss": 0.5271, + "step": 18081 + }, + { + "epoch": 4.800212398778707, + "grad_norm": 0.4806052795296357, + "learning_rate": 4.80498973032302e-07, + "loss": 0.5353, + "step": 18082 + }, + { + "epoch": 4.800477897252091, + "grad_norm": 0.4830690033753034, + "learning_rate": 4.802931971441354e-07, + "loss": 0.5561, + "step": 18083 + }, + { + "epoch": 4.800743395725474, + "grad_norm": 0.47219807663387287, + "learning_rate": 4.800874606451969e-07, + "loss": 0.5156, + "step": 18084 + }, + { + "epoch": 4.8010088941988585, + "grad_norm": 0.47760549244543227, + "learning_rate": 4.798817635394992e-07, + "loss": 0.5117, + "step": 18085 + }, + { + "epoch": 4.8012743926722425, + "grad_norm": 0.4711090619099747, + "learning_rate": 4.796761058310526e-07, + "loss": 0.5038, + "step": 18086 + }, + { + "epoch": 4.801539891145626, + "grad_norm": 0.4746281417910265, + "learning_rate": 4.79470487523869e-07, + "loss": 0.5375, + "step": 18087 + }, + { + "epoch": 4.80180538961901, + "grad_norm": 0.4950114929714865, + "learning_rate": 4.792649086219577e-07, + "loss": 0.5309, + "step": 18088 + }, + { + "epoch": 4.802070888092393, + "grad_norm": 0.4679886885819684, + "learning_rate": 4.790593691293288e-07, + "loss": 0.5022, + "step": 18089 + }, + { + "epoch": 4.802336386565777, + "grad_norm": 0.486960213970011, + "learning_rate": 4.788538690499894e-07, + "loss": 0.5343, + "step": 18090 + }, + { + "epoch": 4.802601885039161, + "grad_norm": 0.48447050951531645, + "learning_rate": 4.786484083879491e-07, + "loss": 0.5126, + "step": 18091 + }, + { + "epoch": 4.802867383512545, + "grad_norm": 0.482871808561654, + "learning_rate": 4.784429871472132e-07, + "loss": 0.5421, + "step": 18092 + }, + { + "epoch": 4.8031328819859285, + "grad_norm": 0.4792006832898944, + "learning_rate": 4.782376053317891e-07, + "loss": 0.5278, + "step": 18093 + }, + { + "epoch": 4.8033983804593126, + "grad_norm": 0.4970194550934078, + "learning_rate": 4.780322629456822e-07, + "loss": 0.5523, + "step": 18094 + }, + { + "epoch": 4.803663878932696, + "grad_norm": 0.49029906821320396, + "learning_rate": 4.778269599928959e-07, + "loss": 0.5303, + "step": 18095 + }, + { + "epoch": 4.80392937740608, + "grad_norm": 0.4867039217964488, + "learning_rate": 4.776216964774355e-07, + "loss": 0.5106, + "step": 18096 + }, + { + "epoch": 4.804194875879464, + "grad_norm": 0.4779262534036763, + "learning_rate": 4.774164724033034e-07, + "loss": 0.5185, + "step": 18097 + }, + { + "epoch": 4.804460374352847, + "grad_norm": 0.4814860079155979, + "learning_rate": 4.772112877745025e-07, + "loss": 0.5403, + "step": 18098 + }, + { + "epoch": 4.804725872826231, + "grad_norm": 0.4716925891695341, + "learning_rate": 4.770061425950345e-07, + "loss": 0.5453, + "step": 18099 + }, + { + "epoch": 4.804991371299615, + "grad_norm": 0.4882553779601527, + "learning_rate": 4.7680103686889967e-07, + "loss": 0.5207, + "step": 18100 + }, + { + "epoch": 4.8052568697729985, + "grad_norm": 0.4881965131752467, + "learning_rate": 4.765959706000975e-07, + "loss": 0.5573, + "step": 18101 + }, + { + "epoch": 4.805522368246383, + "grad_norm": 0.47582457175955745, + "learning_rate": 4.7639094379262895e-07, + "loss": 0.5185, + "step": 18102 + }, + { + "epoch": 4.805787866719767, + "grad_norm": 0.4735654749075319, + "learning_rate": 4.7618595645049085e-07, + "loss": 0.5081, + "step": 18103 + }, + { + "epoch": 4.80605336519315, + "grad_norm": 0.47415381038675497, + "learning_rate": 4.7598100857768225e-07, + "loss": 0.5401, + "step": 18104 + }, + { + "epoch": 4.806318863666534, + "grad_norm": 0.4973999258339989, + "learning_rate": 4.7577610017819993e-07, + "loss": 0.5381, + "step": 18105 + }, + { + "epoch": 4.806584362139918, + "grad_norm": 0.48847274027990056, + "learning_rate": 4.75571231256039e-07, + "loss": 0.559, + "step": 18106 + }, + { + "epoch": 4.806849860613301, + "grad_norm": 0.4836283631469511, + "learning_rate": 4.753664018151963e-07, + "loss": 0.5516, + "step": 18107 + }, + { + "epoch": 4.807115359086685, + "grad_norm": 0.47401409668649647, + "learning_rate": 4.7516161185966554e-07, + "loss": 0.5243, + "step": 18108 + }, + { + "epoch": 4.807380857560069, + "grad_norm": 0.47440283071387174, + "learning_rate": 4.749568613934413e-07, + "loss": 0.5457, + "step": 18109 + }, + { + "epoch": 4.807646356033453, + "grad_norm": 0.48993423402398245, + "learning_rate": 4.74752150420516e-07, + "loss": 0.5681, + "step": 18110 + }, + { + "epoch": 4.807911854506837, + "grad_norm": 0.48264029038372586, + "learning_rate": 4.7454747894488313e-07, + "loss": 0.5447, + "step": 18111 + }, + { + "epoch": 4.80817735298022, + "grad_norm": 0.4778735567774687, + "learning_rate": 4.743428469705336e-07, + "loss": 0.5359, + "step": 18112 + }, + { + "epoch": 4.808442851453604, + "grad_norm": 0.5071405805350742, + "learning_rate": 4.741382545014572e-07, + "loss": 0.5273, + "step": 18113 + }, + { + "epoch": 4.808708349926988, + "grad_norm": 0.4716033665844714, + "learning_rate": 4.739337015416462e-07, + "loss": 0.5478, + "step": 18114 + }, + { + "epoch": 4.808973848400372, + "grad_norm": 0.47590538175302516, + "learning_rate": 4.7372918809508756e-07, + "loss": 0.5211, + "step": 18115 + }, + { + "epoch": 4.809239346873755, + "grad_norm": 0.49302474112585337, + "learning_rate": 4.735247141657726e-07, + "loss": 0.532, + "step": 18116 + }, + { + "epoch": 4.809504845347139, + "grad_norm": 0.48734258933003977, + "learning_rate": 4.733202797576858e-07, + "loss": 0.567, + "step": 18117 + }, + { + "epoch": 4.809770343820523, + "grad_norm": 0.48143797998846577, + "learning_rate": 4.731158848748163e-07, + "loss": 0.5396, + "step": 18118 + }, + { + "epoch": 4.810035842293907, + "grad_norm": 0.4824228712274929, + "learning_rate": 4.729115295211492e-07, + "loss": 0.541, + "step": 18119 + }, + { + "epoch": 4.810301340767291, + "grad_norm": 0.4867457179269042, + "learning_rate": 4.727072137006714e-07, + "loss": 0.5172, + "step": 18120 + }, + { + "epoch": 4.810566839240675, + "grad_norm": 0.48854723419454854, + "learning_rate": 4.7250293741736563e-07, + "loss": 0.5345, + "step": 18121 + }, + { + "epoch": 4.810832337714058, + "grad_norm": 0.4768223803871091, + "learning_rate": 4.722987006752172e-07, + "loss": 0.5511, + "step": 18122 + }, + { + "epoch": 4.811097836187442, + "grad_norm": 0.49535996872987526, + "learning_rate": 4.72094503478209e-07, + "loss": 0.562, + "step": 18123 + }, + { + "epoch": 4.811363334660825, + "grad_norm": 0.48268802670533223, + "learning_rate": 4.718903458303226e-07, + "loss": 0.5443, + "step": 18124 + }, + { + "epoch": 4.811628833134209, + "grad_norm": 0.48018821427385866, + "learning_rate": 4.716862277355408e-07, + "loss": 0.5274, + "step": 18125 + }, + { + "epoch": 4.8118943316075935, + "grad_norm": 0.48088665841504524, + "learning_rate": 4.7148214919784275e-07, + "loss": 0.5124, + "step": 18126 + }, + { + "epoch": 4.812159830080977, + "grad_norm": 0.48176295696429045, + "learning_rate": 4.712781102212105e-07, + "loss": 0.543, + "step": 18127 + }, + { + "epoch": 4.812425328554361, + "grad_norm": 0.4982674844525797, + "learning_rate": 4.71074110809622e-07, + "loss": 0.513, + "step": 18128 + }, + { + "epoch": 4.812690827027745, + "grad_norm": 0.4677446549873634, + "learning_rate": 4.7087015096705603e-07, + "loss": 0.543, + "step": 18129 + }, + { + "epoch": 4.812956325501128, + "grad_norm": 0.4939002897779738, + "learning_rate": 4.706662306974899e-07, + "loss": 0.5991, + "step": 18130 + }, + { + "epoch": 4.813221823974512, + "grad_norm": 0.4853412190363089, + "learning_rate": 4.704623500049005e-07, + "loss": 0.5319, + "step": 18131 + }, + { + "epoch": 4.813487322447896, + "grad_norm": 0.49305192430154066, + "learning_rate": 4.702585088932654e-07, + "loss": 0.5475, + "step": 18132 + }, + { + "epoch": 4.813752820921279, + "grad_norm": 0.4815837904848243, + "learning_rate": 4.7005470736655846e-07, + "loss": 0.4847, + "step": 18133 + }, + { + "epoch": 4.8140183193946635, + "grad_norm": 0.4799775425045277, + "learning_rate": 4.698509454287559e-07, + "loss": 0.5442, + "step": 18134 + }, + { + "epoch": 4.814283817868048, + "grad_norm": 0.47966064458161495, + "learning_rate": 4.696472230838295e-07, + "loss": 0.5331, + "step": 18135 + }, + { + "epoch": 4.814549316341431, + "grad_norm": 0.4737009868692425, + "learning_rate": 4.694435403357539e-07, + "loss": 0.484, + "step": 18136 + }, + { + "epoch": 4.814814814814815, + "grad_norm": 0.4791416821208662, + "learning_rate": 4.692398971885004e-07, + "loss": 0.5481, + "step": 18137 + }, + { + "epoch": 4.815080313288199, + "grad_norm": 0.4732011119990352, + "learning_rate": 4.690362936460416e-07, + "loss": 0.5393, + "step": 18138 + }, + { + "epoch": 4.815345811761582, + "grad_norm": 0.4827608154763677, + "learning_rate": 4.6883272971234766e-07, + "loss": 0.5269, + "step": 18139 + }, + { + "epoch": 4.815611310234966, + "grad_norm": 0.47441625978358054, + "learning_rate": 4.686292053913882e-07, + "loss": 0.5322, + "step": 18140 + }, + { + "epoch": 4.815876808708349, + "grad_norm": 0.47989421335478794, + "learning_rate": 4.684257206871334e-07, + "loss": 0.4774, + "step": 18141 + }, + { + "epoch": 4.8161423071817335, + "grad_norm": 0.48865268856326216, + "learning_rate": 4.6822227560355067e-07, + "loss": 0.5489, + "step": 18142 + }, + { + "epoch": 4.816407805655118, + "grad_norm": 0.47886811139050633, + "learning_rate": 4.6801887014460897e-07, + "loss": 0.5264, + "step": 18143 + }, + { + "epoch": 4.816673304128502, + "grad_norm": 0.4683115566017144, + "learning_rate": 4.678155043142735e-07, + "loss": 0.4857, + "step": 18144 + }, + { + "epoch": 4.816938802601885, + "grad_norm": 0.48007986453529333, + "learning_rate": 4.676121781165127e-07, + "loss": 0.5118, + "step": 18145 + }, + { + "epoch": 4.817204301075269, + "grad_norm": 0.47362498224390864, + "learning_rate": 4.6740889155528936e-07, + "loss": 0.5381, + "step": 18146 + }, + { + "epoch": 4.817469799548652, + "grad_norm": 0.4824805111585414, + "learning_rate": 4.672056446345699e-07, + "loss": 0.5145, + "step": 18147 + }, + { + "epoch": 4.817735298022036, + "grad_norm": 0.5045695610769937, + "learning_rate": 4.6700243735831705e-07, + "loss": 0.5677, + "step": 18148 + }, + { + "epoch": 4.81800079649542, + "grad_norm": 0.4862918137323932, + "learning_rate": 4.6679926973049463e-07, + "loss": 0.5316, + "step": 18149 + }, + { + "epoch": 4.8182662949688035, + "grad_norm": 0.47055863785523566, + "learning_rate": 4.6659614175506406e-07, + "loss": 0.525, + "step": 18150 + }, + { + "epoch": 4.818531793442188, + "grad_norm": 0.4855345887166264, + "learning_rate": 4.6639305343598813e-07, + "loss": 0.5414, + "step": 18151 + }, + { + "epoch": 4.818797291915572, + "grad_norm": 0.47851235252749896, + "learning_rate": 4.6619000477722667e-07, + "loss": 0.5441, + "step": 18152 + }, + { + "epoch": 4.819062790388955, + "grad_norm": 0.4824769438137112, + "learning_rate": 4.65986995782739e-07, + "loss": 0.5338, + "step": 18153 + }, + { + "epoch": 4.819328288862339, + "grad_norm": 0.4837672136707721, + "learning_rate": 4.6578402645648583e-07, + "loss": 0.5603, + "step": 18154 + }, + { + "epoch": 4.819593787335723, + "grad_norm": 0.4899059287755638, + "learning_rate": 4.655810968024241e-07, + "loss": 0.5138, + "step": 18155 + }, + { + "epoch": 4.819859285809106, + "grad_norm": 0.46850423650454737, + "learning_rate": 4.6537820682451273e-07, + "loss": 0.4995, + "step": 18156 + }, + { + "epoch": 4.82012478428249, + "grad_norm": 0.48479285382385445, + "learning_rate": 4.6517535652670783e-07, + "loss": 0.5392, + "step": 18157 + }, + { + "epoch": 4.8203902827558744, + "grad_norm": 0.4790650207220103, + "learning_rate": 4.64972545912965e-07, + "loss": 0.5581, + "step": 18158 + }, + { + "epoch": 4.820655781229258, + "grad_norm": 0.4836268539881204, + "learning_rate": 4.6476977498724096e-07, + "loss": 0.5591, + "step": 18159 + }, + { + "epoch": 4.820921279702642, + "grad_norm": 0.47822459089724667, + "learning_rate": 4.6456704375348854e-07, + "loss": 0.5117, + "step": 18160 + }, + { + "epoch": 4.821186778176026, + "grad_norm": 0.4699591622068747, + "learning_rate": 4.6436435221566324e-07, + "loss": 0.5097, + "step": 18161 + }, + { + "epoch": 4.821452276649409, + "grad_norm": 0.48198029505544215, + "learning_rate": 4.6416170037771684e-07, + "loss": 0.512, + "step": 18162 + }, + { + "epoch": 4.821717775122793, + "grad_norm": 0.47489210279638394, + "learning_rate": 4.639590882436021e-07, + "loss": 0.5431, + "step": 18163 + }, + { + "epoch": 4.821983273596177, + "grad_norm": 0.4809508372540027, + "learning_rate": 4.6375651581726967e-07, + "loss": 0.5129, + "step": 18164 + }, + { + "epoch": 4.82224877206956, + "grad_norm": 0.4776208445584323, + "learning_rate": 4.6355398310267113e-07, + "loss": 0.543, + "step": 18165 + }, + { + "epoch": 4.8225142705429445, + "grad_norm": 0.4843297757247386, + "learning_rate": 4.633514901037556e-07, + "loss": 0.5362, + "step": 18166 + }, + { + "epoch": 4.8227797690163285, + "grad_norm": 0.4824303225651672, + "learning_rate": 4.6314903682447314e-07, + "loss": 0.5366, + "step": 18167 + }, + { + "epoch": 4.823045267489712, + "grad_norm": 0.48605894939820843, + "learning_rate": 4.6294662326877153e-07, + "loss": 0.5469, + "step": 18168 + }, + { + "epoch": 4.823310765963096, + "grad_norm": 0.48139663113332626, + "learning_rate": 4.627442494405976e-07, + "loss": 0.5571, + "step": 18169 + }, + { + "epoch": 4.823576264436479, + "grad_norm": 0.4927290232874083, + "learning_rate": 4.625419153438998e-07, + "loss": 0.5409, + "step": 18170 + }, + { + "epoch": 4.823841762909863, + "grad_norm": 0.4734861802890927, + "learning_rate": 4.6233962098262227e-07, + "loss": 0.53, + "step": 18171 + }, + { + "epoch": 4.824107261383247, + "grad_norm": 0.47913699765607326, + "learning_rate": 4.6213736636071206e-07, + "loss": 0.5605, + "step": 18172 + }, + { + "epoch": 4.824372759856631, + "grad_norm": 0.4847538285866058, + "learning_rate": 4.619351514821119e-07, + "loss": 0.5456, + "step": 18173 + }, + { + "epoch": 4.8246382583300145, + "grad_norm": 0.47324927929082805, + "learning_rate": 4.6173297635076777e-07, + "loss": 0.5267, + "step": 18174 + }, + { + "epoch": 4.824903756803399, + "grad_norm": 0.4770241092552351, + "learning_rate": 4.6153084097061956e-07, + "loss": 0.546, + "step": 18175 + }, + { + "epoch": 4.825169255276782, + "grad_norm": 0.48818549571127556, + "learning_rate": 4.613287453456111e-07, + "loss": 0.5433, + "step": 18176 + }, + { + "epoch": 4.825434753750166, + "grad_norm": 0.48018850705115834, + "learning_rate": 4.611266894796845e-07, + "loss": 0.5441, + "step": 18177 + }, + { + "epoch": 4.82570025222355, + "grad_norm": 0.4684016441834493, + "learning_rate": 4.609246733767786e-07, + "loss": 0.5234, + "step": 18178 + }, + { + "epoch": 4.825965750696933, + "grad_norm": 0.47033956474640726, + "learning_rate": 4.607226970408349e-07, + "loss": 0.5131, + "step": 18179 + }, + { + "epoch": 4.826231249170317, + "grad_norm": 0.48658010291152065, + "learning_rate": 4.6052076047579164e-07, + "loss": 0.5303, + "step": 18180 + }, + { + "epoch": 4.826496747643701, + "grad_norm": 0.48793584579626553, + "learning_rate": 4.603188636855871e-07, + "loss": 0.5437, + "step": 18181 + }, + { + "epoch": 4.8267622461170845, + "grad_norm": 0.47445806267992957, + "learning_rate": 4.601170066741578e-07, + "loss": 0.5219, + "step": 18182 + }, + { + "epoch": 4.827027744590469, + "grad_norm": 0.4749005446564915, + "learning_rate": 4.5991518944544205e-07, + "loss": 0.5219, + "step": 18183 + }, + { + "epoch": 4.827293243063853, + "grad_norm": 0.4817087833539402, + "learning_rate": 4.597134120033747e-07, + "loss": 0.5262, + "step": 18184 + }, + { + "epoch": 4.827558741537236, + "grad_norm": 0.4760468240128116, + "learning_rate": 4.595116743518918e-07, + "loss": 0.5211, + "step": 18185 + }, + { + "epoch": 4.82782424001062, + "grad_norm": 0.4866302088437665, + "learning_rate": 4.5930997649492714e-07, + "loss": 0.5233, + "step": 18186 + }, + { + "epoch": 4.828089738484004, + "grad_norm": 0.49661268674400105, + "learning_rate": 4.5910831843641395e-07, + "loss": 0.5282, + "step": 18187 + }, + { + "epoch": 4.828355236957387, + "grad_norm": 0.48557770498767167, + "learning_rate": 4.5890670018028606e-07, + "loss": 0.5426, + "step": 18188 + }, + { + "epoch": 4.828620735430771, + "grad_norm": 0.4830864070788451, + "learning_rate": 4.5870512173047443e-07, + "loss": 0.5402, + "step": 18189 + }, + { + "epoch": 4.828886233904155, + "grad_norm": 0.49150802696870766, + "learning_rate": 4.5850358309091126e-07, + "loss": 0.5529, + "step": 18190 + }, + { + "epoch": 4.829151732377539, + "grad_norm": 0.4957062608003524, + "learning_rate": 4.5830208426552695e-07, + "loss": 0.5399, + "step": 18191 + }, + { + "epoch": 4.829417230850923, + "grad_norm": 0.48304110841302, + "learning_rate": 4.5810062525825065e-07, + "loss": 0.5359, + "step": 18192 + }, + { + "epoch": 4.829682729324307, + "grad_norm": 0.4891686879486978, + "learning_rate": 4.5789920607301114e-07, + "loss": 0.5316, + "step": 18193 + }, + { + "epoch": 4.82994822779769, + "grad_norm": 0.47356423400101555, + "learning_rate": 4.5769782671373773e-07, + "loss": 0.5076, + "step": 18194 + }, + { + "epoch": 4.830213726271074, + "grad_norm": 0.47515981423606446, + "learning_rate": 4.574964871843562e-07, + "loss": 0.5132, + "step": 18195 + }, + { + "epoch": 4.830479224744458, + "grad_norm": 0.4750746730522271, + "learning_rate": 4.572951874887943e-07, + "loss": 0.5486, + "step": 18196 + }, + { + "epoch": 4.830744723217841, + "grad_norm": 0.49365837543658897, + "learning_rate": 4.570939276309791e-07, + "loss": 0.5379, + "step": 18197 + }, + { + "epoch": 4.831010221691225, + "grad_norm": 0.4878811430545162, + "learning_rate": 4.568927076148327e-07, + "loss": 0.5298, + "step": 18198 + }, + { + "epoch": 4.831275720164609, + "grad_norm": 0.4853627979123752, + "learning_rate": 4.566915274442815e-07, + "loss": 0.53, + "step": 18199 + }, + { + "epoch": 4.831541218637993, + "grad_norm": 0.46725238031499355, + "learning_rate": 4.564903871232479e-07, + "loss": 0.5162, + "step": 18200 + }, + { + "epoch": 4.831806717111377, + "grad_norm": 0.48373388527294836, + "learning_rate": 4.562892866556556e-07, + "loss": 0.5325, + "step": 18201 + }, + { + "epoch": 4.832072215584761, + "grad_norm": 0.4724450341628017, + "learning_rate": 4.560882260454255e-07, + "loss": 0.5185, + "step": 18202 + }, + { + "epoch": 4.832337714058144, + "grad_norm": 0.4849813928971904, + "learning_rate": 4.5588720529647997e-07, + "loss": 0.5363, + "step": 18203 + }, + { + "epoch": 4.832603212531528, + "grad_norm": 0.4913263352483113, + "learning_rate": 4.5568622441273887e-07, + "loss": 0.5062, + "step": 18204 + }, + { + "epoch": 4.832868711004911, + "grad_norm": 0.4870198405355169, + "learning_rate": 4.5548528339812105e-07, + "loss": 0.5378, + "step": 18205 + }, + { + "epoch": 4.833134209478295, + "grad_norm": 0.48662564170345235, + "learning_rate": 4.552843822565467e-07, + "loss": 0.5668, + "step": 18206 + }, + { + "epoch": 4.8333997079516795, + "grad_norm": 0.46700308405032365, + "learning_rate": 4.550835209919327e-07, + "loss": 0.5194, + "step": 18207 + }, + { + "epoch": 4.833665206425063, + "grad_norm": 0.48875253438239225, + "learning_rate": 4.548826996081973e-07, + "loss": 0.544, + "step": 18208 + }, + { + "epoch": 4.833930704898447, + "grad_norm": 0.49350496050109544, + "learning_rate": 4.5468191810925657e-07, + "loss": 0.4957, + "step": 18209 + }, + { + "epoch": 4.834196203371831, + "grad_norm": 0.4853233057802422, + "learning_rate": 4.5448117649902645e-07, + "loss": 0.5202, + "step": 18210 + }, + { + "epoch": 4.834461701845214, + "grad_norm": 0.4788494891193707, + "learning_rate": 4.5428047478142086e-07, + "loss": 0.5495, + "step": 18211 + }, + { + "epoch": 4.834727200318598, + "grad_norm": 0.4712102044167938, + "learning_rate": 4.540798129603555e-07, + "loss": 0.4849, + "step": 18212 + }, + { + "epoch": 4.834992698791982, + "grad_norm": 0.49614897070249664, + "learning_rate": 4.538791910397422e-07, + "loss": 0.5137, + "step": 18213 + }, + { + "epoch": 4.835258197265365, + "grad_norm": 0.4921753917728957, + "learning_rate": 4.5367860902349543e-07, + "loss": 0.5471, + "step": 18214 + }, + { + "epoch": 4.8355236957387495, + "grad_norm": 0.48425578510103867, + "learning_rate": 4.5347806691552586e-07, + "loss": 0.5448, + "step": 18215 + }, + { + "epoch": 4.835789194212134, + "grad_norm": 0.49339433617640477, + "learning_rate": 4.5327756471974406e-07, + "loss": 0.5368, + "step": 18216 + }, + { + "epoch": 4.836054692685517, + "grad_norm": 0.49204854645679075, + "learning_rate": 4.5307710244006183e-07, + "loss": 0.5206, + "step": 18217 + }, + { + "epoch": 4.836320191158901, + "grad_norm": 0.48904457464597656, + "learning_rate": 4.5287668008038686e-07, + "loss": 0.5722, + "step": 18218 + }, + { + "epoch": 4.836585689632285, + "grad_norm": 0.4871505466359667, + "learning_rate": 4.5267629764462973e-07, + "loss": 0.5415, + "step": 18219 + }, + { + "epoch": 4.836851188105668, + "grad_norm": 0.4803969649868923, + "learning_rate": 4.524759551366975e-07, + "loss": 0.5432, + "step": 18220 + }, + { + "epoch": 4.837116686579052, + "grad_norm": 0.46756671668052974, + "learning_rate": 4.522756525604968e-07, + "loss": 0.5456, + "step": 18221 + }, + { + "epoch": 4.837382185052436, + "grad_norm": 0.4832044148201414, + "learning_rate": 4.520753899199351e-07, + "loss": 0.5396, + "step": 18222 + }, + { + "epoch": 4.8376476835258195, + "grad_norm": 0.4796197062455762, + "learning_rate": 4.518751672189167e-07, + "loss": 0.5299, + "step": 18223 + }, + { + "epoch": 4.837913181999204, + "grad_norm": 0.48019675896819974, + "learning_rate": 4.51674984461348e-07, + "loss": 0.5643, + "step": 18224 + }, + { + "epoch": 4.838178680472588, + "grad_norm": 0.4947748731846147, + "learning_rate": 4.5147484165113163e-07, + "loss": 0.5403, + "step": 18225 + }, + { + "epoch": 4.838444178945971, + "grad_norm": 0.4802050819031008, + "learning_rate": 4.5127473879217287e-07, + "loss": 0.5214, + "step": 18226 + }, + { + "epoch": 4.838709677419355, + "grad_norm": 0.4899538452008818, + "learning_rate": 4.510746758883713e-07, + "loss": 0.521, + "step": 18227 + }, + { + "epoch": 4.838975175892738, + "grad_norm": 0.49285298244474074, + "learning_rate": 4.508746529436311e-07, + "loss": 0.5003, + "step": 18228 + }, + { + "epoch": 4.839240674366122, + "grad_norm": 0.4929044803721831, + "learning_rate": 4.506746699618514e-07, + "loss": 0.5715, + "step": 18229 + }, + { + "epoch": 4.839506172839506, + "grad_norm": 0.4745155841416195, + "learning_rate": 4.504747269469342e-07, + "loss": 0.5688, + "step": 18230 + }, + { + "epoch": 4.83977167131289, + "grad_norm": 0.46442873115919686, + "learning_rate": 4.502748239027771e-07, + "loss": 0.5297, + "step": 18231 + }, + { + "epoch": 4.840037169786274, + "grad_norm": 0.48352127910570664, + "learning_rate": 4.500749608332802e-07, + "loss": 0.5173, + "step": 18232 + }, + { + "epoch": 4.840302668259658, + "grad_norm": 0.49404479596608086, + "learning_rate": 4.4987513774234046e-07, + "loss": 0.5419, + "step": 18233 + }, + { + "epoch": 4.840568166733041, + "grad_norm": 0.47963627401077424, + "learning_rate": 4.4967535463385473e-07, + "loss": 0.5253, + "step": 18234 + }, + { + "epoch": 4.840833665206425, + "grad_norm": 0.4885439922256354, + "learning_rate": 4.494756115117202e-07, + "loss": 0.5473, + "step": 18235 + }, + { + "epoch": 4.841099163679809, + "grad_norm": 0.4849336342702325, + "learning_rate": 4.492759083798312e-07, + "loss": 0.5374, + "step": 18236 + }, + { + "epoch": 4.841364662153192, + "grad_norm": 0.4916676254833159, + "learning_rate": 4.490762452420841e-07, + "loss": 0.518, + "step": 18237 + }, + { + "epoch": 4.841630160626576, + "grad_norm": 0.4886490040434697, + "learning_rate": 4.4887662210237077e-07, + "loss": 0.5308, + "step": 18238 + }, + { + "epoch": 4.8418956590999604, + "grad_norm": 0.49726314273208444, + "learning_rate": 4.486770389645856e-07, + "loss": 0.5349, + "step": 18239 + }, + { + "epoch": 4.842161157573344, + "grad_norm": 0.47590246318515445, + "learning_rate": 4.484774958326202e-07, + "loss": 0.5099, + "step": 18240 + }, + { + "epoch": 4.842426656046728, + "grad_norm": 0.49860990709074327, + "learning_rate": 4.4827799271036726e-07, + "loss": 0.5487, + "step": 18241 + }, + { + "epoch": 4.842692154520112, + "grad_norm": 0.5076029422708265, + "learning_rate": 4.480785296017165e-07, + "loss": 0.5698, + "step": 18242 + }, + { + "epoch": 4.842957652993495, + "grad_norm": 0.4898359385744992, + "learning_rate": 4.4787910651055895e-07, + "loss": 0.5417, + "step": 18243 + }, + { + "epoch": 4.843223151466879, + "grad_norm": 0.47424842287744257, + "learning_rate": 4.4767972344078313e-07, + "loss": 0.501, + "step": 18244 + }, + { + "epoch": 4.843488649940263, + "grad_norm": 0.4814491146193889, + "learning_rate": 4.474803803962771e-07, + "loss": 0.5405, + "step": 18245 + }, + { + "epoch": 4.843754148413646, + "grad_norm": 0.4996965243954745, + "learning_rate": 4.472810773809297e-07, + "loss": 0.542, + "step": 18246 + }, + { + "epoch": 4.8440196468870305, + "grad_norm": 0.48664894160352984, + "learning_rate": 4.4708181439862664e-07, + "loss": 0.5078, + "step": 18247 + }, + { + "epoch": 4.8442851453604145, + "grad_norm": 0.49326884127997844, + "learning_rate": 4.4688259145325513e-07, + "loss": 0.5544, + "step": 18248 + }, + { + "epoch": 4.844550643833798, + "grad_norm": 0.4839033103331838, + "learning_rate": 4.466834085487001e-07, + "loss": 0.5566, + "step": 18249 + }, + { + "epoch": 4.844816142307182, + "grad_norm": 0.4892763481700645, + "learning_rate": 4.464842656888452e-07, + "loss": 0.5392, + "step": 18250 + }, + { + "epoch": 4.845081640780565, + "grad_norm": 0.4912769114930382, + "learning_rate": 4.4628516287757577e-07, + "loss": 0.5673, + "step": 18251 + }, + { + "epoch": 4.845347139253949, + "grad_norm": 0.48215085914139333, + "learning_rate": 4.4608610011877327e-07, + "loss": 0.5502, + "step": 18252 + }, + { + "epoch": 4.845612637727333, + "grad_norm": 0.46597944576406264, + "learning_rate": 4.4588707741632153e-07, + "loss": 0.544, + "step": 18253 + }, + { + "epoch": 4.845878136200717, + "grad_norm": 0.4766857587248487, + "learning_rate": 4.4568809477410017e-07, + "loss": 0.5166, + "step": 18254 + }, + { + "epoch": 4.8461436346741005, + "grad_norm": 0.4690858602459936, + "learning_rate": 4.4548915219599225e-07, + "loss": 0.5288, + "step": 18255 + }, + { + "epoch": 4.846409133147485, + "grad_norm": 0.4774485275070768, + "learning_rate": 4.452902496858749e-07, + "loss": 0.525, + "step": 18256 + }, + { + "epoch": 4.846674631620868, + "grad_norm": 0.4879781403870331, + "learning_rate": 4.4509138724762894e-07, + "loss": 0.5589, + "step": 18257 + }, + { + "epoch": 4.846940130094252, + "grad_norm": 0.5055840146162429, + "learning_rate": 4.4489256488513185e-07, + "loss": 0.5181, + "step": 18258 + }, + { + "epoch": 4.847205628567636, + "grad_norm": 0.4784207536330953, + "learning_rate": 4.446937826022621e-07, + "loss": 0.5294, + "step": 18259 + }, + { + "epoch": 4.84747112704102, + "grad_norm": 0.46506180789815005, + "learning_rate": 4.4449504040289556e-07, + "loss": 0.5426, + "step": 18260 + }, + { + "epoch": 4.847736625514403, + "grad_norm": 0.49062190296753627, + "learning_rate": 4.4429633829090826e-07, + "loss": 0.55, + "step": 18261 + }, + { + "epoch": 4.848002123987787, + "grad_norm": 0.4867807625462896, + "learning_rate": 4.44097676270176e-07, + "loss": 0.5539, + "step": 18262 + }, + { + "epoch": 4.8482676224611705, + "grad_norm": 0.4784792603668485, + "learning_rate": 4.4389905434457206e-07, + "loss": 0.5113, + "step": 18263 + }, + { + "epoch": 4.848533120934555, + "grad_norm": 0.48150939191993825, + "learning_rate": 4.437004725179714e-07, + "loss": 0.5505, + "step": 18264 + }, + { + "epoch": 4.848798619407939, + "grad_norm": 0.4807576879116201, + "learning_rate": 4.435019307942456e-07, + "loss": 0.535, + "step": 18265 + }, + { + "epoch": 4.849064117881322, + "grad_norm": 0.47467386767008, + "learning_rate": 4.4330342917726865e-07, + "loss": 0.5425, + "step": 18266 + }, + { + "epoch": 4.849329616354706, + "grad_norm": 0.47139180136038833, + "learning_rate": 4.431049676709093e-07, + "loss": 0.5323, + "step": 18267 + }, + { + "epoch": 4.84959511482809, + "grad_norm": 0.47395333030386544, + "learning_rate": 4.4290654627903913e-07, + "loss": 0.5135, + "step": 18268 + }, + { + "epoch": 4.849860613301473, + "grad_norm": 0.48729254602856314, + "learning_rate": 4.4270816500552873e-07, + "loss": 0.5611, + "step": 18269 + }, + { + "epoch": 4.850126111774857, + "grad_norm": 0.4905670327092857, + "learning_rate": 4.425098238542455e-07, + "loss": 0.5468, + "step": 18270 + }, + { + "epoch": 4.850391610248241, + "grad_norm": 0.4942800627082329, + "learning_rate": 4.423115228290589e-07, + "loss": 0.5341, + "step": 18271 + }, + { + "epoch": 4.850657108721625, + "grad_norm": 0.48776186386997894, + "learning_rate": 4.4211326193383556e-07, + "loss": 0.5501, + "step": 18272 + }, + { + "epoch": 4.850922607195009, + "grad_norm": 0.46830835397990495, + "learning_rate": 4.419150411724424e-07, + "loss": 0.5149, + "step": 18273 + }, + { + "epoch": 4.851188105668393, + "grad_norm": 0.4781424274544481, + "learning_rate": 4.417168605487443e-07, + "loss": 0.558, + "step": 18274 + }, + { + "epoch": 4.851453604141776, + "grad_norm": 0.47903613914742793, + "learning_rate": 4.4151872006660767e-07, + "loss": 0.5262, + "step": 18275 + }, + { + "epoch": 4.85171910261516, + "grad_norm": 0.46715317502719883, + "learning_rate": 4.413206197298953e-07, + "loss": 0.5515, + "step": 18276 + }, + { + "epoch": 4.851984601088544, + "grad_norm": 0.48665928929664454, + "learning_rate": 4.41122559542472e-07, + "loss": 0.5343, + "step": 18277 + }, + { + "epoch": 4.852250099561927, + "grad_norm": 0.49341714990139246, + "learning_rate": 4.4092453950819987e-07, + "loss": 0.5337, + "step": 18278 + }, + { + "epoch": 4.852515598035311, + "grad_norm": 0.48156720776399903, + "learning_rate": 4.407265596309399e-07, + "loss": 0.5337, + "step": 18279 + }, + { + "epoch": 4.852781096508695, + "grad_norm": 0.49431267943017476, + "learning_rate": 4.4052861991455477e-07, + "loss": 0.5211, + "step": 18280 + }, + { + "epoch": 4.853046594982079, + "grad_norm": 0.47494032367250116, + "learning_rate": 4.403307203629034e-07, + "loss": 0.5305, + "step": 18281 + }, + { + "epoch": 4.853312093455463, + "grad_norm": 0.4669439687539629, + "learning_rate": 4.4013286097984635e-07, + "loss": 0.4886, + "step": 18282 + }, + { + "epoch": 4.853577591928847, + "grad_norm": 0.4659243942496253, + "learning_rate": 4.399350417692422e-07, + "loss": 0.5417, + "step": 18283 + }, + { + "epoch": 4.85384309040223, + "grad_norm": 0.4699833696960174, + "learning_rate": 4.397372627349486e-07, + "loss": 0.5608, + "step": 18284 + }, + { + "epoch": 4.854108588875614, + "grad_norm": 0.48611024280735016, + "learning_rate": 4.39539523880822e-07, + "loss": 0.5893, + "step": 18285 + }, + { + "epoch": 4.854374087348997, + "grad_norm": 0.4727216185859443, + "learning_rate": 4.393418252107201e-07, + "loss": 0.5746, + "step": 18286 + }, + { + "epoch": 4.854639585822381, + "grad_norm": 0.4825396894258424, + "learning_rate": 4.3914416672849765e-07, + "loss": 0.5305, + "step": 18287 + }, + { + "epoch": 4.8549050842957655, + "grad_norm": 0.4631118347073883, + "learning_rate": 4.3894654843800955e-07, + "loss": 0.5195, + "step": 18288 + }, + { + "epoch": 4.855170582769149, + "grad_norm": 0.4815487419035392, + "learning_rate": 4.3874897034311135e-07, + "loss": 0.5249, + "step": 18289 + }, + { + "epoch": 4.855436081242533, + "grad_norm": 0.47901427907153116, + "learning_rate": 4.385514324476539e-07, + "loss": 0.5306, + "step": 18290 + }, + { + "epoch": 4.855701579715917, + "grad_norm": 0.4743473026236478, + "learning_rate": 4.383539347554916e-07, + "loss": 0.5406, + "step": 18291 + }, + { + "epoch": 4.8559670781893, + "grad_norm": 0.47672892382941806, + "learning_rate": 4.381564772704744e-07, + "loss": 0.5109, + "step": 18292 + }, + { + "epoch": 4.856232576662684, + "grad_norm": 0.4703273562898987, + "learning_rate": 4.37959059996455e-07, + "loss": 0.5247, + "step": 18293 + }, + { + "epoch": 4.856498075136068, + "grad_norm": 0.5000011467933769, + "learning_rate": 4.377616829372819e-07, + "loss": 0.5286, + "step": 18294 + }, + { + "epoch": 4.856763573609451, + "grad_norm": 0.47903690460561604, + "learning_rate": 4.3756434609680596e-07, + "loss": 0.5193, + "step": 18295 + }, + { + "epoch": 4.8570290720828355, + "grad_norm": 0.4900974222353822, + "learning_rate": 4.3736704947887484e-07, + "loss": 0.5354, + "step": 18296 + }, + { + "epoch": 4.85729457055622, + "grad_norm": 0.48754374634206304, + "learning_rate": 4.3716979308733585e-07, + "loss": 0.547, + "step": 18297 + }, + { + "epoch": 4.857560069029603, + "grad_norm": 0.49183816952895903, + "learning_rate": 4.3697257692603746e-07, + "loss": 0.5823, + "step": 18298 + }, + { + "epoch": 4.857825567502987, + "grad_norm": 0.4935093666861926, + "learning_rate": 4.3677540099882396e-07, + "loss": 0.574, + "step": 18299 + }, + { + "epoch": 4.858091065976371, + "grad_norm": 0.4753492178349335, + "learning_rate": 4.3657826530954265e-07, + "loss": 0.5241, + "step": 18300 + }, + { + "epoch": 4.858356564449754, + "grad_norm": 0.4762913084357427, + "learning_rate": 4.363811698620374e-07, + "loss": 0.508, + "step": 18301 + }, + { + "epoch": 4.858622062923138, + "grad_norm": 0.48661581907854756, + "learning_rate": 4.361841146601517e-07, + "loss": 0.5338, + "step": 18302 + }, + { + "epoch": 4.858887561396522, + "grad_norm": 0.49258432449991685, + "learning_rate": 4.359870997077284e-07, + "loss": 0.5125, + "step": 18303 + }, + { + "epoch": 4.8591530598699055, + "grad_norm": 0.47835515930088757, + "learning_rate": 4.3579012500861077e-07, + "loss": 0.516, + "step": 18304 + }, + { + "epoch": 4.85941855834329, + "grad_norm": 0.46401972363365607, + "learning_rate": 4.3559319056663913e-07, + "loss": 0.5002, + "step": 18305 + }, + { + "epoch": 4.859684056816674, + "grad_norm": 0.4660094628200897, + "learning_rate": 4.3539629638565564e-07, + "loss": 0.4862, + "step": 18306 + }, + { + "epoch": 4.859949555290057, + "grad_norm": 0.4744740975242065, + "learning_rate": 4.3519944246949916e-07, + "loss": 0.5436, + "step": 18307 + }, + { + "epoch": 4.860215053763441, + "grad_norm": 0.49298669598264994, + "learning_rate": 4.350026288220083e-07, + "loss": 0.5731, + "step": 18308 + }, + { + "epoch": 4.860480552236824, + "grad_norm": 0.4859497351426526, + "learning_rate": 4.3480585544702303e-07, + "loss": 0.5504, + "step": 18309 + }, + { + "epoch": 4.860746050710208, + "grad_norm": 0.48817212132266324, + "learning_rate": 4.346091223483795e-07, + "loss": 0.5367, + "step": 18310 + }, + { + "epoch": 4.861011549183592, + "grad_norm": 0.4874449910658859, + "learning_rate": 4.344124295299154e-07, + "loss": 0.5873, + "step": 18311 + }, + { + "epoch": 4.861277047656976, + "grad_norm": 0.46369317067882787, + "learning_rate": 4.3421577699546664e-07, + "loss": 0.4988, + "step": 18312 + }, + { + "epoch": 4.86154254613036, + "grad_norm": 0.47484947910381026, + "learning_rate": 4.340191647488673e-07, + "loss": 0.5415, + "step": 18313 + }, + { + "epoch": 4.861808044603744, + "grad_norm": 0.48757078264101217, + "learning_rate": 4.3382259279395324e-07, + "loss": 0.5462, + "step": 18314 + }, + { + "epoch": 4.862073543077127, + "grad_norm": 0.46874855865464987, + "learning_rate": 4.336260611345569e-07, + "loss": 0.538, + "step": 18315 + }, + { + "epoch": 4.862339041550511, + "grad_norm": 0.48526604921869676, + "learning_rate": 4.334295697745125e-07, + "loss": 0.5396, + "step": 18316 + }, + { + "epoch": 4.862604540023895, + "grad_norm": 0.47850374365919873, + "learning_rate": 4.332331187176506e-07, + "loss": 0.5512, + "step": 18317 + }, + { + "epoch": 4.862870038497278, + "grad_norm": 0.483379625297864, + "learning_rate": 4.3303670796780437e-07, + "loss": 0.5556, + "step": 18318 + }, + { + "epoch": 4.863135536970662, + "grad_norm": 0.4948248268211596, + "learning_rate": 4.3284033752880204e-07, + "loss": 0.5168, + "step": 18319 + }, + { + "epoch": 4.8634010354440464, + "grad_norm": 0.4716475426914833, + "learning_rate": 4.3264400740447493e-07, + "loss": 0.556, + "step": 18320 + }, + { + "epoch": 4.86366653391743, + "grad_norm": 0.48275748309020167, + "learning_rate": 4.3244771759865077e-07, + "loss": 0.5268, + "step": 18321 + }, + { + "epoch": 4.863932032390814, + "grad_norm": 0.49213430238850403, + "learning_rate": 4.3225146811515905e-07, + "loss": 0.5434, + "step": 18322 + }, + { + "epoch": 4.864197530864198, + "grad_norm": 0.4845106807170754, + "learning_rate": 4.3205525895782583e-07, + "loss": 0.5464, + "step": 18323 + }, + { + "epoch": 4.864463029337581, + "grad_norm": 0.493687376700083, + "learning_rate": 4.3185909013047895e-07, + "loss": 0.5344, + "step": 18324 + }, + { + "epoch": 4.864728527810965, + "grad_norm": 0.48835114409912345, + "learning_rate": 4.3166296163694335e-07, + "loss": 0.5487, + "step": 18325 + }, + { + "epoch": 4.864994026284349, + "grad_norm": 0.4855202072451562, + "learning_rate": 4.3146687348104343e-07, + "loss": 0.5389, + "step": 18326 + }, + { + "epoch": 4.865259524757732, + "grad_norm": 0.4713399823454973, + "learning_rate": 4.3127082566660513e-07, + "loss": 0.4949, + "step": 18327 + }, + { + "epoch": 4.8655250232311165, + "grad_norm": 0.47773640536140555, + "learning_rate": 4.3107481819745e-07, + "loss": 0.5354, + "step": 18328 + }, + { + "epoch": 4.8657905217045005, + "grad_norm": 0.4859872363105243, + "learning_rate": 4.30878851077402e-07, + "loss": 0.5689, + "step": 18329 + }, + { + "epoch": 4.866056020177884, + "grad_norm": 0.48302689644037183, + "learning_rate": 4.306829243102828e-07, + "loss": 0.5221, + "step": 18330 + }, + { + "epoch": 4.866321518651268, + "grad_norm": 0.46060417865132963, + "learning_rate": 4.3048703789991293e-07, + "loss": 0.5086, + "step": 18331 + }, + { + "epoch": 4.866587017124652, + "grad_norm": 0.48343277839427656, + "learning_rate": 4.302911918501121e-07, + "loss": 0.518, + "step": 18332 + }, + { + "epoch": 4.866852515598035, + "grad_norm": 0.48172000263431325, + "learning_rate": 4.3009538616470086e-07, + "loss": 0.5344, + "step": 18333 + }, + { + "epoch": 4.867118014071419, + "grad_norm": 0.46326566281550374, + "learning_rate": 4.2989962084749813e-07, + "loss": 0.5124, + "step": 18334 + }, + { + "epoch": 4.867383512544803, + "grad_norm": 0.47931751993884397, + "learning_rate": 4.2970389590232115e-07, + "loss": 0.5275, + "step": 18335 + }, + { + "epoch": 4.8676490110181865, + "grad_norm": 0.4949485801208798, + "learning_rate": 4.295082113329871e-07, + "loss": 0.5216, + "step": 18336 + }, + { + "epoch": 4.867914509491571, + "grad_norm": 0.4811969202303654, + "learning_rate": 4.293125671433118e-07, + "loss": 0.5298, + "step": 18337 + }, + { + "epoch": 4.868180007964954, + "grad_norm": 0.4790510325048018, + "learning_rate": 4.291169633371123e-07, + "loss": 0.5409, + "step": 18338 + }, + { + "epoch": 4.868445506438338, + "grad_norm": 0.4982732152367065, + "learning_rate": 4.2892139991820155e-07, + "loss": 0.5286, + "step": 18339 + }, + { + "epoch": 4.868711004911722, + "grad_norm": 0.4833922763599852, + "learning_rate": 4.2872587689039486e-07, + "loss": 0.5614, + "step": 18340 + }, + { + "epoch": 4.868976503385106, + "grad_norm": 0.4864055240320268, + "learning_rate": 4.2853039425750504e-07, + "loss": 0.5351, + "step": 18341 + }, + { + "epoch": 4.869242001858489, + "grad_norm": 0.48218077225771605, + "learning_rate": 4.2833495202334374e-07, + "loss": 0.5249, + "step": 18342 + }, + { + "epoch": 4.869507500331873, + "grad_norm": 0.4751794345863071, + "learning_rate": 4.2813955019172377e-07, + "loss": 0.5584, + "step": 18343 + }, + { + "epoch": 4.8697729988052565, + "grad_norm": 0.4813275314226737, + "learning_rate": 4.279441887664548e-07, + "loss": 0.5306, + "step": 18344 + }, + { + "epoch": 4.870038497278641, + "grad_norm": 0.4756377729291008, + "learning_rate": 4.2774886775134805e-07, + "loss": 0.5034, + "step": 18345 + }, + { + "epoch": 4.870303995752025, + "grad_norm": 0.49312856886810574, + "learning_rate": 4.2755358715021153e-07, + "loss": 0.5422, + "step": 18346 + }, + { + "epoch": 4.870569494225408, + "grad_norm": 0.4908499628146212, + "learning_rate": 4.2735834696685546e-07, + "loss": 0.5613, + "step": 18347 + }, + { + "epoch": 4.870834992698792, + "grad_norm": 0.47977338055579594, + "learning_rate": 4.271631472050852e-07, + "loss": 0.4989, + "step": 18348 + }, + { + "epoch": 4.871100491172176, + "grad_norm": 0.49012474114926224, + "learning_rate": 4.269679878687091e-07, + "loss": 0.5464, + "step": 18349 + }, + { + "epoch": 4.871365989645559, + "grad_norm": 0.4856052040788197, + "learning_rate": 4.2677286896153246e-07, + "loss": 0.5466, + "step": 18350 + }, + { + "epoch": 4.871631488118943, + "grad_norm": 0.48966160842269185, + "learning_rate": 4.2657779048736166e-07, + "loss": 0.5371, + "step": 18351 + }, + { + "epoch": 4.871896986592327, + "grad_norm": 0.4980364903622559, + "learning_rate": 4.2638275245e-07, + "loss": 0.5495, + "step": 18352 + }, + { + "epoch": 4.872162485065711, + "grad_norm": 0.5163094063901065, + "learning_rate": 4.2618775485325227e-07, + "loss": 0.5562, + "step": 18353 + }, + { + "epoch": 4.872427983539095, + "grad_norm": 0.5009413326110693, + "learning_rate": 4.2599279770092093e-07, + "loss": 0.5114, + "step": 18354 + }, + { + "epoch": 4.872693482012479, + "grad_norm": 0.5107163430498799, + "learning_rate": 4.257978809968075e-07, + "loss": 0.5381, + "step": 18355 + }, + { + "epoch": 4.872958980485862, + "grad_norm": 0.4860251587138234, + "learning_rate": 4.256030047447143e-07, + "loss": 0.5421, + "step": 18356 + }, + { + "epoch": 4.873224478959246, + "grad_norm": 0.47779130887325744, + "learning_rate": 4.25408168948441e-07, + "loss": 0.533, + "step": 18357 + }, + { + "epoch": 4.87348997743263, + "grad_norm": 0.48679416287399835, + "learning_rate": 4.252133736117886e-07, + "loss": 0.5419, + "step": 18358 + }, + { + "epoch": 4.873755475906013, + "grad_norm": 0.4708893080108311, + "learning_rate": 4.2501861873855523e-07, + "loss": 0.5016, + "step": 18359 + }, + { + "epoch": 4.874020974379397, + "grad_norm": 0.4721462520961156, + "learning_rate": 4.248239043325389e-07, + "loss": 0.5457, + "step": 18360 + }, + { + "epoch": 4.874286472852781, + "grad_norm": 0.48281378544516595, + "learning_rate": 4.2462923039753773e-07, + "loss": 0.5179, + "step": 18361 + }, + { + "epoch": 4.874551971326165, + "grad_norm": 0.4796895380351003, + "learning_rate": 4.244345969373473e-07, + "loss": 0.539, + "step": 18362 + }, + { + "epoch": 4.874817469799549, + "grad_norm": 0.4807227578314435, + "learning_rate": 4.242400039557648e-07, + "loss": 0.5027, + "step": 18363 + }, + { + "epoch": 4.875082968272933, + "grad_norm": 0.4897859449715257, + "learning_rate": 4.240454514565845e-07, + "loss": 0.5629, + "step": 18364 + }, + { + "epoch": 4.875348466746316, + "grad_norm": 0.48739320303971895, + "learning_rate": 4.238509394436008e-07, + "loss": 0.5615, + "step": 18365 + }, + { + "epoch": 4.8756139652197, + "grad_norm": 0.4912821025482378, + "learning_rate": 4.236564679206062e-07, + "loss": 0.5437, + "step": 18366 + }, + { + "epoch": 4.875879463693083, + "grad_norm": 0.4673104812641789, + "learning_rate": 4.2346203689139496e-07, + "loss": 0.5158, + "step": 18367 + }, + { + "epoch": 4.876144962166467, + "grad_norm": 0.4937501096770388, + "learning_rate": 4.232676463597579e-07, + "loss": 0.5203, + "step": 18368 + }, + { + "epoch": 4.8764104606398515, + "grad_norm": 0.4801489823516729, + "learning_rate": 4.2307329632948674e-07, + "loss": 0.5479, + "step": 18369 + }, + { + "epoch": 4.876675959113236, + "grad_norm": 0.48233570381407764, + "learning_rate": 4.228789868043717e-07, + "loss": 0.5398, + "step": 18370 + }, + { + "epoch": 4.876941457586619, + "grad_norm": 0.4747119792356855, + "learning_rate": 4.226847177882015e-07, + "loss": 0.5475, + "step": 18371 + }, + { + "epoch": 4.877206956060003, + "grad_norm": 0.4732960639484342, + "learning_rate": 4.224904892847659e-07, + "loss": 0.5262, + "step": 18372 + }, + { + "epoch": 4.877472454533386, + "grad_norm": 0.5022349703582525, + "learning_rate": 4.2229630129785177e-07, + "loss": 0.5487, + "step": 18373 + }, + { + "epoch": 4.87773795300677, + "grad_norm": 0.4858987405689483, + "learning_rate": 4.221021538312475e-07, + "loss": 0.5502, + "step": 18374 + }, + { + "epoch": 4.878003451480154, + "grad_norm": 0.48850277880214665, + "learning_rate": 4.219080468887382e-07, + "loss": 0.5874, + "step": 18375 + }, + { + "epoch": 4.878268949953537, + "grad_norm": 0.4819318916630262, + "learning_rate": 4.217139804741113e-07, + "loss": 0.5141, + "step": 18376 + }, + { + "epoch": 4.8785344484269215, + "grad_norm": 0.48319873590506474, + "learning_rate": 4.2151995459114914e-07, + "loss": 0.5316, + "step": 18377 + }, + { + "epoch": 4.878799946900306, + "grad_norm": 0.4751816904790201, + "learning_rate": 4.2132596924363666e-07, + "loss": 0.5546, + "step": 18378 + }, + { + "epoch": 4.879065445373689, + "grad_norm": 0.4911216679462683, + "learning_rate": 4.211320244353581e-07, + "loss": 0.5476, + "step": 18379 + }, + { + "epoch": 4.879330943847073, + "grad_norm": 0.49113664844913535, + "learning_rate": 4.2093812017009435e-07, + "loss": 0.5606, + "step": 18380 + }, + { + "epoch": 4.879596442320457, + "grad_norm": 0.508419431758893, + "learning_rate": 4.207442564516287e-07, + "loss": 0.5364, + "step": 18381 + }, + { + "epoch": 4.87986194079384, + "grad_norm": 0.4990457146422748, + "learning_rate": 4.2055043328373986e-07, + "loss": 0.5978, + "step": 18382 + }, + { + "epoch": 4.880127439267224, + "grad_norm": 0.4776995102640195, + "learning_rate": 4.203566506702095e-07, + "loss": 0.5442, + "step": 18383 + }, + { + "epoch": 4.880392937740608, + "grad_norm": 0.4989231219101574, + "learning_rate": 4.2016290861481594e-07, + "loss": 0.5463, + "step": 18384 + }, + { + "epoch": 4.8806584362139915, + "grad_norm": 0.4929326685793675, + "learning_rate": 4.199692071213382e-07, + "loss": 0.5211, + "step": 18385 + }, + { + "epoch": 4.880923934687376, + "grad_norm": 0.4733748801751327, + "learning_rate": 4.197755461935535e-07, + "loss": 0.5181, + "step": 18386 + }, + { + "epoch": 4.88118943316076, + "grad_norm": 0.47189202901291055, + "learning_rate": 4.1958192583523933e-07, + "loss": 0.5096, + "step": 18387 + }, + { + "epoch": 4.881454931634143, + "grad_norm": 0.4701019391446451, + "learning_rate": 4.1938834605017133e-07, + "loss": 0.5012, + "step": 18388 + }, + { + "epoch": 4.881720430107527, + "grad_norm": 0.48673683063968365, + "learning_rate": 4.191948068421239e-07, + "loss": 0.5746, + "step": 18389 + }, + { + "epoch": 4.88198592858091, + "grad_norm": 0.4764424312405619, + "learning_rate": 4.1900130821487364e-07, + "loss": 0.541, + "step": 18390 + }, + { + "epoch": 4.882251427054294, + "grad_norm": 0.49890411321842876, + "learning_rate": 4.1880785017219207e-07, + "loss": 0.5323, + "step": 18391 + }, + { + "epoch": 4.882516925527678, + "grad_norm": 0.49590494527481394, + "learning_rate": 4.1861443271785375e-07, + "loss": 0.5566, + "step": 18392 + }, + { + "epoch": 4.882782424001062, + "grad_norm": 0.4654112185478426, + "learning_rate": 4.1842105585562983e-07, + "loss": 0.5113, + "step": 18393 + }, + { + "epoch": 4.883047922474446, + "grad_norm": 0.5148089946837342, + "learning_rate": 4.1822771958929225e-07, + "loss": 0.5435, + "step": 18394 + }, + { + "epoch": 4.88331342094783, + "grad_norm": 0.4998546571759412, + "learning_rate": 4.1803442392261057e-07, + "loss": 0.5344, + "step": 18395 + }, + { + "epoch": 4.883578919421213, + "grad_norm": 0.4850552841319684, + "learning_rate": 4.178411688593556e-07, + "loss": 0.5111, + "step": 18396 + }, + { + "epoch": 4.883844417894597, + "grad_norm": 0.48460109747212365, + "learning_rate": 4.176479544032952e-07, + "loss": 0.5341, + "step": 18397 + }, + { + "epoch": 4.884109916367981, + "grad_norm": 0.49271501971659776, + "learning_rate": 4.1745478055819884e-07, + "loss": 0.5114, + "step": 18398 + }, + { + "epoch": 4.884375414841364, + "grad_norm": 0.4954059306983671, + "learning_rate": 4.172616473278332e-07, + "loss": 0.5178, + "step": 18399 + }, + { + "epoch": 4.884640913314748, + "grad_norm": 0.48306772868386066, + "learning_rate": 4.1706855471596446e-07, + "loss": 0.5298, + "step": 18400 + }, + { + "epoch": 4.8849064117881325, + "grad_norm": 0.4786021070958391, + "learning_rate": 4.168755027263591e-07, + "loss": 0.5519, + "step": 18401 + }, + { + "epoch": 4.885171910261516, + "grad_norm": 0.48412201472714805, + "learning_rate": 4.166824913627815e-07, + "loss": 0.5187, + "step": 18402 + }, + { + "epoch": 4.8854374087349, + "grad_norm": 0.4843226980062491, + "learning_rate": 4.164895206289965e-07, + "loss": 0.5294, + "step": 18403 + }, + { + "epoch": 4.885702907208284, + "grad_norm": 0.48542181014244334, + "learning_rate": 4.162965905287672e-07, + "loss": 0.5517, + "step": 18404 + }, + { + "epoch": 4.885968405681667, + "grad_norm": 0.48028580718071917, + "learning_rate": 4.1610370106585586e-07, + "loss": 0.5372, + "step": 18405 + }, + { + "epoch": 4.886233904155051, + "grad_norm": 0.49130598904825407, + "learning_rate": 4.15910852244025e-07, + "loss": 0.5485, + "step": 18406 + }, + { + "epoch": 4.886499402628435, + "grad_norm": 0.4888697734491091, + "learning_rate": 4.157180440670344e-07, + "loss": 0.5954, + "step": 18407 + }, + { + "epoch": 4.886764901101818, + "grad_norm": 0.4846521356232636, + "learning_rate": 4.155252765386461e-07, + "loss": 0.5953, + "step": 18408 + }, + { + "epoch": 4.8870303995752025, + "grad_norm": 0.4911611927125826, + "learning_rate": 4.1533254966261787e-07, + "loss": 0.5706, + "step": 18409 + }, + { + "epoch": 4.8872958980485866, + "grad_norm": 0.4879318565105792, + "learning_rate": 4.151398634427103e-07, + "loss": 0.5224, + "step": 18410 + }, + { + "epoch": 4.88756139652197, + "grad_norm": 0.4725602566725544, + "learning_rate": 4.1494721788267854e-07, + "loss": 0.5417, + "step": 18411 + }, + { + "epoch": 4.887826894995354, + "grad_norm": 0.48994874100731417, + "learning_rate": 4.14754612986282e-07, + "loss": 0.5515, + "step": 18412 + }, + { + "epoch": 4.888092393468738, + "grad_norm": 0.49380180248989286, + "learning_rate": 4.1456204875727545e-07, + "loss": 0.5512, + "step": 18413 + }, + { + "epoch": 4.888357891942121, + "grad_norm": 0.47075381119159593, + "learning_rate": 4.1436952519941564e-07, + "loss": 0.5211, + "step": 18414 + }, + { + "epoch": 4.888623390415505, + "grad_norm": 0.4963607221965845, + "learning_rate": 4.1417704231645594e-07, + "loss": 0.5316, + "step": 18415 + }, + { + "epoch": 4.888888888888889, + "grad_norm": 0.48921921958476, + "learning_rate": 4.139846001121514e-07, + "loss": 0.5414, + "step": 18416 + }, + { + "epoch": 4.8891543873622725, + "grad_norm": 0.48685124256258455, + "learning_rate": 4.137921985902546e-07, + "loss": 0.5224, + "step": 18417 + }, + { + "epoch": 4.889419885835657, + "grad_norm": 0.4846439398388234, + "learning_rate": 4.1359983775451724e-07, + "loss": 0.5429, + "step": 18418 + }, + { + "epoch": 4.88968538430904, + "grad_norm": 0.47937026845934433, + "learning_rate": 4.1340751760869216e-07, + "loss": 0.548, + "step": 18419 + }, + { + "epoch": 4.889950882782424, + "grad_norm": 0.471580656178394, + "learning_rate": 4.132152381565288e-07, + "loss": 0.5351, + "step": 18420 + }, + { + "epoch": 4.890216381255808, + "grad_norm": 0.49544373258000335, + "learning_rate": 4.1302299940177785e-07, + "loss": 0.553, + "step": 18421 + }, + { + "epoch": 4.890481879729192, + "grad_norm": 0.4737998174151155, + "learning_rate": 4.1283080134818853e-07, + "loss": 0.5274, + "step": 18422 + }, + { + "epoch": 4.890747378202575, + "grad_norm": 0.4867390890113528, + "learning_rate": 4.126386439995089e-07, + "loss": 0.5467, + "step": 18423 + }, + { + "epoch": 4.891012876675959, + "grad_norm": 0.4762126250908237, + "learning_rate": 4.124465273594858e-07, + "loss": 0.5484, + "step": 18424 + }, + { + "epoch": 4.8912783751493425, + "grad_norm": 0.4718947677489696, + "learning_rate": 4.122544514318666e-07, + "loss": 0.5497, + "step": 18425 + }, + { + "epoch": 4.891543873622727, + "grad_norm": 0.4780129714432393, + "learning_rate": 4.1206241622039763e-07, + "loss": 0.5646, + "step": 18426 + }, + { + "epoch": 4.891809372096111, + "grad_norm": 0.482147985436244, + "learning_rate": 4.1187042172882414e-07, + "loss": 0.529, + "step": 18427 + }, + { + "epoch": 4.892074870569494, + "grad_norm": 0.46716268000097677, + "learning_rate": 4.1167846796088983e-07, + "loss": 0.5335, + "step": 18428 + }, + { + "epoch": 4.892340369042878, + "grad_norm": 0.49129417012713433, + "learning_rate": 4.1148655492033786e-07, + "loss": 0.5309, + "step": 18429 + }, + { + "epoch": 4.892605867516262, + "grad_norm": 0.47774999971490945, + "learning_rate": 4.112946826109121e-07, + "loss": 0.5484, + "step": 18430 + }, + { + "epoch": 4.892871365989645, + "grad_norm": 0.4757632842027513, + "learning_rate": 4.111028510363538e-07, + "loss": 0.513, + "step": 18431 + }, + { + "epoch": 4.893136864463029, + "grad_norm": 0.4933324270655687, + "learning_rate": 4.109110602004046e-07, + "loss": 0.521, + "step": 18432 + }, + { + "epoch": 4.893402362936413, + "grad_norm": 0.4814300463409591, + "learning_rate": 4.107193101068049e-07, + "loss": 0.5413, + "step": 18433 + }, + { + "epoch": 4.893667861409797, + "grad_norm": 0.4834698435760068, + "learning_rate": 4.105276007592937e-07, + "loss": 0.5323, + "step": 18434 + }, + { + "epoch": 4.893933359883181, + "grad_norm": 0.4752344866048308, + "learning_rate": 4.1033593216161047e-07, + "loss": 0.5272, + "step": 18435 + }, + { + "epoch": 4.894198858356565, + "grad_norm": 0.48028625811257475, + "learning_rate": 4.101443043174924e-07, + "loss": 0.5421, + "step": 18436 + }, + { + "epoch": 4.894464356829948, + "grad_norm": 0.4834073871047219, + "learning_rate": 4.0995271723067804e-07, + "loss": 0.5546, + "step": 18437 + }, + { + "epoch": 4.894729855303332, + "grad_norm": 0.48160827285640334, + "learning_rate": 4.0976117090490234e-07, + "loss": 0.5168, + "step": 18438 + }, + { + "epoch": 4.894995353776716, + "grad_norm": 0.46428187831004203, + "learning_rate": 4.0956966534390267e-07, + "loss": 0.5585, + "step": 18439 + }, + { + "epoch": 4.895260852250099, + "grad_norm": 0.47083026619909496, + "learning_rate": 4.0937820055141184e-07, + "loss": 0.5343, + "step": 18440 + }, + { + "epoch": 4.895526350723483, + "grad_norm": 0.4608205803943905, + "learning_rate": 4.0918677653116523e-07, + "loss": 0.5073, + "step": 18441 + }, + { + "epoch": 4.8957918491968675, + "grad_norm": 0.48486301344327903, + "learning_rate": 4.089953932868951e-07, + "loss": 0.5234, + "step": 18442 + }, + { + "epoch": 4.896057347670251, + "grad_norm": 0.49201647852666075, + "learning_rate": 4.088040508223351e-07, + "loss": 0.5529, + "step": 18443 + }, + { + "epoch": 4.896322846143635, + "grad_norm": 0.4800721186511747, + "learning_rate": 4.0861274914121534e-07, + "loss": 0.5555, + "step": 18444 + }, + { + "epoch": 4.896588344617019, + "grad_norm": 0.49060033990975155, + "learning_rate": 4.0842148824726866e-07, + "loss": 0.5084, + "step": 18445 + }, + { + "epoch": 4.896853843090402, + "grad_norm": 0.4877058000778876, + "learning_rate": 4.082302681442238e-07, + "loss": 0.5614, + "step": 18446 + }, + { + "epoch": 4.897119341563786, + "grad_norm": 0.4630546679428516, + "learning_rate": 4.080390888358096e-07, + "loss": 0.5191, + "step": 18447 + }, + { + "epoch": 4.897384840037169, + "grad_norm": 0.4958745600192032, + "learning_rate": 4.078479503257557e-07, + "loss": 0.5295, + "step": 18448 + }, + { + "epoch": 4.897650338510553, + "grad_norm": 0.4982075995240193, + "learning_rate": 4.076568526177888e-07, + "loss": 0.5335, + "step": 18449 + }, + { + "epoch": 4.8979158369839375, + "grad_norm": 0.4868892885918251, + "learning_rate": 4.0746579571563667e-07, + "loss": 0.5499, + "step": 18450 + }, + { + "epoch": 4.898181335457322, + "grad_norm": 0.4857059544818654, + "learning_rate": 4.07274779623025e-07, + "loss": 0.5398, + "step": 18451 + }, + { + "epoch": 4.898446833930705, + "grad_norm": 0.4819152727112816, + "learning_rate": 4.070838043436787e-07, + "loss": 0.5596, + "step": 18452 + }, + { + "epoch": 4.898712332404089, + "grad_norm": 0.4768515191830725, + "learning_rate": 4.0689286988132263e-07, + "loss": 0.5113, + "step": 18453 + }, + { + "epoch": 4.898977830877472, + "grad_norm": 0.4714311116391779, + "learning_rate": 4.067019762396801e-07, + "loss": 0.5262, + "step": 18454 + }, + { + "epoch": 4.899243329350856, + "grad_norm": 0.4645356801345613, + "learning_rate": 4.06511123422475e-07, + "loss": 0.5425, + "step": 18455 + }, + { + "epoch": 4.89950882782424, + "grad_norm": 0.4786660574805506, + "learning_rate": 4.063203114334285e-07, + "loss": 0.5207, + "step": 18456 + }, + { + "epoch": 4.899774326297623, + "grad_norm": 0.48979638518458085, + "learning_rate": 4.0612954027626215e-07, + "loss": 0.5082, + "step": 18457 + }, + { + "epoch": 4.9000398247710075, + "grad_norm": 0.4782625960353114, + "learning_rate": 4.05938809954696e-07, + "loss": 0.5252, + "step": 18458 + }, + { + "epoch": 4.900305323244392, + "grad_norm": 0.4966983409946847, + "learning_rate": 4.057481204724509e-07, + "loss": 0.5009, + "step": 18459 + }, + { + "epoch": 4.900570821717775, + "grad_norm": 0.480135260760648, + "learning_rate": 4.055574718332442e-07, + "loss": 0.5413, + "step": 18460 + }, + { + "epoch": 4.900836320191159, + "grad_norm": 0.49228855592047943, + "learning_rate": 4.053668640407957e-07, + "loss": 0.5293, + "step": 18461 + }, + { + "epoch": 4.901101818664543, + "grad_norm": 0.4991104434773967, + "learning_rate": 4.0517629709882185e-07, + "loss": 0.5537, + "step": 18462 + }, + { + "epoch": 4.901367317137926, + "grad_norm": 0.4848441092343395, + "learning_rate": 4.049857710110383e-07, + "loss": 0.5453, + "step": 18463 + }, + { + "epoch": 4.90163281561131, + "grad_norm": 0.4802812102801752, + "learning_rate": 4.0479528578116266e-07, + "loss": 0.5517, + "step": 18464 + }, + { + "epoch": 4.901898314084694, + "grad_norm": 0.5010223849110018, + "learning_rate": 4.0460484141290807e-07, + "loss": 0.5383, + "step": 18465 + }, + { + "epoch": 4.9021638125580775, + "grad_norm": 0.49163946234859396, + "learning_rate": 4.044144379099904e-07, + "loss": 0.5478, + "step": 18466 + }, + { + "epoch": 4.902429311031462, + "grad_norm": 0.48166823573972256, + "learning_rate": 4.0422407527612116e-07, + "loss": 0.5604, + "step": 18467 + }, + { + "epoch": 4.902694809504846, + "grad_norm": 0.4866767661300027, + "learning_rate": 4.0403375351501515e-07, + "loss": 0.5114, + "step": 18468 + }, + { + "epoch": 4.902960307978229, + "grad_norm": 0.4805071361612573, + "learning_rate": 4.038434726303814e-07, + "loss": 0.5385, + "step": 18469 + }, + { + "epoch": 4.903225806451613, + "grad_norm": 0.4791933171802118, + "learning_rate": 4.036532326259324e-07, + "loss": 0.5205, + "step": 18470 + }, + { + "epoch": 4.903491304924996, + "grad_norm": 0.4955759113248641, + "learning_rate": 4.0346303350537853e-07, + "loss": 0.5555, + "step": 18471 + }, + { + "epoch": 4.90375680339838, + "grad_norm": 0.48804101614789513, + "learning_rate": 4.032728752724283e-07, + "loss": 0.5497, + "step": 18472 + }, + { + "epoch": 4.904022301871764, + "grad_norm": 0.48754085262225133, + "learning_rate": 4.030827579307911e-07, + "loss": 0.5209, + "step": 18473 + }, + { + "epoch": 4.904287800345148, + "grad_norm": 0.46968906619538753, + "learning_rate": 4.0289268148417437e-07, + "loss": 0.523, + "step": 18474 + }, + { + "epoch": 4.904553298818532, + "grad_norm": 0.47739486215422067, + "learning_rate": 4.0270264593628485e-07, + "loss": 0.5357, + "step": 18475 + }, + { + "epoch": 4.904818797291916, + "grad_norm": 0.4661677278280224, + "learning_rate": 4.025126512908284e-07, + "loss": 0.4996, + "step": 18476 + }, + { + "epoch": 4.905084295765299, + "grad_norm": 0.4821901112508252, + "learning_rate": 4.0232269755151126e-07, + "loss": 0.5124, + "step": 18477 + }, + { + "epoch": 4.905349794238683, + "grad_norm": 0.4843198988973941, + "learning_rate": 4.021327847220369e-07, + "loss": 0.5358, + "step": 18478 + }, + { + "epoch": 4.905615292712067, + "grad_norm": 0.48987834295660043, + "learning_rate": 4.019429128061103e-07, + "loss": 0.5241, + "step": 18479 + }, + { + "epoch": 4.905880791185451, + "grad_norm": 0.4866309815177567, + "learning_rate": 4.017530818074339e-07, + "loss": 0.5261, + "step": 18480 + }, + { + "epoch": 4.906146289658834, + "grad_norm": 0.49597631488660615, + "learning_rate": 4.0156329172970907e-07, + "loss": 0.5613, + "step": 18481 + }, + { + "epoch": 4.9064117881322185, + "grad_norm": 0.48317651561446956, + "learning_rate": 4.013735425766388e-07, + "loss": 0.5256, + "step": 18482 + }, + { + "epoch": 4.906677286605602, + "grad_norm": 0.47907128565238893, + "learning_rate": 4.011838343519217e-07, + "loss": 0.5168, + "step": 18483 + }, + { + "epoch": 4.906942785078986, + "grad_norm": 0.48552897716575216, + "learning_rate": 4.009941670592596e-07, + "loss": 0.508, + "step": 18484 + }, + { + "epoch": 4.90720828355237, + "grad_norm": 0.4925522713194775, + "learning_rate": 4.008045407023503e-07, + "loss": 0.5306, + "step": 18485 + }, + { + "epoch": 4.907473782025753, + "grad_norm": 0.47141868739228854, + "learning_rate": 4.006149552848923e-07, + "loss": 0.5495, + "step": 18486 + }, + { + "epoch": 4.907739280499137, + "grad_norm": 0.5010911304282613, + "learning_rate": 4.0042541081058233e-07, + "loss": 0.5548, + "step": 18487 + }, + { + "epoch": 4.908004778972521, + "grad_norm": 0.48496202925962095, + "learning_rate": 4.0023590728311803e-07, + "loss": 0.6015, + "step": 18488 + }, + { + "epoch": 4.908270277445904, + "grad_norm": 0.4783687116046605, + "learning_rate": 4.0004644470619387e-07, + "loss": 0.5148, + "step": 18489 + }, + { + "epoch": 4.9085357759192885, + "grad_norm": 0.47008191416413353, + "learning_rate": 3.9985702308350584e-07, + "loss": 0.5305, + "step": 18490 + }, + { + "epoch": 4.9088012743926726, + "grad_norm": 0.478511553942559, + "learning_rate": 3.9966764241874874e-07, + "loss": 0.5145, + "step": 18491 + }, + { + "epoch": 4.909066772866056, + "grad_norm": 0.4995702624103274, + "learning_rate": 3.994783027156143e-07, + "loss": 0.5736, + "step": 18492 + }, + { + "epoch": 4.90933227133944, + "grad_norm": 0.4781811665389976, + "learning_rate": 3.9928900397779637e-07, + "loss": 0.5309, + "step": 18493 + }, + { + "epoch": 4.909597769812824, + "grad_norm": 0.482656080505446, + "learning_rate": 3.9909974620898574e-07, + "loss": 0.5345, + "step": 18494 + }, + { + "epoch": 4.909863268286207, + "grad_norm": 0.5063366573195083, + "learning_rate": 3.989105294128745e-07, + "loss": 0.546, + "step": 18495 + }, + { + "epoch": 4.910128766759591, + "grad_norm": 0.4753574593879358, + "learning_rate": 3.9872135359315165e-07, + "loss": 0.5381, + "step": 18496 + }, + { + "epoch": 4.910394265232975, + "grad_norm": 0.5057387420964584, + "learning_rate": 3.985322187535079e-07, + "loss": 0.5764, + "step": 18497 + }, + { + "epoch": 4.9106597637063585, + "grad_norm": 0.4754794799989428, + "learning_rate": 3.9834312489763105e-07, + "loss": 0.5507, + "step": 18498 + }, + { + "epoch": 4.910925262179743, + "grad_norm": 0.47542382266471056, + "learning_rate": 3.9815407202920845e-07, + "loss": 0.515, + "step": 18499 + }, + { + "epoch": 4.911190760653126, + "grad_norm": 0.4914924354150179, + "learning_rate": 3.979650601519283e-07, + "loss": 0.5519, + "step": 18500 + }, + { + "epoch": 4.91145625912651, + "grad_norm": 0.4793312098374365, + "learning_rate": 3.977760892694757e-07, + "loss": 0.5543, + "step": 18501 + }, + { + "epoch": 4.911721757599894, + "grad_norm": 0.4996713772854791, + "learning_rate": 3.975871593855374e-07, + "loss": 0.5426, + "step": 18502 + }, + { + "epoch": 4.911987256073278, + "grad_norm": 0.4911547718273215, + "learning_rate": 3.97398270503796e-07, + "loss": 0.5713, + "step": 18503 + }, + { + "epoch": 4.912252754546661, + "grad_norm": 0.4791871555851505, + "learning_rate": 3.9720942262793714e-07, + "loss": 0.5428, + "step": 18504 + }, + { + "epoch": 4.912518253020045, + "grad_norm": 0.4776306721020352, + "learning_rate": 3.970206157616421e-07, + "loss": 0.5118, + "step": 18505 + }, + { + "epoch": 4.9127837514934285, + "grad_norm": 0.4671634164674486, + "learning_rate": 3.968318499085949e-07, + "loss": 0.5152, + "step": 18506 + }, + { + "epoch": 4.913049249966813, + "grad_norm": 0.4857562801231992, + "learning_rate": 3.966431250724756e-07, + "loss": 0.493, + "step": 18507 + }, + { + "epoch": 4.913314748440197, + "grad_norm": 0.478472167706853, + "learning_rate": 3.964544412569657e-07, + "loss": 0.5213, + "step": 18508 + }, + { + "epoch": 4.91358024691358, + "grad_norm": 0.48059785723801673, + "learning_rate": 3.962657984657445e-07, + "loss": 0.5216, + "step": 18509 + }, + { + "epoch": 4.913845745386964, + "grad_norm": 0.48665086487252285, + "learning_rate": 3.960771967024904e-07, + "loss": 0.5525, + "step": 18510 + }, + { + "epoch": 4.914111243860348, + "grad_norm": 0.7240881027538407, + "learning_rate": 3.95888635970883e-07, + "loss": 0.5419, + "step": 18511 + }, + { + "epoch": 4.914376742333731, + "grad_norm": 0.48476410048770946, + "learning_rate": 3.9570011627459826e-07, + "loss": 0.5297, + "step": 18512 + }, + { + "epoch": 4.914642240807115, + "grad_norm": 0.4824206757330293, + "learning_rate": 3.955116376173143e-07, + "loss": 0.516, + "step": 18513 + }, + { + "epoch": 4.914907739280499, + "grad_norm": 0.479489862742604, + "learning_rate": 3.953232000027057e-07, + "loss": 0.512, + "step": 18514 + }, + { + "epoch": 4.915173237753883, + "grad_norm": 0.48042652914045975, + "learning_rate": 3.9513480343444767e-07, + "loss": 0.5282, + "step": 18515 + }, + { + "epoch": 4.915438736227267, + "grad_norm": 0.476208601326996, + "learning_rate": 3.949464479162149e-07, + "loss": 0.5158, + "step": 18516 + }, + { + "epoch": 4.915704234700651, + "grad_norm": 0.4783775570938376, + "learning_rate": 3.947581334516798e-07, + "loss": 0.5315, + "step": 18517 + }, + { + "epoch": 4.915969733174034, + "grad_norm": 0.4825384710768808, + "learning_rate": 3.945698600445164e-07, + "loss": 0.5226, + "step": 18518 + }, + { + "epoch": 4.916235231647418, + "grad_norm": 0.517856000659449, + "learning_rate": 3.94381627698395e-07, + "loss": 0.5261, + "step": 18519 + }, + { + "epoch": 4.916500730120802, + "grad_norm": 0.46861481281274486, + "learning_rate": 3.941934364169886e-07, + "loss": 0.5395, + "step": 18520 + }, + { + "epoch": 4.916766228594185, + "grad_norm": 0.4792002069101839, + "learning_rate": 3.9400528620396476e-07, + "loss": 0.5536, + "step": 18521 + }, + { + "epoch": 4.917031727067569, + "grad_norm": 0.49079393897979434, + "learning_rate": 3.938171770629948e-07, + "loss": 0.525, + "step": 18522 + }, + { + "epoch": 4.9172972255409535, + "grad_norm": 0.47827088443541654, + "learning_rate": 3.936291089977462e-07, + "loss": 0.5375, + "step": 18523 + }, + { + "epoch": 4.917562724014337, + "grad_norm": 0.47453747008458924, + "learning_rate": 3.934410820118878e-07, + "loss": 0.5285, + "step": 18524 + }, + { + "epoch": 4.917828222487721, + "grad_norm": 0.495049605967072, + "learning_rate": 3.9325309610908605e-07, + "loss": 0.5319, + "step": 18525 + }, + { + "epoch": 4.918093720961105, + "grad_norm": 0.49075001426179393, + "learning_rate": 3.930651512930067e-07, + "loss": 0.5129, + "step": 18526 + }, + { + "epoch": 4.918359219434488, + "grad_norm": 0.49585670143567556, + "learning_rate": 3.928772475673159e-07, + "loss": 0.5748, + "step": 18527 + }, + { + "epoch": 4.918624717907872, + "grad_norm": 0.4867543719554076, + "learning_rate": 3.926893849356775e-07, + "loss": 0.4963, + "step": 18528 + }, + { + "epoch": 4.918890216381255, + "grad_norm": 0.4963868164907265, + "learning_rate": 3.925015634017562e-07, + "loss": 0.5401, + "step": 18529 + }, + { + "epoch": 4.919155714854639, + "grad_norm": 0.5000253852292696, + "learning_rate": 3.923137829692139e-07, + "loss": 0.5501, + "step": 18530 + }, + { + "epoch": 4.9194212133280235, + "grad_norm": 0.4868866511394403, + "learning_rate": 3.921260436417146e-07, + "loss": 0.5304, + "step": 18531 + }, + { + "epoch": 4.919686711801408, + "grad_norm": 0.47810845377949884, + "learning_rate": 3.9193834542291697e-07, + "loss": 0.5669, + "step": 18532 + }, + { + "epoch": 4.919952210274791, + "grad_norm": 0.4711032835708588, + "learning_rate": 3.91750688316484e-07, + "loss": 0.5353, + "step": 18533 + }, + { + "epoch": 4.920217708748175, + "grad_norm": 0.4676897210146569, + "learning_rate": 3.9156307232607356e-07, + "loss": 0.5191, + "step": 18534 + }, + { + "epoch": 4.920483207221558, + "grad_norm": 0.47762298468185405, + "learning_rate": 3.913754974553463e-07, + "loss": 0.5189, + "step": 18535 + }, + { + "epoch": 4.920748705694942, + "grad_norm": 0.4825479158136567, + "learning_rate": 3.911879637079591e-07, + "loss": 0.5777, + "step": 18536 + }, + { + "epoch": 4.921014204168326, + "grad_norm": 0.4930226277792407, + "learning_rate": 3.9100047108757035e-07, + "loss": 0.5164, + "step": 18537 + }, + { + "epoch": 4.921279702641709, + "grad_norm": 0.48990775286020416, + "learning_rate": 3.90813019597836e-07, + "loss": 0.5486, + "step": 18538 + }, + { + "epoch": 4.9215452011150935, + "grad_norm": 0.47574640914097976, + "learning_rate": 3.9062560924241157e-07, + "loss": 0.5236, + "step": 18539 + }, + { + "epoch": 4.921810699588478, + "grad_norm": 0.49012084744826817, + "learning_rate": 3.904382400249529e-07, + "loss": 0.4853, + "step": 18540 + }, + { + "epoch": 4.922076198061861, + "grad_norm": 0.48410955923556176, + "learning_rate": 3.9025091194911324e-07, + "loss": 0.5738, + "step": 18541 + }, + { + "epoch": 4.922341696535245, + "grad_norm": 0.4840025452121903, + "learning_rate": 3.900636250185469e-07, + "loss": 0.5732, + "step": 18542 + }, + { + "epoch": 4.922607195008629, + "grad_norm": 0.490437790818542, + "learning_rate": 3.8987637923690563e-07, + "loss": 0.5089, + "step": 18543 + }, + { + "epoch": 4.922872693482012, + "grad_norm": 0.48291252182453726, + "learning_rate": 3.89689174607841e-07, + "loss": 0.493, + "step": 18544 + }, + { + "epoch": 4.923138191955396, + "grad_norm": 0.47536743612066074, + "learning_rate": 3.8950201113500506e-07, + "loss": 0.5348, + "step": 18545 + }, + { + "epoch": 4.92340369042878, + "grad_norm": 0.4962383988934327, + "learning_rate": 3.893148888220466e-07, + "loss": 0.5904, + "step": 18546 + }, + { + "epoch": 4.9236691889021635, + "grad_norm": 0.4785118027013009, + "learning_rate": 3.8912780767261624e-07, + "loss": 0.5614, + "step": 18547 + }, + { + "epoch": 4.923934687375548, + "grad_norm": 0.4830810397040389, + "learning_rate": 3.88940767690362e-07, + "loss": 0.4981, + "step": 18548 + }, + { + "epoch": 4.924200185848932, + "grad_norm": 0.4821007701102928, + "learning_rate": 3.8875376887893167e-07, + "loss": 0.5406, + "step": 18549 + }, + { + "epoch": 4.924465684322315, + "grad_norm": 0.48888873471141336, + "learning_rate": 3.8856681124197134e-07, + "loss": 0.5424, + "step": 18550 + }, + { + "epoch": 4.924731182795699, + "grad_norm": 0.4805356512392803, + "learning_rate": 3.8837989478312826e-07, + "loss": 0.523, + "step": 18551 + }, + { + "epoch": 4.924996681269083, + "grad_norm": 0.47657229307970145, + "learning_rate": 3.881930195060471e-07, + "loss": 0.5525, + "step": 18552 + }, + { + "epoch": 4.925262179742466, + "grad_norm": 0.47415844691002984, + "learning_rate": 3.8800618541437325e-07, + "loss": 0.5011, + "step": 18553 + }, + { + "epoch": 4.92552767821585, + "grad_norm": 0.4828400353913829, + "learning_rate": 3.878193925117496e-07, + "loss": 0.5505, + "step": 18554 + }, + { + "epoch": 4.925793176689234, + "grad_norm": 0.47792941207804485, + "learning_rate": 3.8763264080181884e-07, + "loss": 0.5538, + "step": 18555 + }, + { + "epoch": 4.926058675162618, + "grad_norm": 0.48792268823134527, + "learning_rate": 3.8744593028822413e-07, + "loss": 0.5405, + "step": 18556 + }, + { + "epoch": 4.926324173636002, + "grad_norm": 0.4916759277318384, + "learning_rate": 3.8725926097460564e-07, + "loss": 0.5662, + "step": 18557 + }, + { + "epoch": 4.926589672109385, + "grad_norm": 0.48987155003560207, + "learning_rate": 3.870726328646049e-07, + "loss": 0.5165, + "step": 18558 + }, + { + "epoch": 4.926855170582769, + "grad_norm": 0.492260618175659, + "learning_rate": 3.8688604596186064e-07, + "loss": 0.587, + "step": 18559 + }, + { + "epoch": 4.927120669056153, + "grad_norm": 0.47788339944142905, + "learning_rate": 3.866995002700136e-07, + "loss": 0.5732, + "step": 18560 + }, + { + "epoch": 4.927386167529537, + "grad_norm": 0.5048354955337484, + "learning_rate": 3.865129957926991e-07, + "loss": 0.5508, + "step": 18561 + }, + { + "epoch": 4.92765166600292, + "grad_norm": 0.49001145767624876, + "learning_rate": 3.8632653253355634e-07, + "loss": 0.5358, + "step": 18562 + }, + { + "epoch": 4.9279171644763045, + "grad_norm": 0.48819152785113407, + "learning_rate": 3.8614011049622146e-07, + "loss": 0.5632, + "step": 18563 + }, + { + "epoch": 4.928182662949688, + "grad_norm": 0.49074566907092343, + "learning_rate": 3.8595372968432963e-07, + "loss": 0.5539, + "step": 18564 + }, + { + "epoch": 4.928448161423072, + "grad_norm": 0.4828841185215907, + "learning_rate": 3.8576739010151685e-07, + "loss": 0.5372, + "step": 18565 + }, + { + "epoch": 4.928713659896456, + "grad_norm": 0.47576791153936066, + "learning_rate": 3.8558109175141635e-07, + "loss": 0.5221, + "step": 18566 + }, + { + "epoch": 4.928979158369839, + "grad_norm": 0.4974036727332416, + "learning_rate": 3.853948346376618e-07, + "loss": 0.5419, + "step": 18567 + }, + { + "epoch": 4.929244656843223, + "grad_norm": 0.490460630744477, + "learning_rate": 3.8520861876388455e-07, + "loss": 0.541, + "step": 18568 + }, + { + "epoch": 4.929510155316607, + "grad_norm": 0.4822493102783241, + "learning_rate": 3.8502244413371777e-07, + "loss": 0.539, + "step": 18569 + }, + { + "epoch": 4.92977565378999, + "grad_norm": 0.4800024751663519, + "learning_rate": 3.848363107507913e-07, + "loss": 0.5533, + "step": 18570 + }, + { + "epoch": 4.9300411522633745, + "grad_norm": 0.48878815364254047, + "learning_rate": 3.846502186187359e-07, + "loss": 0.5371, + "step": 18571 + }, + { + "epoch": 4.9303066507367586, + "grad_norm": 0.4834995899126379, + "learning_rate": 3.8446416774118063e-07, + "loss": 0.5385, + "step": 18572 + }, + { + "epoch": 4.930572149210142, + "grad_norm": 0.49032832515294433, + "learning_rate": 3.8427815812175335e-07, + "loss": 0.5391, + "step": 18573 + }, + { + "epoch": 4.930837647683526, + "grad_norm": 0.482556768370718, + "learning_rate": 3.840921897640826e-07, + "loss": 0.534, + "step": 18574 + }, + { + "epoch": 4.93110314615691, + "grad_norm": 0.46460504353809134, + "learning_rate": 3.839062626717943e-07, + "loss": 0.513, + "step": 18575 + }, + { + "epoch": 4.931368644630293, + "grad_norm": 0.4790184507684636, + "learning_rate": 3.837203768485154e-07, + "loss": 0.5346, + "step": 18576 + }, + { + "epoch": 4.931634143103677, + "grad_norm": 0.4907319441185008, + "learning_rate": 3.835345322978709e-07, + "loss": 0.5443, + "step": 18577 + }, + { + "epoch": 4.931899641577061, + "grad_norm": 0.4806731454409003, + "learning_rate": 3.83348729023485e-07, + "loss": 0.5383, + "step": 18578 + }, + { + "epoch": 4.9321651400504445, + "grad_norm": 0.48863168197168405, + "learning_rate": 3.831629670289805e-07, + "loss": 0.5333, + "step": 18579 + }, + { + "epoch": 4.932430638523829, + "grad_norm": 0.47635623359569684, + "learning_rate": 3.829772463179818e-07, + "loss": 0.552, + "step": 18580 + }, + { + "epoch": 4.932696136997212, + "grad_norm": 0.4831409769858091, + "learning_rate": 3.827915668941096e-07, + "loss": 0.5185, + "step": 18581 + }, + { + "epoch": 4.932961635470596, + "grad_norm": 0.4998529380478762, + "learning_rate": 3.826059287609857e-07, + "loss": 0.538, + "step": 18582 + }, + { + "epoch": 4.93322713394398, + "grad_norm": 0.4773249131333924, + "learning_rate": 3.824203319222317e-07, + "loss": 0.5022, + "step": 18583 + }, + { + "epoch": 4.933492632417364, + "grad_norm": 0.49026793531777124, + "learning_rate": 3.822347763814646e-07, + "loss": 0.54, + "step": 18584 + }, + { + "epoch": 4.933758130890747, + "grad_norm": 0.4965747908967882, + "learning_rate": 3.8204926214230527e-07, + "loss": 0.5576, + "step": 18585 + }, + { + "epoch": 4.934023629364131, + "grad_norm": 0.475106955268532, + "learning_rate": 3.8186378920837014e-07, + "loss": 0.5404, + "step": 18586 + }, + { + "epoch": 4.9342891278375145, + "grad_norm": 0.48533335843139475, + "learning_rate": 3.8167835758327804e-07, + "loss": 0.5138, + "step": 18587 + }, + { + "epoch": 4.934554626310899, + "grad_norm": 0.47911340497081223, + "learning_rate": 3.8149296727064383e-07, + "loss": 0.5052, + "step": 18588 + }, + { + "epoch": 4.934820124784283, + "grad_norm": 0.4767558354845794, + "learning_rate": 3.8130761827408436e-07, + "loss": 0.5366, + "step": 18589 + }, + { + "epoch": 4.935085623257667, + "grad_norm": 0.4726929127109575, + "learning_rate": 3.811223105972137e-07, + "loss": 0.5295, + "step": 18590 + }, + { + "epoch": 4.93535112173105, + "grad_norm": 0.4894448944838308, + "learning_rate": 3.809370442436455e-07, + "loss": 0.5422, + "step": 18591 + }, + { + "epoch": 4.935616620204434, + "grad_norm": 0.49040063434492476, + "learning_rate": 3.8075181921699375e-07, + "loss": 0.5279, + "step": 18592 + }, + { + "epoch": 4.935882118677817, + "grad_norm": 0.4835052449953252, + "learning_rate": 3.8056663552086983e-07, + "loss": 0.5618, + "step": 18593 + }, + { + "epoch": 4.936147617151201, + "grad_norm": 0.46555451023887073, + "learning_rate": 3.8038149315888616e-07, + "loss": 0.4958, + "step": 18594 + }, + { + "epoch": 4.936413115624585, + "grad_norm": 0.4740254715261996, + "learning_rate": 3.8019639213465327e-07, + "loss": 0.5473, + "step": 18595 + }, + { + "epoch": 4.936678614097969, + "grad_norm": 0.46919658406170817, + "learning_rate": 3.8001133245178094e-07, + "loss": 0.5321, + "step": 18596 + }, + { + "epoch": 4.936944112571353, + "grad_norm": 0.4899852953691373, + "learning_rate": 3.7982631411387776e-07, + "loss": 0.5583, + "step": 18597 + }, + { + "epoch": 4.937209611044737, + "grad_norm": 0.4749674759234017, + "learning_rate": 3.7964133712455303e-07, + "loss": 0.4933, + "step": 18598 + }, + { + "epoch": 4.93747510951812, + "grad_norm": 0.47994563383834343, + "learning_rate": 3.7945640148741304e-07, + "loss": 0.5268, + "step": 18599 + }, + { + "epoch": 4.937740607991504, + "grad_norm": 0.49265929455979257, + "learning_rate": 3.7927150720606596e-07, + "loss": 0.5311, + "step": 18600 + }, + { + "epoch": 4.938006106464888, + "grad_norm": 0.4852296548860318, + "learning_rate": 3.790866542841168e-07, + "loss": 0.5131, + "step": 18601 + }, + { + "epoch": 4.938271604938271, + "grad_norm": 0.4742500964562554, + "learning_rate": 3.7890184272517005e-07, + "loss": 0.5448, + "step": 18602 + }, + { + "epoch": 4.938537103411655, + "grad_norm": 0.4582623340860457, + "learning_rate": 3.7871707253283154e-07, + "loss": 0.5277, + "step": 18603 + }, + { + "epoch": 4.9388026018850395, + "grad_norm": 0.48440467348833105, + "learning_rate": 3.785323437107033e-07, + "loss": 0.534, + "step": 18604 + }, + { + "epoch": 4.939068100358423, + "grad_norm": 0.48861207440362897, + "learning_rate": 3.7834765626238917e-07, + "loss": 0.5338, + "step": 18605 + }, + { + "epoch": 4.939333598831807, + "grad_norm": 0.4961318281342257, + "learning_rate": 3.7816301019149044e-07, + "loss": 0.5413, + "step": 18606 + }, + { + "epoch": 4.939599097305191, + "grad_norm": 0.4819360356898252, + "learning_rate": 3.7797840550160755e-07, + "loss": 0.5234, + "step": 18607 + }, + { + "epoch": 4.939864595778574, + "grad_norm": 0.4788953899918431, + "learning_rate": 3.7779384219634204e-07, + "loss": 0.4852, + "step": 18608 + }, + { + "epoch": 4.940130094251958, + "grad_norm": 0.4826652755359662, + "learning_rate": 3.776093202792919e-07, + "loss": 0.4755, + "step": 18609 + }, + { + "epoch": 4.940395592725341, + "grad_norm": 0.4876500860098827, + "learning_rate": 3.7742483975405724e-07, + "loss": 0.5183, + "step": 18610 + }, + { + "epoch": 4.940661091198725, + "grad_norm": 0.49113042006583135, + "learning_rate": 3.772404006242347e-07, + "loss": 0.5059, + "step": 18611 + }, + { + "epoch": 4.9409265896721095, + "grad_norm": 0.4856748783687735, + "learning_rate": 3.770560028934228e-07, + "loss": 0.5551, + "step": 18612 + }, + { + "epoch": 4.941192088145494, + "grad_norm": 0.4821366001985778, + "learning_rate": 3.768716465652153e-07, + "loss": 0.5774, + "step": 18613 + }, + { + "epoch": 4.941457586618877, + "grad_norm": 0.4722988821796132, + "learning_rate": 3.7668733164320985e-07, + "loss": 0.5304, + "step": 18614 + }, + { + "epoch": 4.941723085092261, + "grad_norm": 0.47421245594591194, + "learning_rate": 3.765030581309997e-07, + "loss": 0.4971, + "step": 18615 + }, + { + "epoch": 4.941988583565644, + "grad_norm": 0.46324693720813953, + "learning_rate": 3.763188260321796e-07, + "loss": 0.528, + "step": 18616 + }, + { + "epoch": 4.942254082039028, + "grad_norm": 0.4819911822591266, + "learning_rate": 3.7613463535034146e-07, + "loss": 0.5153, + "step": 18617 + }, + { + "epoch": 4.942519580512412, + "grad_norm": 0.4673841807184212, + "learning_rate": 3.7595048608907863e-07, + "loss": 0.5112, + "step": 18618 + }, + { + "epoch": 4.942785078985795, + "grad_norm": 0.47966255228515575, + "learning_rate": 3.7576637825198185e-07, + "loss": 0.5409, + "step": 18619 + }, + { + "epoch": 4.9430505774591795, + "grad_norm": 0.47642925619490206, + "learning_rate": 3.7558231184264134e-07, + "loss": 0.5504, + "step": 18620 + }, + { + "epoch": 4.943316075932564, + "grad_norm": 0.4889325007237682, + "learning_rate": 3.753982868646475e-07, + "loss": 0.5407, + "step": 18621 + }, + { + "epoch": 4.943581574405947, + "grad_norm": 0.4861365265331885, + "learning_rate": 3.7521430332158853e-07, + "loss": 0.555, + "step": 18622 + }, + { + "epoch": 4.943847072879331, + "grad_norm": 0.48502231027701137, + "learning_rate": 3.750303612170542e-07, + "loss": 0.5213, + "step": 18623 + }, + { + "epoch": 4.944112571352715, + "grad_norm": 0.49282846907661104, + "learning_rate": 3.748464605546295e-07, + "loss": 0.5166, + "step": 18624 + }, + { + "epoch": 4.944378069826098, + "grad_norm": 0.47715250605183523, + "learning_rate": 3.746626013379026e-07, + "loss": 0.5516, + "step": 18625 + }, + { + "epoch": 4.944643568299482, + "grad_norm": 0.4771917755850158, + "learning_rate": 3.744787835704583e-07, + "loss": 0.5525, + "step": 18626 + }, + { + "epoch": 4.944909066772866, + "grad_norm": 0.47555438851240833, + "learning_rate": 3.7429500725588183e-07, + "loss": 0.5251, + "step": 18627 + }, + { + "epoch": 4.9451745652462495, + "grad_norm": 0.4779196693292834, + "learning_rate": 3.7411127239775774e-07, + "loss": 0.5109, + "step": 18628 + }, + { + "epoch": 4.945440063719634, + "grad_norm": 0.49224439961885713, + "learning_rate": 3.739275789996691e-07, + "loss": 0.5273, + "step": 18629 + }, + { + "epoch": 4.945705562193018, + "grad_norm": 0.4852469797287295, + "learning_rate": 3.7374392706519826e-07, + "loss": 0.4969, + "step": 18630 + }, + { + "epoch": 4.945971060666401, + "grad_norm": 0.4809848470611654, + "learning_rate": 3.7356031659792596e-07, + "loss": 0.5173, + "step": 18631 + }, + { + "epoch": 4.946236559139785, + "grad_norm": 0.48931889078185314, + "learning_rate": 3.7337674760143466e-07, + "loss": 0.5271, + "step": 18632 + }, + { + "epoch": 4.946502057613169, + "grad_norm": 0.48401441867161266, + "learning_rate": 3.7319322007930314e-07, + "loss": 0.5388, + "step": 18633 + }, + { + "epoch": 4.946767556086552, + "grad_norm": 0.4849443653856644, + "learning_rate": 3.730097340351116e-07, + "loss": 0.5544, + "step": 18634 + }, + { + "epoch": 4.947033054559936, + "grad_norm": 0.4867101828063559, + "learning_rate": 3.72826289472438e-07, + "loss": 0.533, + "step": 18635 + }, + { + "epoch": 4.9472985530333204, + "grad_norm": 0.49454880277379515, + "learning_rate": 3.7264288639485946e-07, + "loss": 0.543, + "step": 18636 + }, + { + "epoch": 4.947564051506704, + "grad_norm": 0.4758699611430989, + "learning_rate": 3.72459524805954e-07, + "loss": 0.5226, + "step": 18637 + }, + { + "epoch": 4.947829549980088, + "grad_norm": 0.4687696815904308, + "learning_rate": 3.722762047092962e-07, + "loss": 0.5188, + "step": 18638 + }, + { + "epoch": 4.948095048453471, + "grad_norm": 0.48853154596075177, + "learning_rate": 3.7209292610846243e-07, + "loss": 0.5516, + "step": 18639 + }, + { + "epoch": 4.948360546926855, + "grad_norm": 0.5109172374636374, + "learning_rate": 3.719096890070262e-07, + "loss": 0.5386, + "step": 18640 + }, + { + "epoch": 4.948626045400239, + "grad_norm": 0.4851181904669391, + "learning_rate": 3.7172649340856263e-07, + "loss": 0.527, + "step": 18641 + }, + { + "epoch": 4.948891543873623, + "grad_norm": 0.4737321112101461, + "learning_rate": 3.7154333931664235e-07, + "loss": 0.5092, + "step": 18642 + }, + { + "epoch": 4.949157042347006, + "grad_norm": 0.4798326051425066, + "learning_rate": 3.7136022673483876e-07, + "loss": 0.5163, + "step": 18643 + }, + { + "epoch": 4.9494225408203905, + "grad_norm": 0.4850624259823333, + "learning_rate": 3.711771556667218e-07, + "loss": 0.5029, + "step": 18644 + }, + { + "epoch": 4.949688039293774, + "grad_norm": 0.4783642148622547, + "learning_rate": 3.7099412611586336e-07, + "loss": 0.5191, + "step": 18645 + }, + { + "epoch": 4.949953537767158, + "grad_norm": 0.49106768459088873, + "learning_rate": 3.7081113808583204e-07, + "loss": 0.5413, + "step": 18646 + }, + { + "epoch": 4.950219036240542, + "grad_norm": 0.49034099586984053, + "learning_rate": 3.7062819158019627e-07, + "loss": 0.5311, + "step": 18647 + }, + { + "epoch": 4.950484534713925, + "grad_norm": 0.4915936222637951, + "learning_rate": 3.704452866025246e-07, + "loss": 0.5321, + "step": 18648 + }, + { + "epoch": 4.950750033187309, + "grad_norm": 0.4774199709177357, + "learning_rate": 3.7026242315638365e-07, + "loss": 0.4973, + "step": 18649 + }, + { + "epoch": 4.951015531660693, + "grad_norm": 0.4786519037140113, + "learning_rate": 3.700796012453406e-07, + "loss": 0.5724, + "step": 18650 + }, + { + "epoch": 4.951281030134076, + "grad_norm": 0.47887300831780266, + "learning_rate": 3.6989682087295946e-07, + "loss": 0.5099, + "step": 18651 + }, + { + "epoch": 4.9515465286074605, + "grad_norm": 0.4847848513893285, + "learning_rate": 3.697140820428063e-07, + "loss": 0.5332, + "step": 18652 + }, + { + "epoch": 4.951812027080845, + "grad_norm": 0.4807975944993597, + "learning_rate": 3.6953138475844446e-07, + "loss": 0.5629, + "step": 18653 + }, + { + "epoch": 4.952077525554228, + "grad_norm": 0.4904485424891691, + "learning_rate": 3.693487290234363e-07, + "loss": 0.5351, + "step": 18654 + }, + { + "epoch": 4.952343024027612, + "grad_norm": 0.47240343450083344, + "learning_rate": 3.69166114841345e-07, + "loss": 0.5528, + "step": 18655 + }, + { + "epoch": 4.952608522500996, + "grad_norm": 0.4836608213456476, + "learning_rate": 3.6898354221573145e-07, + "loss": 0.5528, + "step": 18656 + }, + { + "epoch": 4.952874020974379, + "grad_norm": 0.4888590013267215, + "learning_rate": 3.6880101115015695e-07, + "loss": 0.5611, + "step": 18657 + }, + { + "epoch": 4.953139519447763, + "grad_norm": 0.4756328682507688, + "learning_rate": 3.6861852164818054e-07, + "loss": 0.4875, + "step": 18658 + }, + { + "epoch": 4.953405017921147, + "grad_norm": 0.4729403962985496, + "learning_rate": 3.684360737133616e-07, + "loss": 0.5167, + "step": 18659 + }, + { + "epoch": 4.9536705163945305, + "grad_norm": 0.48789278788718127, + "learning_rate": 3.682536673492579e-07, + "loss": 0.5412, + "step": 18660 + }, + { + "epoch": 4.953936014867915, + "grad_norm": 0.4959424916247629, + "learning_rate": 3.6807130255942735e-07, + "loss": 0.5515, + "step": 18661 + }, + { + "epoch": 4.954201513341299, + "grad_norm": 0.4838180021637111, + "learning_rate": 3.678889793474258e-07, + "loss": 0.534, + "step": 18662 + }, + { + "epoch": 4.954467011814682, + "grad_norm": 0.48986983561832637, + "learning_rate": 3.6770669771681003e-07, + "loss": 0.4888, + "step": 18663 + }, + { + "epoch": 4.954732510288066, + "grad_norm": 0.4821046560728604, + "learning_rate": 3.6752445767113424e-07, + "loss": 0.5283, + "step": 18664 + }, + { + "epoch": 4.95499800876145, + "grad_norm": 0.4821905608386961, + "learning_rate": 3.673422592139525e-07, + "loss": 0.535, + "step": 18665 + }, + { + "epoch": 4.955263507234833, + "grad_norm": 0.4758972218483108, + "learning_rate": 3.671601023488186e-07, + "loss": 0.5463, + "step": 18666 + }, + { + "epoch": 4.955529005708217, + "grad_norm": 0.48950219014966956, + "learning_rate": 3.669779870792844e-07, + "loss": 0.5524, + "step": 18667 + }, + { + "epoch": 4.9557945041816005, + "grad_norm": 0.5017984247641869, + "learning_rate": 3.667959134089025e-07, + "loss": 0.5392, + "step": 18668 + }, + { + "epoch": 4.956060002654985, + "grad_norm": 0.4885380615647711, + "learning_rate": 3.6661388134122343e-07, + "loss": 0.549, + "step": 18669 + }, + { + "epoch": 4.956325501128369, + "grad_norm": 0.48157185421652493, + "learning_rate": 3.6643189087979704e-07, + "loss": 0.4585, + "step": 18670 + }, + { + "epoch": 4.956590999601753, + "grad_norm": 0.495493526089683, + "learning_rate": 3.6624994202817204e-07, + "loss": 0.5584, + "step": 18671 + }, + { + "epoch": 4.956856498075136, + "grad_norm": 0.4886310663845349, + "learning_rate": 3.660680347898979e-07, + "loss": 0.5748, + "step": 18672 + }, + { + "epoch": 4.95712199654852, + "grad_norm": 0.4818810479469065, + "learning_rate": 3.658861691685214e-07, + "loss": 0.5519, + "step": 18673 + }, + { + "epoch": 4.957387495021903, + "grad_norm": 0.47457472355625385, + "learning_rate": 3.6570434516758985e-07, + "loss": 0.5357, + "step": 18674 + }, + { + "epoch": 4.957652993495287, + "grad_norm": 0.49376838512270477, + "learning_rate": 3.655225627906503e-07, + "loss": 0.5243, + "step": 18675 + }, + { + "epoch": 4.957918491968671, + "grad_norm": 0.4825921600878026, + "learning_rate": 3.6534082204124577e-07, + "loss": 0.5052, + "step": 18676 + }, + { + "epoch": 4.958183990442055, + "grad_norm": 0.4937926286626435, + "learning_rate": 3.6515912292292235e-07, + "loss": 0.5628, + "step": 18677 + }, + { + "epoch": 4.958449488915439, + "grad_norm": 0.4842924655380837, + "learning_rate": 3.649774654392224e-07, + "loss": 0.5988, + "step": 18678 + }, + { + "epoch": 4.958714987388823, + "grad_norm": 0.47252435383699554, + "learning_rate": 3.647958495936901e-07, + "loss": 0.5293, + "step": 18679 + }, + { + "epoch": 4.958980485862206, + "grad_norm": 0.4937646149317888, + "learning_rate": 3.6461427538986574e-07, + "loss": 0.5307, + "step": 18680 + }, + { + "epoch": 4.95924598433559, + "grad_norm": 0.4859138426198157, + "learning_rate": 3.6443274283129215e-07, + "loss": 0.4955, + "step": 18681 + }, + { + "epoch": 4.959511482808974, + "grad_norm": 0.4823319912277644, + "learning_rate": 3.6425125192150856e-07, + "loss": 0.5136, + "step": 18682 + }, + { + "epoch": 4.959776981282357, + "grad_norm": 0.4923013154480677, + "learning_rate": 3.6406980266405464e-07, + "loss": 0.5313, + "step": 18683 + }, + { + "epoch": 4.960042479755741, + "grad_norm": 0.4968554047916396, + "learning_rate": 3.6388839506246943e-07, + "loss": 0.5593, + "step": 18684 + }, + { + "epoch": 4.9603079782291255, + "grad_norm": 0.4721044134151305, + "learning_rate": 3.6370702912029015e-07, + "loss": 0.495, + "step": 18685 + }, + { + "epoch": 4.960573476702509, + "grad_norm": 0.49744630719236416, + "learning_rate": 3.635257048410548e-07, + "loss": 0.5475, + "step": 18686 + }, + { + "epoch": 4.960838975175893, + "grad_norm": 0.47521184458999854, + "learning_rate": 3.633444222282995e-07, + "loss": 0.5199, + "step": 18687 + }, + { + "epoch": 4.961104473649277, + "grad_norm": 0.4783360711237551, + "learning_rate": 3.6316318128555903e-07, + "loss": 0.4891, + "step": 18688 + }, + { + "epoch": 4.96136997212266, + "grad_norm": 0.464569261198877, + "learning_rate": 3.62981982016368e-07, + "loss": 0.5092, + "step": 18689 + }, + { + "epoch": 4.961635470596044, + "grad_norm": 0.4909053327965359, + "learning_rate": 3.6280082442426146e-07, + "loss": 0.5281, + "step": 18690 + }, + { + "epoch": 4.961900969069428, + "grad_norm": 0.48548796997054855, + "learning_rate": 3.6261970851277056e-07, + "loss": 0.5389, + "step": 18691 + }, + { + "epoch": 4.962166467542811, + "grad_norm": 0.46895375443214554, + "learning_rate": 3.6243863428542943e-07, + "loss": 0.5035, + "step": 18692 + }, + { + "epoch": 4.9624319660161955, + "grad_norm": 0.4868794431415527, + "learning_rate": 3.622576017457685e-07, + "loss": 0.5326, + "step": 18693 + }, + { + "epoch": 4.96269746448958, + "grad_norm": 0.4847308208503863, + "learning_rate": 3.620766108973181e-07, + "loss": 0.5409, + "step": 18694 + }, + { + "epoch": 4.962962962962963, + "grad_norm": 0.4945754614439959, + "learning_rate": 3.618956617436087e-07, + "loss": 0.5375, + "step": 18695 + }, + { + "epoch": 4.963228461436347, + "grad_norm": 0.48248365827032014, + "learning_rate": 3.6171475428816825e-07, + "loss": 0.5582, + "step": 18696 + }, + { + "epoch": 4.96349395990973, + "grad_norm": 0.48600160698443134, + "learning_rate": 3.6153388853452597e-07, + "loss": 0.5374, + "step": 18697 + }, + { + "epoch": 4.963759458383114, + "grad_norm": 0.46848027781391877, + "learning_rate": 3.6135306448620897e-07, + "loss": 0.5311, + "step": 18698 + }, + { + "epoch": 4.964024956856498, + "grad_norm": 0.4877257568714426, + "learning_rate": 3.611722821467428e-07, + "loss": 0.5559, + "step": 18699 + }, + { + "epoch": 4.964290455329882, + "grad_norm": 0.48128282509655346, + "learning_rate": 3.6099154151965464e-07, + "loss": 0.5357, + "step": 18700 + }, + { + "epoch": 4.9645559538032655, + "grad_norm": 0.4846458357851168, + "learning_rate": 3.6081084260846807e-07, + "loss": 0.5463, + "step": 18701 + }, + { + "epoch": 4.96482145227665, + "grad_norm": 0.4789873477100868, + "learning_rate": 3.6063018541670797e-07, + "loss": 0.5443, + "step": 18702 + }, + { + "epoch": 4.965086950750033, + "grad_norm": 0.48242243812768804, + "learning_rate": 3.604495699478969e-07, + "loss": 0.5223, + "step": 18703 + }, + { + "epoch": 4.965352449223417, + "grad_norm": 0.4728823077510297, + "learning_rate": 3.602689962055589e-07, + "loss": 0.5364, + "step": 18704 + }, + { + "epoch": 4.965617947696801, + "grad_norm": 0.47092999274712277, + "learning_rate": 3.6008846419321343e-07, + "loss": 0.5207, + "step": 18705 + }, + { + "epoch": 4.965883446170184, + "grad_norm": 0.4803962799714533, + "learning_rate": 3.5990797391438266e-07, + "loss": 0.5783, + "step": 18706 + }, + { + "epoch": 4.966148944643568, + "grad_norm": 0.48301418984580624, + "learning_rate": 3.59727525372586e-07, + "loss": 0.5375, + "step": 18707 + }, + { + "epoch": 4.966414443116952, + "grad_norm": 0.4881872282437468, + "learning_rate": 3.595471185713431e-07, + "loss": 0.5293, + "step": 18708 + }, + { + "epoch": 4.9666799415903355, + "grad_norm": 0.4819097289209797, + "learning_rate": 3.5936675351417176e-07, + "loss": 0.5505, + "step": 18709 + }, + { + "epoch": 4.96694544006372, + "grad_norm": 0.49414411746553144, + "learning_rate": 3.5918643020459027e-07, + "loss": 0.545, + "step": 18710 + }, + { + "epoch": 4.967210938537104, + "grad_norm": 0.4750403270674448, + "learning_rate": 3.5900614864611524e-07, + "loss": 0.5613, + "step": 18711 + }, + { + "epoch": 4.967476437010487, + "grad_norm": 0.4747875007621468, + "learning_rate": 3.588259088422616e-07, + "loss": 0.513, + "step": 18712 + }, + { + "epoch": 4.967741935483871, + "grad_norm": 0.47660142663826566, + "learning_rate": 3.58645710796546e-07, + "loss": 0.5598, + "step": 18713 + }, + { + "epoch": 4.968007433957255, + "grad_norm": 0.4800378624514367, + "learning_rate": 3.5846555451248127e-07, + "loss": 0.5207, + "step": 18714 + }, + { + "epoch": 4.968272932430638, + "grad_norm": 0.4835767165732828, + "learning_rate": 3.582854399935823e-07, + "loss": 0.5502, + "step": 18715 + }, + { + "epoch": 4.968538430904022, + "grad_norm": 0.48918076458837106, + "learning_rate": 3.58105367243361e-07, + "loss": 0.5182, + "step": 18716 + }, + { + "epoch": 4.9688039293774064, + "grad_norm": 0.4796889758540998, + "learning_rate": 3.5792533626532926e-07, + "loss": 0.514, + "step": 18717 + }, + { + "epoch": 4.96906942785079, + "grad_norm": 0.48838156494881524, + "learning_rate": 3.577453470629977e-07, + "loss": 0.5412, + "step": 18718 + }, + { + "epoch": 4.969334926324174, + "grad_norm": 0.4854446964625709, + "learning_rate": 3.575653996398767e-07, + "loss": 0.5525, + "step": 18719 + }, + { + "epoch": 4.969600424797557, + "grad_norm": 0.4679947403581326, + "learning_rate": 3.5738549399947697e-07, + "loss": 0.5183, + "step": 18720 + }, + { + "epoch": 4.969865923270941, + "grad_norm": 0.4810974962719777, + "learning_rate": 3.5720563014530584e-07, + "loss": 0.5326, + "step": 18721 + }, + { + "epoch": 4.970131421744325, + "grad_norm": 0.48156671568334447, + "learning_rate": 3.570258080808717e-07, + "loss": 0.5424, + "step": 18722 + }, + { + "epoch": 4.970396920217709, + "grad_norm": 0.4814013087643834, + "learning_rate": 3.5684602780968023e-07, + "loss": 0.5131, + "step": 18723 + }, + { + "epoch": 4.970662418691092, + "grad_norm": 0.48730841724383445, + "learning_rate": 3.566662893352396e-07, + "loss": 0.5178, + "step": 18724 + }, + { + "epoch": 4.9709279171644765, + "grad_norm": 0.4903040381872622, + "learning_rate": 3.5648659266105306e-07, + "loss": 0.5462, + "step": 18725 + }, + { + "epoch": 4.97119341563786, + "grad_norm": 0.498251460336798, + "learning_rate": 3.563069377906273e-07, + "loss": 0.5628, + "step": 18726 + }, + { + "epoch": 4.971458914111244, + "grad_norm": 0.468467496122541, + "learning_rate": 3.561273247274644e-07, + "loss": 0.5489, + "step": 18727 + }, + { + "epoch": 4.971724412584628, + "grad_norm": 0.4743843567876023, + "learning_rate": 3.5594775347506744e-07, + "loss": 0.5056, + "step": 18728 + }, + { + "epoch": 4.971989911058012, + "grad_norm": 0.4728286225134879, + "learning_rate": 3.557682240369395e-07, + "loss": 0.5471, + "step": 18729 + }, + { + "epoch": 4.972255409531395, + "grad_norm": 0.47712501737487595, + "learning_rate": 3.5558873641658053e-07, + "loss": 0.535, + "step": 18730 + }, + { + "epoch": 4.972520908004779, + "grad_norm": 0.4880995126518145, + "learning_rate": 3.554092906174919e-07, + "loss": 0.5248, + "step": 18731 + }, + { + "epoch": 4.972786406478162, + "grad_norm": 0.4802526080105834, + "learning_rate": 3.5522988664317276e-07, + "loss": 0.5172, + "step": 18732 + }, + { + "epoch": 4.9730519049515465, + "grad_norm": 0.48035825422141615, + "learning_rate": 3.550505244971231e-07, + "loss": 0.5422, + "step": 18733 + }, + { + "epoch": 4.973317403424931, + "grad_norm": 0.49948058736344414, + "learning_rate": 3.548712041828389e-07, + "loss": 0.5496, + "step": 18734 + }, + { + "epoch": 4.973582901898314, + "grad_norm": 0.4810691356135354, + "learning_rate": 3.5469192570381895e-07, + "loss": 0.5364, + "step": 18735 + }, + { + "epoch": 4.973848400371698, + "grad_norm": 0.4919905711942463, + "learning_rate": 3.5451268906355863e-07, + "loss": 0.532, + "step": 18736 + }, + { + "epoch": 4.974113898845082, + "grad_norm": 0.4745660901866937, + "learning_rate": 3.5433349426555435e-07, + "loss": 0.5435, + "step": 18737 + }, + { + "epoch": 4.974379397318465, + "grad_norm": 0.46926626011927774, + "learning_rate": 3.5415434131329975e-07, + "loss": 0.5377, + "step": 18738 + }, + { + "epoch": 4.974644895791849, + "grad_norm": 0.48352491841440093, + "learning_rate": 3.539752302102903e-07, + "loss": 0.5355, + "step": 18739 + }, + { + "epoch": 4.974910394265233, + "grad_norm": 0.4892084017691704, + "learning_rate": 3.5379616096001793e-07, + "loss": 0.553, + "step": 18740 + }, + { + "epoch": 4.9751758927386165, + "grad_norm": 0.4709332902635177, + "learning_rate": 3.536171335659749e-07, + "loss": 0.5386, + "step": 18741 + }, + { + "epoch": 4.975441391212001, + "grad_norm": 0.49728074117137816, + "learning_rate": 3.5343814803165343e-07, + "loss": 0.5642, + "step": 18742 + }, + { + "epoch": 4.975706889685385, + "grad_norm": 0.47353901253014874, + "learning_rate": 3.5325920436054317e-07, + "loss": 0.5057, + "step": 18743 + }, + { + "epoch": 4.975972388158768, + "grad_norm": 0.48343660607836414, + "learning_rate": 3.530803025561352e-07, + "loss": 0.5089, + "step": 18744 + }, + { + "epoch": 4.976237886632152, + "grad_norm": 0.47010537631113863, + "learning_rate": 3.5290144262191793e-07, + "loss": 0.5004, + "step": 18745 + }, + { + "epoch": 4.976503385105536, + "grad_norm": 0.4854216007989131, + "learning_rate": 3.5272262456137907e-07, + "loss": 0.5064, + "step": 18746 + }, + { + "epoch": 4.976768883578919, + "grad_norm": 0.486344566893522, + "learning_rate": 3.5254384837800665e-07, + "loss": 0.5166, + "step": 18747 + }, + { + "epoch": 4.977034382052303, + "grad_norm": 0.47866713880850925, + "learning_rate": 3.5236511407528676e-07, + "loss": 0.5254, + "step": 18748 + }, + { + "epoch": 4.9772998805256865, + "grad_norm": 0.48422343487697445, + "learning_rate": 3.5218642165670615e-07, + "loss": 0.5174, + "step": 18749 + }, + { + "epoch": 4.977565378999071, + "grad_norm": 0.4775095961397552, + "learning_rate": 3.5200777112574887e-07, + "loss": 0.5485, + "step": 18750 + }, + { + "epoch": 4.977830877472455, + "grad_norm": 0.47522313546915, + "learning_rate": 3.5182916248589936e-07, + "loss": 0.5086, + "step": 18751 + }, + { + "epoch": 4.978096375945839, + "grad_norm": 0.47745446120793833, + "learning_rate": 3.516505957406402e-07, + "loss": 0.5324, + "step": 18752 + }, + { + "epoch": 4.978361874419222, + "grad_norm": 0.4768804011263821, + "learning_rate": 3.5147207089345517e-07, + "loss": 0.5602, + "step": 18753 + }, + { + "epoch": 4.978627372892606, + "grad_norm": 0.4837562991864166, + "learning_rate": 3.5129358794782437e-07, + "loss": 0.5799, + "step": 18754 + }, + { + "epoch": 4.978892871365989, + "grad_norm": 0.48729229971609395, + "learning_rate": 3.5111514690723047e-07, + "loss": 0.5647, + "step": 18755 + }, + { + "epoch": 4.979158369839373, + "grad_norm": 0.48636862146841775, + "learning_rate": 3.509367477751524e-07, + "loss": 0.5475, + "step": 18756 + }, + { + "epoch": 4.979423868312757, + "grad_norm": 0.480310178091797, + "learning_rate": 3.5075839055506904e-07, + "loss": 0.5531, + "step": 18757 + }, + { + "epoch": 4.979689366786141, + "grad_norm": 0.47727235095674175, + "learning_rate": 3.505800752504598e-07, + "loss": 0.4967, + "step": 18758 + }, + { + "epoch": 4.979954865259525, + "grad_norm": 0.48298450880407545, + "learning_rate": 3.5040180186480146e-07, + "loss": 0.5443, + "step": 18759 + }, + { + "epoch": 4.980220363732909, + "grad_norm": 0.48515029827444145, + "learning_rate": 3.5022357040157133e-07, + "loss": 0.5296, + "step": 18760 + }, + { + "epoch": 4.980485862206292, + "grad_norm": 0.48182284254107854, + "learning_rate": 3.500453808642448e-07, + "loss": 0.5453, + "step": 18761 + }, + { + "epoch": 4.980751360679676, + "grad_norm": 0.49290036154168454, + "learning_rate": 3.4986723325629826e-07, + "loss": 0.5473, + "step": 18762 + }, + { + "epoch": 4.98101685915306, + "grad_norm": 0.4689320893074932, + "learning_rate": 3.496891275812042e-07, + "loss": 0.537, + "step": 18763 + }, + { + "epoch": 4.981282357626443, + "grad_norm": 0.4864500505214823, + "learning_rate": 3.4951106384243705e-07, + "loss": 0.5539, + "step": 18764 + }, + { + "epoch": 4.981547856099827, + "grad_norm": 0.4788829032691235, + "learning_rate": 3.493330420434698e-07, + "loss": 0.5402, + "step": 18765 + }, + { + "epoch": 4.9818133545732115, + "grad_norm": 0.49873468843221863, + "learning_rate": 3.491550621877737e-07, + "loss": 0.5356, + "step": 18766 + }, + { + "epoch": 4.982078853046595, + "grad_norm": 0.47589641311019887, + "learning_rate": 3.489771242788209e-07, + "loss": 0.4985, + "step": 18767 + }, + { + "epoch": 4.982344351519979, + "grad_norm": 0.47754316253841833, + "learning_rate": 3.4879922832007973e-07, + "loss": 0.514, + "step": 18768 + }, + { + "epoch": 4.982609849993363, + "grad_norm": 0.49102389800488977, + "learning_rate": 3.486213743150213e-07, + "loss": 0.5825, + "step": 18769 + }, + { + "epoch": 4.982875348466746, + "grad_norm": 0.47979077965055483, + "learning_rate": 3.4844356226711313e-07, + "loss": 0.5403, + "step": 18770 + }, + { + "epoch": 4.98314084694013, + "grad_norm": 0.4891051746517009, + "learning_rate": 3.482657921798235e-07, + "loss": 0.5178, + "step": 18771 + }, + { + "epoch": 4.983406345413514, + "grad_norm": 0.4716265052757659, + "learning_rate": 3.4808806405661924e-07, + "loss": 0.4943, + "step": 18772 + }, + { + "epoch": 4.983671843886897, + "grad_norm": 0.492127212378581, + "learning_rate": 3.479103779009668e-07, + "loss": 0.5242, + "step": 18773 + }, + { + "epoch": 4.9839373423602815, + "grad_norm": 0.4723678385591992, + "learning_rate": 3.477327337163314e-07, + "loss": 0.5233, + "step": 18774 + }, + { + "epoch": 4.984202840833666, + "grad_norm": 0.48689285178041564, + "learning_rate": 3.475551315061765e-07, + "loss": 0.5419, + "step": 18775 + }, + { + "epoch": 4.984468339307049, + "grad_norm": 0.4942897862841746, + "learning_rate": 3.473775712739674e-07, + "loss": 0.5717, + "step": 18776 + }, + { + "epoch": 4.984733837780433, + "grad_norm": 0.47805759576703044, + "learning_rate": 3.4720005302316563e-07, + "loss": 0.529, + "step": 18777 + }, + { + "epoch": 4.984999336253816, + "grad_norm": 0.4976608598096262, + "learning_rate": 3.470225767572344e-07, + "loss": 0.5601, + "step": 18778 + }, + { + "epoch": 4.9852648347272, + "grad_norm": 0.470813742974377, + "learning_rate": 3.46845142479634e-07, + "loss": 0.5129, + "step": 18779 + }, + { + "epoch": 4.985530333200584, + "grad_norm": 0.4828222664911521, + "learning_rate": 3.466677501938254e-07, + "loss": 0.5034, + "step": 18780 + }, + { + "epoch": 4.985795831673968, + "grad_norm": 0.4783921888168491, + "learning_rate": 3.4649039990326735e-07, + "loss": 0.5326, + "step": 18781 + }, + { + "epoch": 4.9860613301473515, + "grad_norm": 0.468635628011706, + "learning_rate": 3.463130916114199e-07, + "loss": 0.5364, + "step": 18782 + }, + { + "epoch": 4.986326828620736, + "grad_norm": 0.47604844788297507, + "learning_rate": 3.461358253217395e-07, + "loss": 0.512, + "step": 18783 + }, + { + "epoch": 4.986592327094119, + "grad_norm": 0.47740841774481096, + "learning_rate": 3.459586010376847e-07, + "loss": 0.5148, + "step": 18784 + }, + { + "epoch": 4.986857825567503, + "grad_norm": 0.48359404501110004, + "learning_rate": 3.457814187627112e-07, + "loss": 0.4995, + "step": 18785 + }, + { + "epoch": 4.987123324040887, + "grad_norm": 0.48335223197119337, + "learning_rate": 3.4560427850027415e-07, + "loss": 0.5391, + "step": 18786 + }, + { + "epoch": 4.98738882251427, + "grad_norm": 0.470493980271917, + "learning_rate": 3.454271802538289e-07, + "loss": 0.5346, + "step": 18787 + }, + { + "epoch": 4.987654320987654, + "grad_norm": 0.48460474033786055, + "learning_rate": 3.4525012402682826e-07, + "loss": 0.5463, + "step": 18788 + }, + { + "epoch": 4.987919819461038, + "grad_norm": 0.4784347771818127, + "learning_rate": 3.4507310982272666e-07, + "loss": 0.5172, + "step": 18789 + }, + { + "epoch": 4.9881853179344215, + "grad_norm": 0.49480991019427667, + "learning_rate": 3.4489613764497497e-07, + "loss": 0.5693, + "step": 18790 + }, + { + "epoch": 4.988450816407806, + "grad_norm": 0.4810477804170264, + "learning_rate": 3.447192074970257e-07, + "loss": 0.5571, + "step": 18791 + }, + { + "epoch": 4.98871631488119, + "grad_norm": 0.4822363258022917, + "learning_rate": 3.445423193823291e-07, + "loss": 0.5498, + "step": 18792 + }, + { + "epoch": 4.988981813354573, + "grad_norm": 0.49220510717359184, + "learning_rate": 3.443654733043339e-07, + "loss": 0.5543, + "step": 18793 + }, + { + "epoch": 4.989247311827957, + "grad_norm": 0.48231534486548044, + "learning_rate": 3.441886692664906e-07, + "loss": 0.5184, + "step": 18794 + }, + { + "epoch": 4.989512810301341, + "grad_norm": 0.47782981980740347, + "learning_rate": 3.4401190727224613e-07, + "loss": 0.5309, + "step": 18795 + }, + { + "epoch": 4.989778308774724, + "grad_norm": 0.48630838268734555, + "learning_rate": 3.438351873250492e-07, + "loss": 0.4796, + "step": 18796 + }, + { + "epoch": 4.990043807248108, + "grad_norm": 0.4934706032721086, + "learning_rate": 3.436585094283443e-07, + "loss": 0.5266, + "step": 18797 + }, + { + "epoch": 4.9903093057214924, + "grad_norm": 0.4869275946686896, + "learning_rate": 3.4348187358557857e-07, + "loss": 0.542, + "step": 18798 + }, + { + "epoch": 4.990574804194876, + "grad_norm": 0.4805067624099405, + "learning_rate": 3.4330527980019594e-07, + "loss": 0.5601, + "step": 18799 + }, + { + "epoch": 4.99084030266826, + "grad_norm": 0.48307008859608574, + "learning_rate": 3.4312872807564123e-07, + "loss": 0.5455, + "step": 18800 + }, + { + "epoch": 4.991105801141644, + "grad_norm": 0.4823304534151557, + "learning_rate": 3.4295221841535694e-07, + "loss": 0.5639, + "step": 18801 + }, + { + "epoch": 4.991371299615027, + "grad_norm": 0.48811325987571563, + "learning_rate": 3.427757508227861e-07, + "loss": 0.5719, + "step": 18802 + }, + { + "epoch": 4.991636798088411, + "grad_norm": 0.4833029648962473, + "learning_rate": 3.4259932530136987e-07, + "loss": 0.5711, + "step": 18803 + }, + { + "epoch": 4.991902296561795, + "grad_norm": 0.4933835588482694, + "learning_rate": 3.4242294185454856e-07, + "loss": 0.5501, + "step": 18804 + }, + { + "epoch": 4.992167795035178, + "grad_norm": 0.463343344995082, + "learning_rate": 3.422466004857633e-07, + "loss": 0.4921, + "step": 18805 + }, + { + "epoch": 4.9924332935085625, + "grad_norm": 0.48564498302134157, + "learning_rate": 3.4207030119845164e-07, + "loss": 0.5308, + "step": 18806 + }, + { + "epoch": 4.992698791981946, + "grad_norm": 0.4970449323807783, + "learning_rate": 3.4189404399605304e-07, + "loss": 0.568, + "step": 18807 + }, + { + "epoch": 4.99296429045533, + "grad_norm": 0.48632847286141584, + "learning_rate": 3.4171782888200474e-07, + "loss": 0.5541, + "step": 18808 + }, + { + "epoch": 4.993229788928714, + "grad_norm": 0.47516016234793, + "learning_rate": 3.4154165585974315e-07, + "loss": 0.523, + "step": 18809 + }, + { + "epoch": 4.993495287402098, + "grad_norm": 0.48970214863871936, + "learning_rate": 3.4136552493270356e-07, + "loss": 0.5328, + "step": 18810 + }, + { + "epoch": 4.993760785875481, + "grad_norm": 0.4909447820809886, + "learning_rate": 3.4118943610432166e-07, + "loss": 0.5708, + "step": 18811 + }, + { + "epoch": 4.994026284348865, + "grad_norm": 0.4745921013786922, + "learning_rate": 3.410133893780315e-07, + "loss": 0.5025, + "step": 18812 + }, + { + "epoch": 4.994291782822248, + "grad_norm": 0.47901306157195167, + "learning_rate": 3.408373847572663e-07, + "loss": 0.5119, + "step": 18813 + }, + { + "epoch": 4.9945572812956325, + "grad_norm": 0.48667431313675213, + "learning_rate": 3.4066142224545935e-07, + "loss": 0.5368, + "step": 18814 + }, + { + "epoch": 4.994822779769017, + "grad_norm": 0.4831763553290102, + "learning_rate": 3.4048550184604095e-07, + "loss": 0.5381, + "step": 18815 + }, + { + "epoch": 4.9950882782424, + "grad_norm": 0.4883589707403283, + "learning_rate": 3.4030962356244286e-07, + "loss": 0.5636, + "step": 18816 + }, + { + "epoch": 4.995353776715784, + "grad_norm": 0.47951061930060734, + "learning_rate": 3.401337873980945e-07, + "loss": 0.5282, + "step": 18817 + }, + { + "epoch": 4.995619275189168, + "grad_norm": 0.5011478772790252, + "learning_rate": 3.3995799335642623e-07, + "loss": 0.5594, + "step": 18818 + }, + { + "epoch": 4.995884773662551, + "grad_norm": 0.4898536737671288, + "learning_rate": 3.3978224144086553e-07, + "loss": 0.5723, + "step": 18819 + }, + { + "epoch": 4.996150272135935, + "grad_norm": 0.47160743428203444, + "learning_rate": 3.3960653165484e-07, + "loss": 0.5444, + "step": 18820 + }, + { + "epoch": 4.996415770609319, + "grad_norm": 0.4799995902215757, + "learning_rate": 3.3943086400177684e-07, + "loss": 0.5698, + "step": 18821 + }, + { + "epoch": 4.9966812690827025, + "grad_norm": 0.4886629222639214, + "learning_rate": 3.3925523848510166e-07, + "loss": 0.5366, + "step": 18822 + }, + { + "epoch": 4.996946767556087, + "grad_norm": 0.4796562879291048, + "learning_rate": 3.3907965510824007e-07, + "loss": 0.534, + "step": 18823 + }, + { + "epoch": 4.997212266029471, + "grad_norm": 0.4979055413256308, + "learning_rate": 3.3890411387461545e-07, + "loss": 0.5084, + "step": 18824 + }, + { + "epoch": 4.997477764502854, + "grad_norm": 0.4765304279572943, + "learning_rate": 3.387286147876531e-07, + "loss": 0.5226, + "step": 18825 + }, + { + "epoch": 4.997743262976238, + "grad_norm": 0.4794582167661844, + "learning_rate": 3.385531578507734e-07, + "loss": 0.5682, + "step": 18826 + }, + { + "epoch": 4.998008761449622, + "grad_norm": 0.47680896308803433, + "learning_rate": 3.3837774306739985e-07, + "loss": 0.521, + "step": 18827 + }, + { + "epoch": 4.998274259923005, + "grad_norm": 0.4890485813167451, + "learning_rate": 3.3820237044095203e-07, + "loss": 0.5207, + "step": 18828 + }, + { + "epoch": 4.998539758396389, + "grad_norm": 0.4664643056741581, + "learning_rate": 3.380270399748517e-07, + "loss": 0.5292, + "step": 18829 + }, + { + "epoch": 4.9988052568697725, + "grad_norm": 0.6349181820465865, + "learning_rate": 3.378517516725172e-07, + "loss": 0.5385, + "step": 18830 + } + ], + "logging_steps": 1, + "max_steps": 22596, + "num_input_tokens_seen": 0, + "num_train_epochs": 6, + "save_steps": 3766, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2429043891240960.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}