diff --git "a/DIAL-3B-fulldata/trainer_state.json" "b/DIAL-3B-fulldata/trainer_state.json" new file mode 100644--- /dev/null +++ "b/DIAL-3B-fulldata/trainer_state.json" @@ -0,0 +1,56034 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.556029692847935, + "eval_steps": 500, + "global_step": 80000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0004445037116059919, + "grad_norm": 0.27257904410362244, + "learning_rate": 2.25e-07, + "loss": 0.2673, + "step": 10 + }, + { + "epoch": 0.0008890074232119838, + "grad_norm": 0.15932303667068481, + "learning_rate": 4.75e-07, + "loss": 0.2688, + "step": 20 + }, + { + "epoch": 0.0013335111348179757, + "grad_norm": 0.3036741018295288, + "learning_rate": 7.25e-07, + "loss": 0.27, + "step": 30 + }, + { + "epoch": 0.0017780148464239677, + "grad_norm": 0.1513253003358841, + "learning_rate": 9.75e-07, + "loss": 0.2663, + "step": 40 + }, + { + "epoch": 0.0022225185580299596, + "grad_norm": 0.1699444055557251, + "learning_rate": 1.2250000000000001e-06, + "loss": 0.2666, + "step": 50 + }, + { + "epoch": 0.0026670222696359514, + "grad_norm": 0.1670369654893875, + "learning_rate": 1.475e-06, + "loss": 0.268, + "step": 60 + }, + { + "epoch": 0.0031115259812419436, + "grad_norm": 0.1640091836452484, + "learning_rate": 1.7250000000000002e-06, + "loss": 0.2622, + "step": 70 + }, + { + "epoch": 0.0035560296928479353, + "grad_norm": 0.1570144146680832, + "learning_rate": 1.975e-06, + "loss": 0.2674, + "step": 80 + }, + { + "epoch": 0.0040005334044539275, + "grad_norm": 0.14213776588439941, + "learning_rate": 2.225e-06, + "loss": 0.2669, + "step": 90 + }, + { + "epoch": 0.004445037116059919, + "grad_norm": 0.16105836629867554, + "learning_rate": 2.4750000000000004e-06, + "loss": 0.2689, + "step": 100 + }, + { + "epoch": 0.004889540827665911, + "grad_norm": 0.15249909460544586, + "learning_rate": 2.725e-06, + "loss": 0.2703, + "step": 110 + }, + { + "epoch": 0.005334044539271903, + "grad_norm": 0.16157744824886322, + "learning_rate": 2.975e-06, + "loss": 0.2693, + "step": 120 + }, + { + "epoch": 0.0057785482508778945, + "grad_norm": 0.17197813093662262, + "learning_rate": 3.225e-06, + "loss": 0.2681, + "step": 130 + }, + { + "epoch": 0.006223051962483887, + "grad_norm": 0.1652216613292694, + "learning_rate": 3.4750000000000006e-06, + "loss": 0.2679, + "step": 140 + }, + { + "epoch": 0.006667555674089879, + "grad_norm": 0.20123358070850372, + "learning_rate": 3.725e-06, + "loss": 0.2685, + "step": 150 + }, + { + "epoch": 0.007112059385695871, + "grad_norm": 0.17993593215942383, + "learning_rate": 3.975e-06, + "loss": 0.2688, + "step": 160 + }, + { + "epoch": 0.007556563097301862, + "grad_norm": 0.1702893078327179, + "learning_rate": 4.225e-06, + "loss": 0.2676, + "step": 170 + }, + { + "epoch": 0.008001066808907855, + "grad_norm": 0.17351070046424866, + "learning_rate": 4.475e-06, + "loss": 0.2694, + "step": 180 + }, + { + "epoch": 0.008445570520513847, + "grad_norm": 0.2104254513978958, + "learning_rate": 4.7250000000000005e-06, + "loss": 0.2678, + "step": 190 + }, + { + "epoch": 0.008890074232119839, + "grad_norm": 0.1667654812335968, + "learning_rate": 4.975000000000001e-06, + "loss": 0.2661, + "step": 200 + }, + { + "epoch": 0.00933457794372583, + "grad_norm": 0.16590003669261932, + "learning_rate": 5.225e-06, + "loss": 0.2687, + "step": 210 + }, + { + "epoch": 0.009779081655331822, + "grad_norm": 0.194223091006279, + "learning_rate": 5.475e-06, + "loss": 0.27, + "step": 220 + }, + { + "epoch": 0.010223585366937814, + "grad_norm": 0.17568840086460114, + "learning_rate": 5.725e-06, + "loss": 0.2683, + "step": 230 + }, + { + "epoch": 0.010668089078543806, + "grad_norm": 0.17286579310894012, + "learning_rate": 5.975e-06, + "loss": 0.2699, + "step": 240 + }, + { + "epoch": 0.011112592790149797, + "grad_norm": 0.20387697219848633, + "learning_rate": 6.2250000000000005e-06, + "loss": 0.2684, + "step": 250 + }, + { + "epoch": 0.011557096501755789, + "grad_norm": 0.17565008997917175, + "learning_rate": 6.475000000000001e-06, + "loss": 0.2683, + "step": 260 + }, + { + "epoch": 0.01200160021336178, + "grad_norm": 0.19339275360107422, + "learning_rate": 6.725000000000001e-06, + "loss": 0.2687, + "step": 270 + }, + { + "epoch": 0.012446103924967774, + "grad_norm": 0.23457615077495575, + "learning_rate": 6.975000000000001e-06, + "loss": 0.2678, + "step": 280 + }, + { + "epoch": 0.012890607636573766, + "grad_norm": 0.20514200627803802, + "learning_rate": 7.2249999999999994e-06, + "loss": 0.2681, + "step": 290 + }, + { + "epoch": 0.013335111348179758, + "grad_norm": 0.1985008418560028, + "learning_rate": 7.4750000000000004e-06, + "loss": 0.2705, + "step": 300 + }, + { + "epoch": 0.01377961505978575, + "grad_norm": 0.182047501206398, + "learning_rate": 7.725e-06, + "loss": 0.2686, + "step": 310 + }, + { + "epoch": 0.014224118771391741, + "grad_norm": 0.21274584531784058, + "learning_rate": 7.975e-06, + "loss": 0.2663, + "step": 320 + }, + { + "epoch": 0.014668622482997733, + "grad_norm": 0.1846754103899002, + "learning_rate": 8.225e-06, + "loss": 0.2666, + "step": 330 + }, + { + "epoch": 0.015113126194603725, + "grad_norm": 0.1793072521686554, + "learning_rate": 8.475000000000001e-06, + "loss": 0.2715, + "step": 340 + }, + { + "epoch": 0.015557629906209717, + "grad_norm": 0.1749434471130371, + "learning_rate": 8.725e-06, + "loss": 0.268, + "step": 350 + }, + { + "epoch": 0.01600213361781571, + "grad_norm": 0.1718384027481079, + "learning_rate": 8.975e-06, + "loss": 0.2682, + "step": 360 + }, + { + "epoch": 0.016446637329421702, + "grad_norm": 0.1862325370311737, + "learning_rate": 9.225e-06, + "loss": 0.269, + "step": 370 + }, + { + "epoch": 0.016891141041027694, + "grad_norm": 0.19629086554050446, + "learning_rate": 9.475e-06, + "loss": 0.2701, + "step": 380 + }, + { + "epoch": 0.017335644752633685, + "grad_norm": 0.221978560090065, + "learning_rate": 9.725000000000001e-06, + "loss": 0.2696, + "step": 390 + }, + { + "epoch": 0.017780148464239677, + "grad_norm": 0.1763482242822647, + "learning_rate": 9.975e-06, + "loss": 0.2698, + "step": 400 + }, + { + "epoch": 0.01822465217584567, + "grad_norm": 0.16576530039310455, + "learning_rate": 1.0225e-05, + "loss": 0.2693, + "step": 410 + }, + { + "epoch": 0.01866915588745166, + "grad_norm": 0.1842678338289261, + "learning_rate": 1.0475e-05, + "loss": 0.2679, + "step": 420 + }, + { + "epoch": 0.019113659599057652, + "grad_norm": 0.16513147950172424, + "learning_rate": 1.0725e-05, + "loss": 0.2698, + "step": 430 + }, + { + "epoch": 0.019558163310663644, + "grad_norm": 0.1781226545572281, + "learning_rate": 1.0975e-05, + "loss": 0.2689, + "step": 440 + }, + { + "epoch": 0.020002667022269636, + "grad_norm": 0.17886944115161896, + "learning_rate": 1.1225e-05, + "loss": 0.2689, + "step": 450 + }, + { + "epoch": 0.020447170733875628, + "grad_norm": 0.1522589921951294, + "learning_rate": 1.1475000000000001e-05, + "loss": 0.2733, + "step": 460 + }, + { + "epoch": 0.02089167444548162, + "grad_norm": 0.21575194597244263, + "learning_rate": 1.1725e-05, + "loss": 0.2693, + "step": 470 + }, + { + "epoch": 0.02133617815708761, + "grad_norm": 0.15968771278858185, + "learning_rate": 1.1975e-05, + "loss": 0.2681, + "step": 480 + }, + { + "epoch": 0.021780681868693603, + "grad_norm": 0.19757743179798126, + "learning_rate": 1.2225e-05, + "loss": 0.2679, + "step": 490 + }, + { + "epoch": 0.022225185580299595, + "grad_norm": 0.1788386106491089, + "learning_rate": 1.2475e-05, + "loss": 0.2695, + "step": 500 + }, + { + "epoch": 0.022669689291905586, + "grad_norm": 0.20420639216899872, + "learning_rate": 1.2725000000000001e-05, + "loss": 0.271, + "step": 510 + }, + { + "epoch": 0.023114193003511578, + "grad_norm": 0.18267051875591278, + "learning_rate": 1.2975e-05, + "loss": 0.2671, + "step": 520 + }, + { + "epoch": 0.02355869671511757, + "grad_norm": 0.23184366524219513, + "learning_rate": 1.3225000000000001e-05, + "loss": 0.2708, + "step": 530 + }, + { + "epoch": 0.02400320042672356, + "grad_norm": 0.1899673491716385, + "learning_rate": 1.3475000000000002e-05, + "loss": 0.2691, + "step": 540 + }, + { + "epoch": 0.024447704138329553, + "grad_norm": 0.1542953997850418, + "learning_rate": 1.3725000000000002e-05, + "loss": 0.2673, + "step": 550 + }, + { + "epoch": 0.02489220784993555, + "grad_norm": 0.19715306162834167, + "learning_rate": 1.3975000000000003e-05, + "loss": 0.2716, + "step": 560 + }, + { + "epoch": 0.02533671156154154, + "grad_norm": 0.22989854216575623, + "learning_rate": 1.4225e-05, + "loss": 0.2714, + "step": 570 + }, + { + "epoch": 0.025781215273147532, + "grad_norm": 0.19974440336227417, + "learning_rate": 1.4475e-05, + "loss": 0.2674, + "step": 580 + }, + { + "epoch": 0.026225718984753524, + "grad_norm": 0.19189710915088654, + "learning_rate": 1.4725e-05, + "loss": 0.267, + "step": 590 + }, + { + "epoch": 0.026670222696359516, + "grad_norm": 0.18281002342700958, + "learning_rate": 1.4975e-05, + "loss": 0.2693, + "step": 600 + }, + { + "epoch": 0.027114726407965507, + "grad_norm": 0.257772833108902, + "learning_rate": 1.5225e-05, + "loss": 0.2717, + "step": 610 + }, + { + "epoch": 0.0275592301195715, + "grad_norm": 0.22507594525814056, + "learning_rate": 1.5475e-05, + "loss": 0.2682, + "step": 620 + }, + { + "epoch": 0.02800373383117749, + "grad_norm": 0.20441624522209167, + "learning_rate": 1.5725e-05, + "loss": 0.2692, + "step": 630 + }, + { + "epoch": 0.028448237542783483, + "grad_norm": 0.20385460555553436, + "learning_rate": 1.5975000000000002e-05, + "loss": 0.2695, + "step": 640 + }, + { + "epoch": 0.028892741254389474, + "grad_norm": 0.18915215134620667, + "learning_rate": 1.6225e-05, + "loss": 0.2705, + "step": 650 + }, + { + "epoch": 0.029337244965995466, + "grad_norm": 0.18421444296836853, + "learning_rate": 1.6475e-05, + "loss": 0.2687, + "step": 660 + }, + { + "epoch": 0.029781748677601458, + "grad_norm": 0.19780774414539337, + "learning_rate": 1.6725000000000003e-05, + "loss": 0.2715, + "step": 670 + }, + { + "epoch": 0.03022625238920745, + "grad_norm": 0.2228350192308426, + "learning_rate": 1.6975000000000003e-05, + "loss": 0.2713, + "step": 680 + }, + { + "epoch": 0.03067075610081344, + "grad_norm": 0.20042815804481506, + "learning_rate": 1.7225e-05, + "loss": 0.2689, + "step": 690 + }, + { + "epoch": 0.031115259812419433, + "grad_norm": 0.18138141930103302, + "learning_rate": 1.7475e-05, + "loss": 0.2707, + "step": 700 + }, + { + "epoch": 0.031559763524025425, + "grad_norm": 0.17177993059158325, + "learning_rate": 1.7725e-05, + "loss": 0.274, + "step": 710 + }, + { + "epoch": 0.03200426723563142, + "grad_norm": 0.18917720019817352, + "learning_rate": 1.7975e-05, + "loss": 0.2711, + "step": 720 + }, + { + "epoch": 0.03244877094723741, + "grad_norm": 0.19221854209899902, + "learning_rate": 1.8225e-05, + "loss": 0.2696, + "step": 730 + }, + { + "epoch": 0.032893274658843404, + "grad_norm": 0.18832199275493622, + "learning_rate": 1.8475000000000002e-05, + "loss": 0.2699, + "step": 740 + }, + { + "epoch": 0.03333777837044939, + "grad_norm": 0.19060440361499786, + "learning_rate": 1.8725e-05, + "loss": 0.2699, + "step": 750 + }, + { + "epoch": 0.03378228208205539, + "grad_norm": 0.2168068140745163, + "learning_rate": 1.8975e-05, + "loss": 0.2693, + "step": 760 + }, + { + "epoch": 0.034226785793661375, + "grad_norm": 0.19331899285316467, + "learning_rate": 1.9225e-05, + "loss": 0.2709, + "step": 770 + }, + { + "epoch": 0.03467128950526737, + "grad_norm": 0.20940814912319183, + "learning_rate": 1.9475000000000002e-05, + "loss": 0.2704, + "step": 780 + }, + { + "epoch": 0.03511579321687336, + "grad_norm": 0.20360715687274933, + "learning_rate": 1.9725000000000002e-05, + "loss": 0.2693, + "step": 790 + }, + { + "epoch": 0.035560296928479354, + "grad_norm": 0.2004312425851822, + "learning_rate": 1.9975e-05, + "loss": 0.2717, + "step": 800 + }, + { + "epoch": 0.03600480064008534, + "grad_norm": 0.18645116686820984, + "learning_rate": 2.0225000000000004e-05, + "loss": 0.271, + "step": 810 + }, + { + "epoch": 0.03644930435169134, + "grad_norm": 0.18933255970478058, + "learning_rate": 2.0475e-05, + "loss": 0.2732, + "step": 820 + }, + { + "epoch": 0.036893808063297326, + "grad_norm": 0.17650452256202698, + "learning_rate": 2.0725e-05, + "loss": 0.2731, + "step": 830 + }, + { + "epoch": 0.03733831177490332, + "grad_norm": 0.18876120448112488, + "learning_rate": 2.0975e-05, + "loss": 0.2661, + "step": 840 + }, + { + "epoch": 0.03778281548650931, + "grad_norm": 0.20079544186592102, + "learning_rate": 2.1225e-05, + "loss": 0.2694, + "step": 850 + }, + { + "epoch": 0.038227319198115305, + "grad_norm": 0.25142839550971985, + "learning_rate": 2.1475e-05, + "loss": 0.2707, + "step": 860 + }, + { + "epoch": 0.03867182290972129, + "grad_norm": 0.20403754711151123, + "learning_rate": 2.1725e-05, + "loss": 0.2703, + "step": 870 + }, + { + "epoch": 0.03911632662132729, + "grad_norm": 0.19312487542629242, + "learning_rate": 2.1975000000000002e-05, + "loss": 0.269, + "step": 880 + }, + { + "epoch": 0.03956083033293328, + "grad_norm": 0.2282416671514511, + "learning_rate": 2.2225e-05, + "loss": 0.27, + "step": 890 + }, + { + "epoch": 0.04000533404453927, + "grad_norm": 0.20105138421058655, + "learning_rate": 2.2475e-05, + "loss": 0.274, + "step": 900 + }, + { + "epoch": 0.04044983775614527, + "grad_norm": 0.19247089326381683, + "learning_rate": 2.2725000000000003e-05, + "loss": 0.271, + "step": 910 + }, + { + "epoch": 0.040894341467751255, + "grad_norm": 0.20758488774299622, + "learning_rate": 2.2975000000000003e-05, + "loss": 0.2711, + "step": 920 + }, + { + "epoch": 0.04133884517935725, + "grad_norm": 0.16415567696094513, + "learning_rate": 2.3225000000000002e-05, + "loss": 0.2709, + "step": 930 + }, + { + "epoch": 0.04178334889096324, + "grad_norm": 0.1953057497739792, + "learning_rate": 2.3475e-05, + "loss": 0.2723, + "step": 940 + }, + { + "epoch": 0.042227852602569234, + "grad_norm": 0.21941415965557098, + "learning_rate": 2.3725e-05, + "loss": 0.2748, + "step": 950 + }, + { + "epoch": 0.04267235631417522, + "grad_norm": 0.20610283315181732, + "learning_rate": 2.3975e-05, + "loss": 0.2718, + "step": 960 + }, + { + "epoch": 0.04311686002578122, + "grad_norm": 0.22166700661182404, + "learning_rate": 2.4225e-05, + "loss": 0.2691, + "step": 970 + }, + { + "epoch": 0.043561363737387206, + "grad_norm": 0.19174064695835114, + "learning_rate": 2.4475000000000002e-05, + "loss": 0.2716, + "step": 980 + }, + { + "epoch": 0.0440058674489932, + "grad_norm": 0.17838400602340698, + "learning_rate": 2.4725e-05, + "loss": 0.2737, + "step": 990 + }, + { + "epoch": 0.04445037116059919, + "grad_norm": 0.20878177881240845, + "learning_rate": 2.4975e-05, + "loss": 0.2712, + "step": 1000 + }, + { + "epoch": 0.044894874872205184, + "grad_norm": 0.2028988152742386, + "learning_rate": 2.5225e-05, + "loss": 0.2699, + "step": 1010 + }, + { + "epoch": 0.04533937858381117, + "grad_norm": 0.20200198888778687, + "learning_rate": 2.5475e-05, + "loss": 0.2707, + "step": 1020 + }, + { + "epoch": 0.04578388229541717, + "grad_norm": 0.19882875680923462, + "learning_rate": 2.5725e-05, + "loss": 0.2729, + "step": 1030 + }, + { + "epoch": 0.046228386007023156, + "grad_norm": 0.19507187604904175, + "learning_rate": 2.5974999999999998e-05, + "loss": 0.2691, + "step": 1040 + }, + { + "epoch": 0.04667288971862915, + "grad_norm": 0.17349746823310852, + "learning_rate": 2.6225e-05, + "loss": 0.2718, + "step": 1050 + }, + { + "epoch": 0.04711739343023514, + "grad_norm": 0.21074552834033966, + "learning_rate": 2.6475e-05, + "loss": 0.2732, + "step": 1060 + }, + { + "epoch": 0.047561897141841135, + "grad_norm": 0.2570444345474243, + "learning_rate": 2.6725e-05, + "loss": 0.273, + "step": 1070 + }, + { + "epoch": 0.04800640085344712, + "grad_norm": 0.21584808826446533, + "learning_rate": 2.6975000000000002e-05, + "loss": 0.2735, + "step": 1080 + }, + { + "epoch": 0.04845090456505312, + "grad_norm": 0.20628534257411957, + "learning_rate": 2.7225e-05, + "loss": 0.2731, + "step": 1090 + }, + { + "epoch": 0.04889540827665911, + "grad_norm": 0.23713749647140503, + "learning_rate": 2.7475e-05, + "loss": 0.2696, + "step": 1100 + }, + { + "epoch": 0.0493399119882651, + "grad_norm": 0.2100566327571869, + "learning_rate": 2.7725e-05, + "loss": 0.2722, + "step": 1110 + }, + { + "epoch": 0.0497844156998711, + "grad_norm": 0.1792062371969223, + "learning_rate": 2.7975000000000002e-05, + "loss": 0.273, + "step": 1120 + }, + { + "epoch": 0.050228919411477085, + "grad_norm": 0.21431782841682434, + "learning_rate": 2.8225e-05, + "loss": 0.2748, + "step": 1130 + }, + { + "epoch": 0.05067342312308308, + "grad_norm": 0.21538813412189484, + "learning_rate": 2.8475e-05, + "loss": 0.2739, + "step": 1140 + }, + { + "epoch": 0.05111792683468907, + "grad_norm": 0.17332017421722412, + "learning_rate": 2.8725e-05, + "loss": 0.2709, + "step": 1150 + }, + { + "epoch": 0.051562430546295064, + "grad_norm": 0.21509426832199097, + "learning_rate": 2.8975000000000003e-05, + "loss": 0.2717, + "step": 1160 + }, + { + "epoch": 0.05200693425790105, + "grad_norm": 0.2128811478614807, + "learning_rate": 2.9225000000000002e-05, + "loss": 0.2732, + "step": 1170 + }, + { + "epoch": 0.05245143796950705, + "grad_norm": 0.19888530671596527, + "learning_rate": 2.9475e-05, + "loss": 0.2759, + "step": 1180 + }, + { + "epoch": 0.052895941681113036, + "grad_norm": 0.2230934202671051, + "learning_rate": 2.9725000000000004e-05, + "loss": 0.2741, + "step": 1190 + }, + { + "epoch": 0.05334044539271903, + "grad_norm": 0.17264658212661743, + "learning_rate": 2.9975000000000004e-05, + "loss": 0.2743, + "step": 1200 + }, + { + "epoch": 0.05378494910432502, + "grad_norm": 0.2053840607404709, + "learning_rate": 3.0225000000000003e-05, + "loss": 0.2704, + "step": 1210 + }, + { + "epoch": 0.054229452815931015, + "grad_norm": 0.20070472359657288, + "learning_rate": 3.0475000000000002e-05, + "loss": 0.2748, + "step": 1220 + }, + { + "epoch": 0.054673956527537, + "grad_norm": 0.21044769883155823, + "learning_rate": 3.0725e-05, + "loss": 0.2758, + "step": 1230 + }, + { + "epoch": 0.055118460239143, + "grad_norm": 0.23589909076690674, + "learning_rate": 3.0975e-05, + "loss": 0.2739, + "step": 1240 + }, + { + "epoch": 0.055562963950748986, + "grad_norm": 0.22947776317596436, + "learning_rate": 3.122500000000001e-05, + "loss": 0.2776, + "step": 1250 + }, + { + "epoch": 0.05600746766235498, + "grad_norm": 0.22065019607543945, + "learning_rate": 3.1475e-05, + "loss": 0.2713, + "step": 1260 + }, + { + "epoch": 0.05645197137396097, + "grad_norm": 0.233997642993927, + "learning_rate": 3.1725e-05, + "loss": 0.275, + "step": 1270 + }, + { + "epoch": 0.056896475085566965, + "grad_norm": 0.23386363685131073, + "learning_rate": 3.1975e-05, + "loss": 0.2753, + "step": 1280 + }, + { + "epoch": 0.05734097879717295, + "grad_norm": 0.23811566829681396, + "learning_rate": 3.2225e-05, + "loss": 0.2769, + "step": 1290 + }, + { + "epoch": 0.05778548250877895, + "grad_norm": 0.21429018676280975, + "learning_rate": 3.2474999999999997e-05, + "loss": 0.273, + "step": 1300 + }, + { + "epoch": 0.05822998622038494, + "grad_norm": 0.23074990510940552, + "learning_rate": 3.2725e-05, + "loss": 0.2727, + "step": 1310 + }, + { + "epoch": 0.05867448993199093, + "grad_norm": 0.20997484028339386, + "learning_rate": 3.2975e-05, + "loss": 0.2727, + "step": 1320 + }, + { + "epoch": 0.05911899364359693, + "grad_norm": 0.27365031838417053, + "learning_rate": 3.3225e-05, + "loss": 0.2734, + "step": 1330 + }, + { + "epoch": 0.059563497355202916, + "grad_norm": 0.24065148830413818, + "learning_rate": 3.3475e-05, + "loss": 0.2765, + "step": 1340 + }, + { + "epoch": 0.06000800106680891, + "grad_norm": 0.21822811663150787, + "learning_rate": 3.3725e-05, + "loss": 0.2736, + "step": 1350 + }, + { + "epoch": 0.0604525047784149, + "grad_norm": 0.20479148626327515, + "learning_rate": 3.3975e-05, + "loss": 0.2723, + "step": 1360 + }, + { + "epoch": 0.060897008490020894, + "grad_norm": 0.20557302236557007, + "learning_rate": 3.4225e-05, + "loss": 0.2728, + "step": 1370 + }, + { + "epoch": 0.06134151220162688, + "grad_norm": 0.2158011645078659, + "learning_rate": 3.4475000000000005e-05, + "loss": 0.2754, + "step": 1380 + }, + { + "epoch": 0.06178601591323288, + "grad_norm": 0.1896408498287201, + "learning_rate": 3.4725000000000004e-05, + "loss": 0.274, + "step": 1390 + }, + { + "epoch": 0.062230519624838866, + "grad_norm": 0.18373218178749084, + "learning_rate": 3.4975e-05, + "loss": 0.2732, + "step": 1400 + }, + { + "epoch": 0.06267502333644485, + "grad_norm": 0.19982001185417175, + "learning_rate": 3.5225e-05, + "loss": 0.274, + "step": 1410 + }, + { + "epoch": 0.06311952704805085, + "grad_norm": 0.20756012201309204, + "learning_rate": 3.5475e-05, + "loss": 0.2761, + "step": 1420 + }, + { + "epoch": 0.06356403075965684, + "grad_norm": 0.24354207515716553, + "learning_rate": 3.5725e-05, + "loss": 0.2735, + "step": 1430 + }, + { + "epoch": 0.06400853447126284, + "grad_norm": 0.1933511197566986, + "learning_rate": 3.5975e-05, + "loss": 0.2751, + "step": 1440 + }, + { + "epoch": 0.06445303818286882, + "grad_norm": 0.21637563407421112, + "learning_rate": 3.6225000000000006e-05, + "loss": 0.2755, + "step": 1450 + }, + { + "epoch": 0.06489754189447482, + "grad_norm": 0.21534334123134613, + "learning_rate": 3.6475000000000006e-05, + "loss": 0.2733, + "step": 1460 + }, + { + "epoch": 0.06534204560608081, + "grad_norm": 0.2245812565088272, + "learning_rate": 3.6725000000000005e-05, + "loss": 0.2721, + "step": 1470 + }, + { + "epoch": 0.06578654931768681, + "grad_norm": 0.1968315839767456, + "learning_rate": 3.6975000000000004e-05, + "loss": 0.2768, + "step": 1480 + }, + { + "epoch": 0.06623105302929279, + "grad_norm": 0.2430274486541748, + "learning_rate": 3.7225000000000004e-05, + "loss": 0.2742, + "step": 1490 + }, + { + "epoch": 0.06667555674089878, + "grad_norm": 0.23078420758247375, + "learning_rate": 3.7475e-05, + "loss": 0.2796, + "step": 1500 + }, + { + "epoch": 0.06712006045250478, + "grad_norm": 0.22983036935329437, + "learning_rate": 3.7725e-05, + "loss": 0.2759, + "step": 1510 + }, + { + "epoch": 0.06756456416411077, + "grad_norm": 0.23755060136318207, + "learning_rate": 3.7975e-05, + "loss": 0.2778, + "step": 1520 + }, + { + "epoch": 0.06800906787571676, + "grad_norm": 0.25988948345184326, + "learning_rate": 3.8225e-05, + "loss": 0.2743, + "step": 1530 + }, + { + "epoch": 0.06845357158732275, + "grad_norm": 0.2143726348876953, + "learning_rate": 3.8475e-05, + "loss": 0.2758, + "step": 1540 + }, + { + "epoch": 0.06889807529892875, + "grad_norm": 0.22911296784877777, + "learning_rate": 3.8725e-05, + "loss": 0.2772, + "step": 1550 + }, + { + "epoch": 0.06934257901053474, + "grad_norm": 0.18892864882946014, + "learning_rate": 3.8975e-05, + "loss": 0.2751, + "step": 1560 + }, + { + "epoch": 0.06978708272214074, + "grad_norm": 0.28160107135772705, + "learning_rate": 3.9225e-05, + "loss": 0.2765, + "step": 1570 + }, + { + "epoch": 0.07023158643374672, + "grad_norm": 0.2525385320186615, + "learning_rate": 3.9475000000000004e-05, + "loss": 0.2756, + "step": 1580 + }, + { + "epoch": 0.07067609014535271, + "grad_norm": 0.21565088629722595, + "learning_rate": 3.9725e-05, + "loss": 0.276, + "step": 1590 + }, + { + "epoch": 0.07112059385695871, + "grad_norm": 0.26411736011505127, + "learning_rate": 3.9975e-05, + "loss": 0.275, + "step": 1600 + }, + { + "epoch": 0.0715650975685647, + "grad_norm": 0.21731440722942352, + "learning_rate": 4.0225e-05, + "loss": 0.2757, + "step": 1610 + }, + { + "epoch": 0.07200960128017068, + "grad_norm": 0.221094012260437, + "learning_rate": 4.0475e-05, + "loss": 0.2752, + "step": 1620 + }, + { + "epoch": 0.07245410499177668, + "grad_norm": 0.24395069479942322, + "learning_rate": 4.0725e-05, + "loss": 0.2744, + "step": 1630 + }, + { + "epoch": 0.07289860870338268, + "grad_norm": 0.2460785061120987, + "learning_rate": 4.0975e-05, + "loss": 0.2762, + "step": 1640 + }, + { + "epoch": 0.07334311241498867, + "grad_norm": 0.25961068272590637, + "learning_rate": 4.1225e-05, + "loss": 0.2779, + "step": 1650 + }, + { + "epoch": 0.07378761612659465, + "grad_norm": 0.23807944357395172, + "learning_rate": 4.1475000000000005e-05, + "loss": 0.2779, + "step": 1660 + }, + { + "epoch": 0.07423211983820065, + "grad_norm": 0.25995519757270813, + "learning_rate": 4.1725000000000005e-05, + "loss": 0.277, + "step": 1670 + }, + { + "epoch": 0.07467662354980664, + "grad_norm": 0.2156476378440857, + "learning_rate": 4.1975000000000004e-05, + "loss": 0.2779, + "step": 1680 + }, + { + "epoch": 0.07512112726141264, + "grad_norm": 0.22322994470596313, + "learning_rate": 4.2225e-05, + "loss": 0.2781, + "step": 1690 + }, + { + "epoch": 0.07556563097301862, + "grad_norm": 0.22553017735481262, + "learning_rate": 4.2475e-05, + "loss": 0.2763, + "step": 1700 + }, + { + "epoch": 0.07601013468462461, + "grad_norm": 0.20847512781620026, + "learning_rate": 4.2725e-05, + "loss": 0.2766, + "step": 1710 + }, + { + "epoch": 0.07645463839623061, + "grad_norm": 0.19292756915092468, + "learning_rate": 4.2975e-05, + "loss": 0.2779, + "step": 1720 + }, + { + "epoch": 0.0768991421078366, + "grad_norm": 0.2445269376039505, + "learning_rate": 4.322500000000001e-05, + "loss": 0.2782, + "step": 1730 + }, + { + "epoch": 0.07734364581944259, + "grad_norm": 0.23703569173812866, + "learning_rate": 4.3475000000000006e-05, + "loss": 0.2742, + "step": 1740 + }, + { + "epoch": 0.07778814953104858, + "grad_norm": 0.21651557087898254, + "learning_rate": 4.3725000000000006e-05, + "loss": 0.2781, + "step": 1750 + }, + { + "epoch": 0.07823265324265458, + "grad_norm": 0.2741398513317108, + "learning_rate": 4.3975e-05, + "loss": 0.279, + "step": 1760 + }, + { + "epoch": 0.07867715695426057, + "grad_norm": 0.2605728805065155, + "learning_rate": 4.4225e-05, + "loss": 0.2765, + "step": 1770 + }, + { + "epoch": 0.07912166066586657, + "grad_norm": 0.24853983521461487, + "learning_rate": 4.4475e-05, + "loss": 0.2757, + "step": 1780 + }, + { + "epoch": 0.07956616437747255, + "grad_norm": 0.2502795457839966, + "learning_rate": 4.4725e-05, + "loss": 0.2774, + "step": 1790 + }, + { + "epoch": 0.08001066808907854, + "grad_norm": 0.22066520154476166, + "learning_rate": 4.4975e-05, + "loss": 0.2754, + "step": 1800 + }, + { + "epoch": 0.08045517180068454, + "grad_norm": 0.21026866137981415, + "learning_rate": 4.5225e-05, + "loss": 0.2786, + "step": 1810 + }, + { + "epoch": 0.08089967551229053, + "grad_norm": 0.21695229411125183, + "learning_rate": 4.5475e-05, + "loss": 0.2765, + "step": 1820 + }, + { + "epoch": 0.08134417922389652, + "grad_norm": 0.2423473298549652, + "learning_rate": 4.5725e-05, + "loss": 0.2796, + "step": 1830 + }, + { + "epoch": 0.08178868293550251, + "grad_norm": 0.24135592579841614, + "learning_rate": 4.5975e-05, + "loss": 0.2767, + "step": 1840 + }, + { + "epoch": 0.0822331866471085, + "grad_norm": 0.32516834139823914, + "learning_rate": 4.6225e-05, + "loss": 0.281, + "step": 1850 + }, + { + "epoch": 0.0826776903587145, + "grad_norm": 0.20808929204940796, + "learning_rate": 4.6475000000000005e-05, + "loss": 0.2796, + "step": 1860 + }, + { + "epoch": 0.08312219407032048, + "grad_norm": 0.24240165948867798, + "learning_rate": 4.6725000000000004e-05, + "loss": 0.279, + "step": 1870 + }, + { + "epoch": 0.08356669778192648, + "grad_norm": 0.22025592625141144, + "learning_rate": 4.6975000000000003e-05, + "loss": 0.2769, + "step": 1880 + }, + { + "epoch": 0.08401120149353247, + "grad_norm": 0.22489838302135468, + "learning_rate": 4.7225e-05, + "loss": 0.276, + "step": 1890 + }, + { + "epoch": 0.08445570520513847, + "grad_norm": 0.21870987117290497, + "learning_rate": 4.7475e-05, + "loss": 0.2766, + "step": 1900 + }, + { + "epoch": 0.08490020891674445, + "grad_norm": 0.22224681079387665, + "learning_rate": 4.7725e-05, + "loss": 0.2784, + "step": 1910 + }, + { + "epoch": 0.08534471262835044, + "grad_norm": 0.2244553416967392, + "learning_rate": 4.7975e-05, + "loss": 0.2784, + "step": 1920 + }, + { + "epoch": 0.08578921633995644, + "grad_norm": 0.21315747499465942, + "learning_rate": 4.822500000000001e-05, + "loss": 0.278, + "step": 1930 + }, + { + "epoch": 0.08623372005156243, + "grad_norm": 0.20681248605251312, + "learning_rate": 4.8475000000000006e-05, + "loss": 0.2784, + "step": 1940 + }, + { + "epoch": 0.08667822376316842, + "grad_norm": 0.20701389014720917, + "learning_rate": 4.8725000000000005e-05, + "loss": 0.2779, + "step": 1950 + }, + { + "epoch": 0.08712272747477441, + "grad_norm": 0.20868739485740662, + "learning_rate": 4.8975000000000005e-05, + "loss": 0.2743, + "step": 1960 + }, + { + "epoch": 0.0875672311863804, + "grad_norm": 0.2382962852716446, + "learning_rate": 4.9225000000000004e-05, + "loss": 0.2775, + "step": 1970 + }, + { + "epoch": 0.0880117348979864, + "grad_norm": 0.23192723095417023, + "learning_rate": 4.9475e-05, + "loss": 0.2775, + "step": 1980 + }, + { + "epoch": 0.0884562386095924, + "grad_norm": 0.26497533917427063, + "learning_rate": 4.9725e-05, + "loss": 0.2784, + "step": 1990 + }, + { + "epoch": 0.08890074232119838, + "grad_norm": 0.22865675389766693, + "learning_rate": 4.9975e-05, + "loss": 0.2766, + "step": 2000 + }, + { + "epoch": 0.08934524603280437, + "grad_norm": 0.21799954771995544, + "learning_rate": 5.0225e-05, + "loss": 0.276, + "step": 2010 + }, + { + "epoch": 0.08978974974441037, + "grad_norm": 0.22859756648540497, + "learning_rate": 5.047500000000001e-05, + "loss": 0.2794, + "step": 2020 + }, + { + "epoch": 0.09023425345601636, + "grad_norm": 0.2109600305557251, + "learning_rate": 5.0725e-05, + "loss": 0.2783, + "step": 2030 + }, + { + "epoch": 0.09067875716762235, + "grad_norm": 0.2084050178527832, + "learning_rate": 5.0975000000000006e-05, + "loss": 0.28, + "step": 2040 + }, + { + "epoch": 0.09112326087922834, + "grad_norm": 0.20979563891887665, + "learning_rate": 5.1225e-05, + "loss": 0.2791, + "step": 2050 + }, + { + "epoch": 0.09156776459083434, + "grad_norm": 0.22055427730083466, + "learning_rate": 5.1475000000000004e-05, + "loss": 0.2771, + "step": 2060 + }, + { + "epoch": 0.09201226830244033, + "grad_norm": 0.2374027669429779, + "learning_rate": 5.1725000000000004e-05, + "loss": 0.2799, + "step": 2070 + }, + { + "epoch": 0.09245677201404631, + "grad_norm": 0.2503259479999542, + "learning_rate": 5.197500000000001e-05, + "loss": 0.2742, + "step": 2080 + }, + { + "epoch": 0.09290127572565231, + "grad_norm": 0.25145775079727173, + "learning_rate": 5.2225e-05, + "loss": 0.2784, + "step": 2090 + }, + { + "epoch": 0.0933457794372583, + "grad_norm": 0.21555325388908386, + "learning_rate": 5.247500000000001e-05, + "loss": 0.2777, + "step": 2100 + }, + { + "epoch": 0.0937902831488643, + "grad_norm": 0.21602387726306915, + "learning_rate": 5.2725e-05, + "loss": 0.2796, + "step": 2110 + }, + { + "epoch": 0.09423478686047028, + "grad_norm": 0.24940906465053558, + "learning_rate": 5.297500000000001e-05, + "loss": 0.2791, + "step": 2120 + }, + { + "epoch": 0.09467929057207627, + "grad_norm": 0.2905471920967102, + "learning_rate": 5.3225e-05, + "loss": 0.2801, + "step": 2130 + }, + { + "epoch": 0.09512379428368227, + "grad_norm": 0.2230394184589386, + "learning_rate": 5.3475e-05, + "loss": 0.2777, + "step": 2140 + }, + { + "epoch": 0.09556829799528826, + "grad_norm": 0.21739816665649414, + "learning_rate": 5.3725000000000005e-05, + "loss": 0.2776, + "step": 2150 + }, + { + "epoch": 0.09601280170689425, + "grad_norm": 0.22526273131370544, + "learning_rate": 5.3975e-05, + "loss": 0.2821, + "step": 2160 + }, + { + "epoch": 0.09645730541850024, + "grad_norm": 0.19952887296676636, + "learning_rate": 5.4225000000000003e-05, + "loss": 0.2793, + "step": 2170 + }, + { + "epoch": 0.09690180913010624, + "grad_norm": 0.25534775853157043, + "learning_rate": 5.4474999999999996e-05, + "loss": 0.282, + "step": 2180 + }, + { + "epoch": 0.09734631284171223, + "grad_norm": 0.22061723470687866, + "learning_rate": 5.4725e-05, + "loss": 0.2791, + "step": 2190 + }, + { + "epoch": 0.09779081655331821, + "grad_norm": 0.20050807297229767, + "learning_rate": 5.4975e-05, + "loss": 0.2792, + "step": 2200 + }, + { + "epoch": 0.09823532026492421, + "grad_norm": 0.21830759942531586, + "learning_rate": 5.522500000000001e-05, + "loss": 0.2819, + "step": 2210 + }, + { + "epoch": 0.0986798239765302, + "grad_norm": 0.2281774878501892, + "learning_rate": 5.5475e-05, + "loss": 0.2802, + "step": 2220 + }, + { + "epoch": 0.0991243276881362, + "grad_norm": 0.2288549244403839, + "learning_rate": 5.5725000000000006e-05, + "loss": 0.2824, + "step": 2230 + }, + { + "epoch": 0.0995688313997422, + "grad_norm": 0.22301417589187622, + "learning_rate": 5.5975e-05, + "loss": 0.2832, + "step": 2240 + }, + { + "epoch": 0.10001333511134818, + "grad_norm": 0.23531626164913177, + "learning_rate": 5.6225000000000005e-05, + "loss": 0.2822, + "step": 2250 + }, + { + "epoch": 0.10045783882295417, + "grad_norm": 0.23505376279354095, + "learning_rate": 5.6475e-05, + "loss": 0.279, + "step": 2260 + }, + { + "epoch": 0.10090234253456017, + "grad_norm": 0.2646108567714691, + "learning_rate": 5.6725e-05, + "loss": 0.2828, + "step": 2270 + }, + { + "epoch": 0.10134684624616616, + "grad_norm": 0.2712406814098358, + "learning_rate": 5.6975e-05, + "loss": 0.2824, + "step": 2280 + }, + { + "epoch": 0.10179134995777214, + "grad_norm": 0.26156213879585266, + "learning_rate": 5.722500000000001e-05, + "loss": 0.2811, + "step": 2290 + }, + { + "epoch": 0.10223585366937814, + "grad_norm": 0.23424793779850006, + "learning_rate": 5.7475e-05, + "loss": 0.2793, + "step": 2300 + }, + { + "epoch": 0.10268035738098413, + "grad_norm": 0.2662680745124817, + "learning_rate": 5.772500000000001e-05, + "loss": 0.2843, + "step": 2310 + }, + { + "epoch": 0.10312486109259013, + "grad_norm": 0.21334044635295868, + "learning_rate": 5.7975e-05, + "loss": 0.2786, + "step": 2320 + }, + { + "epoch": 0.10356936480419611, + "grad_norm": 0.21026477217674255, + "learning_rate": 5.8225000000000006e-05, + "loss": 0.2813, + "step": 2330 + }, + { + "epoch": 0.1040138685158021, + "grad_norm": 0.21464654803276062, + "learning_rate": 5.8475000000000005e-05, + "loss": 0.2805, + "step": 2340 + }, + { + "epoch": 0.1044583722274081, + "grad_norm": 0.24839900434017181, + "learning_rate": 5.8725000000000004e-05, + "loss": 0.2814, + "step": 2350 + }, + { + "epoch": 0.1049028759390141, + "grad_norm": 0.2237076312303543, + "learning_rate": 5.8975000000000004e-05, + "loss": 0.2808, + "step": 2360 + }, + { + "epoch": 0.10534737965062008, + "grad_norm": 0.2467891126871109, + "learning_rate": 5.922500000000001e-05, + "loss": 0.2812, + "step": 2370 + }, + { + "epoch": 0.10579188336222607, + "grad_norm": 0.21759827435016632, + "learning_rate": 5.9475e-05, + "loss": 0.2785, + "step": 2380 + }, + { + "epoch": 0.10623638707383207, + "grad_norm": 0.6132181882858276, + "learning_rate": 5.9724999999999995e-05, + "loss": 0.282, + "step": 2390 + }, + { + "epoch": 0.10668089078543806, + "grad_norm": 0.26480361819267273, + "learning_rate": 5.9975e-05, + "loss": 0.2819, + "step": 2400 + }, + { + "epoch": 0.10712539449704404, + "grad_norm": 0.24238169193267822, + "learning_rate": 6.0225e-05, + "loss": 0.2851, + "step": 2410 + }, + { + "epoch": 0.10756989820865004, + "grad_norm": 0.24328593909740448, + "learning_rate": 6.0475000000000006e-05, + "loss": 0.2798, + "step": 2420 + }, + { + "epoch": 0.10801440192025603, + "grad_norm": 0.23081164062023163, + "learning_rate": 6.0725e-05, + "loss": 0.2793, + "step": 2430 + }, + { + "epoch": 0.10845890563186203, + "grad_norm": 0.27270567417144775, + "learning_rate": 6.0975000000000005e-05, + "loss": 0.2803, + "step": 2440 + }, + { + "epoch": 0.10890340934346802, + "grad_norm": 0.2221400886774063, + "learning_rate": 6.1225e-05, + "loss": 0.2835, + "step": 2450 + }, + { + "epoch": 0.109347913055074, + "grad_norm": 0.2358832061290741, + "learning_rate": 6.1475e-05, + "loss": 0.2829, + "step": 2460 + }, + { + "epoch": 0.10979241676668, + "grad_norm": 0.33493247628211975, + "learning_rate": 6.1725e-05, + "loss": 0.2808, + "step": 2470 + }, + { + "epoch": 0.110236920478286, + "grad_norm": 0.24156461656093597, + "learning_rate": 6.1975e-05, + "loss": 0.2823, + "step": 2480 + }, + { + "epoch": 0.11068142418989199, + "grad_norm": 0.22118592262268066, + "learning_rate": 6.2225e-05, + "loss": 0.2837, + "step": 2490 + }, + { + "epoch": 0.11112592790149797, + "grad_norm": 0.21149942278862, + "learning_rate": 6.2475e-05, + "loss": 0.282, + "step": 2500 + }, + { + "epoch": 0.11157043161310397, + "grad_norm": 0.215234637260437, + "learning_rate": 6.2725e-05, + "loss": 0.2839, + "step": 2510 + }, + { + "epoch": 0.11201493532470996, + "grad_norm": 0.21295280754566193, + "learning_rate": 6.297500000000001e-05, + "loss": 0.2819, + "step": 2520 + }, + { + "epoch": 0.11245943903631596, + "grad_norm": 0.232211172580719, + "learning_rate": 6.3225e-05, + "loss": 0.2846, + "step": 2530 + }, + { + "epoch": 0.11290394274792194, + "grad_norm": 0.24456021189689636, + "learning_rate": 6.347500000000001e-05, + "loss": 0.2839, + "step": 2540 + }, + { + "epoch": 0.11334844645952794, + "grad_norm": 0.24984890222549438, + "learning_rate": 6.3725e-05, + "loss": 0.2848, + "step": 2550 + }, + { + "epoch": 0.11379295017113393, + "grad_norm": 0.23015932738780975, + "learning_rate": 6.397500000000001e-05, + "loss": 0.2831, + "step": 2560 + }, + { + "epoch": 0.11423745388273993, + "grad_norm": 0.24100635945796967, + "learning_rate": 6.4225e-05, + "loss": 0.2816, + "step": 2570 + }, + { + "epoch": 0.1146819575943459, + "grad_norm": 0.2252528965473175, + "learning_rate": 6.447500000000001e-05, + "loss": 0.2811, + "step": 2580 + }, + { + "epoch": 0.1151264613059519, + "grad_norm": 0.23663966357707977, + "learning_rate": 6.4725e-05, + "loss": 0.2827, + "step": 2590 + }, + { + "epoch": 0.1155709650175579, + "grad_norm": 0.19790388643741608, + "learning_rate": 6.497500000000001e-05, + "loss": 0.2818, + "step": 2600 + }, + { + "epoch": 0.11601546872916389, + "grad_norm": 0.8143248558044434, + "learning_rate": 6.5225e-05, + "loss": 0.282, + "step": 2610 + }, + { + "epoch": 0.11645997244076987, + "grad_norm": 0.2837538421154022, + "learning_rate": 6.5475e-05, + "loss": 0.2829, + "step": 2620 + }, + { + "epoch": 0.11690447615237587, + "grad_norm": 0.22466331720352173, + "learning_rate": 6.5725e-05, + "loss": 0.2852, + "step": 2630 + }, + { + "epoch": 0.11734897986398186, + "grad_norm": 0.2637028992176056, + "learning_rate": 6.5975e-05, + "loss": 0.2889, + "step": 2640 + }, + { + "epoch": 0.11779348357558786, + "grad_norm": 0.2827359139919281, + "learning_rate": 6.6225e-05, + "loss": 0.2822, + "step": 2650 + }, + { + "epoch": 0.11823798728719385, + "grad_norm": 0.26172927021980286, + "learning_rate": 6.6475e-05, + "loss": 0.289, + "step": 2660 + }, + { + "epoch": 0.11868249099879984, + "grad_norm": 0.23070627450942993, + "learning_rate": 6.672500000000001e-05, + "loss": 0.2859, + "step": 2670 + }, + { + "epoch": 0.11912699471040583, + "grad_norm": 0.25939762592315674, + "learning_rate": 6.6975e-05, + "loss": 0.2897, + "step": 2680 + }, + { + "epoch": 0.11957149842201183, + "grad_norm": 0.2731914818286896, + "learning_rate": 6.722500000000001e-05, + "loss": 0.2827, + "step": 2690 + }, + { + "epoch": 0.12001600213361782, + "grad_norm": 0.2720582187175751, + "learning_rate": 6.7475e-05, + "loss": 0.2837, + "step": 2700 + }, + { + "epoch": 0.1204605058452238, + "grad_norm": 0.2182457149028778, + "learning_rate": 6.7725e-05, + "loss": 0.2857, + "step": 2710 + }, + { + "epoch": 0.1209050095568298, + "grad_norm": 0.23477570712566376, + "learning_rate": 6.7975e-05, + "loss": 0.2829, + "step": 2720 + }, + { + "epoch": 0.1213495132684358, + "grad_norm": 0.2302234172821045, + "learning_rate": 6.8225e-05, + "loss": 0.2847, + "step": 2730 + }, + { + "epoch": 0.12179401698004179, + "grad_norm": 0.2394218146800995, + "learning_rate": 6.8475e-05, + "loss": 0.2824, + "step": 2740 + }, + { + "epoch": 0.12223852069164777, + "grad_norm": 0.20100021362304688, + "learning_rate": 6.8725e-05, + "loss": 0.2816, + "step": 2750 + }, + { + "epoch": 0.12268302440325377, + "grad_norm": 0.2358885258436203, + "learning_rate": 6.8975e-05, + "loss": 0.2826, + "step": 2760 + }, + { + "epoch": 0.12312752811485976, + "grad_norm": 0.2367487996816635, + "learning_rate": 6.9225e-05, + "loss": 0.2817, + "step": 2770 + }, + { + "epoch": 0.12357203182646576, + "grad_norm": 0.252703458070755, + "learning_rate": 6.9475e-05, + "loss": 0.2822, + "step": 2780 + }, + { + "epoch": 0.12401653553807174, + "grad_norm": 0.2816811203956604, + "learning_rate": 6.9725e-05, + "loss": 0.283, + "step": 2790 + }, + { + "epoch": 0.12446103924967773, + "grad_norm": 0.25816836953163147, + "learning_rate": 6.997500000000001e-05, + "loss": 0.2822, + "step": 2800 + }, + { + "epoch": 0.12490554296128373, + "grad_norm": 0.27946797013282776, + "learning_rate": 7.022500000000001e-05, + "loss": 0.2813, + "step": 2810 + }, + { + "epoch": 0.1253500466728897, + "grad_norm": 0.2459253966808319, + "learning_rate": 7.0475e-05, + "loss": 0.2828, + "step": 2820 + }, + { + "epoch": 0.12579455038449572, + "grad_norm": 0.2445862740278244, + "learning_rate": 7.072500000000001e-05, + "loss": 0.281, + "step": 2830 + }, + { + "epoch": 0.1262390540961017, + "grad_norm": 0.24043415486812592, + "learning_rate": 7.0975e-05, + "loss": 0.2829, + "step": 2840 + }, + { + "epoch": 0.12668355780770768, + "grad_norm": 0.2706281840801239, + "learning_rate": 7.122500000000001e-05, + "loss": 0.2842, + "step": 2850 + }, + { + "epoch": 0.1271280615193137, + "grad_norm": 0.21347899734973907, + "learning_rate": 7.1475e-05, + "loss": 0.2844, + "step": 2860 + }, + { + "epoch": 0.12757256523091967, + "grad_norm": 0.2513664662837982, + "learning_rate": 7.172500000000001e-05, + "loss": 0.2819, + "step": 2870 + }, + { + "epoch": 0.12801706894252568, + "grad_norm": 0.491122305393219, + "learning_rate": 7.1975e-05, + "loss": 0.2832, + "step": 2880 + }, + { + "epoch": 0.12846157265413166, + "grad_norm": 0.2824999690055847, + "learning_rate": 7.2225e-05, + "loss": 0.2822, + "step": 2890 + }, + { + "epoch": 0.12890607636573764, + "grad_norm": 0.2610629200935364, + "learning_rate": 7.2475e-05, + "loss": 0.2875, + "step": 2900 + }, + { + "epoch": 0.12935058007734365, + "grad_norm": 0.2656794488430023, + "learning_rate": 7.272499999999999e-05, + "loss": 0.2874, + "step": 2910 + }, + { + "epoch": 0.12979508378894963, + "grad_norm": 0.23427121341228485, + "learning_rate": 7.2975e-05, + "loss": 0.2851, + "step": 2920 + }, + { + "epoch": 0.13023958750055564, + "grad_norm": 0.20821060240268707, + "learning_rate": 7.3225e-05, + "loss": 0.2859, + "step": 2930 + }, + { + "epoch": 0.13068409121216162, + "grad_norm": 0.24434420466423035, + "learning_rate": 7.347500000000001e-05, + "loss": 0.2837, + "step": 2940 + }, + { + "epoch": 0.1311285949237676, + "grad_norm": 0.24364469945430756, + "learning_rate": 7.3725e-05, + "loss": 0.2843, + "step": 2950 + }, + { + "epoch": 0.13157309863537361, + "grad_norm": 0.2495317906141281, + "learning_rate": 7.397500000000001e-05, + "loss": 0.2888, + "step": 2960 + }, + { + "epoch": 0.1320176023469796, + "grad_norm": 0.24542462825775146, + "learning_rate": 7.4225e-05, + "loss": 0.2854, + "step": 2970 + }, + { + "epoch": 0.13246210605858558, + "grad_norm": 0.2649517357349396, + "learning_rate": 7.447500000000001e-05, + "loss": 0.2862, + "step": 2980 + }, + { + "epoch": 0.13290660977019159, + "grad_norm": 0.2734636664390564, + "learning_rate": 7.4725e-05, + "loss": 0.2847, + "step": 2990 + }, + { + "epoch": 0.13335111348179757, + "grad_norm": 0.23679780960083008, + "learning_rate": 7.4975e-05, + "loss": 0.2868, + "step": 3000 + }, + { + "epoch": 0.13379561719340358, + "grad_norm": 0.3124074935913086, + "learning_rate": 7.5225e-05, + "loss": 0.2855, + "step": 3010 + }, + { + "epoch": 0.13424012090500956, + "grad_norm": 0.24486179649829865, + "learning_rate": 7.5475e-05, + "loss": 0.2908, + "step": 3020 + }, + { + "epoch": 0.13468462461661554, + "grad_norm": 0.2233928143978119, + "learning_rate": 7.5725e-05, + "loss": 0.2878, + "step": 3030 + }, + { + "epoch": 0.13512912832822155, + "grad_norm": 0.2136278599500656, + "learning_rate": 7.5975e-05, + "loss": 0.2857, + "step": 3040 + }, + { + "epoch": 0.13557363203982753, + "grad_norm": 0.2594446837902069, + "learning_rate": 7.6225e-05, + "loss": 0.2861, + "step": 3050 + }, + { + "epoch": 0.1360181357514335, + "grad_norm": 0.2675434350967407, + "learning_rate": 7.6475e-05, + "loss": 0.287, + "step": 3060 + }, + { + "epoch": 0.13646263946303952, + "grad_norm": 0.2627080976963043, + "learning_rate": 7.672500000000001e-05, + "loss": 0.2895, + "step": 3070 + }, + { + "epoch": 0.1369071431746455, + "grad_norm": 0.27718496322631836, + "learning_rate": 7.697500000000001e-05, + "loss": 0.2886, + "step": 3080 + }, + { + "epoch": 0.1373516468862515, + "grad_norm": 0.22442762553691864, + "learning_rate": 7.722500000000001e-05, + "loss": 0.291, + "step": 3090 + }, + { + "epoch": 0.1377961505978575, + "grad_norm": 0.23623594641685486, + "learning_rate": 7.747500000000001e-05, + "loss": 0.2906, + "step": 3100 + }, + { + "epoch": 0.13824065430946347, + "grad_norm": 0.31437963247299194, + "learning_rate": 7.7725e-05, + "loss": 0.2876, + "step": 3110 + }, + { + "epoch": 0.13868515802106948, + "grad_norm": 0.28623542189598083, + "learning_rate": 7.797500000000001e-05, + "loss": 0.2885, + "step": 3120 + }, + { + "epoch": 0.13912966173267546, + "grad_norm": 0.26494812965393066, + "learning_rate": 7.8225e-05, + "loss": 0.2883, + "step": 3130 + }, + { + "epoch": 0.13957416544428147, + "grad_norm": 0.2524864971637726, + "learning_rate": 7.8475e-05, + "loss": 0.2864, + "step": 3140 + }, + { + "epoch": 0.14001866915588745, + "grad_norm": 0.29268351197242737, + "learning_rate": 7.8725e-05, + "loss": 0.2908, + "step": 3150 + }, + { + "epoch": 0.14046317286749344, + "grad_norm": 0.27678024768829346, + "learning_rate": 7.8975e-05, + "loss": 0.2852, + "step": 3160 + }, + { + "epoch": 0.14090767657909944, + "grad_norm": 0.25320127606391907, + "learning_rate": 7.9225e-05, + "loss": 0.2883, + "step": 3170 + }, + { + "epoch": 0.14135218029070543, + "grad_norm": 0.22543565928936005, + "learning_rate": 7.9475e-05, + "loss": 0.2877, + "step": 3180 + }, + { + "epoch": 0.1417966840023114, + "grad_norm": 0.2950059473514557, + "learning_rate": 7.9725e-05, + "loss": 0.2876, + "step": 3190 + }, + { + "epoch": 0.14224118771391742, + "grad_norm": 0.2856177091598511, + "learning_rate": 7.9975e-05, + "loss": 0.289, + "step": 3200 + }, + { + "epoch": 0.1426856914255234, + "grad_norm": 0.24508604407310486, + "learning_rate": 8.022500000000001e-05, + "loss": 0.2897, + "step": 3210 + }, + { + "epoch": 0.1431301951371294, + "grad_norm": 0.26644179224967957, + "learning_rate": 8.0475e-05, + "loss": 0.287, + "step": 3220 + }, + { + "epoch": 0.1435746988487354, + "grad_norm": 0.25992679595947266, + "learning_rate": 8.072500000000001e-05, + "loss": 0.2909, + "step": 3230 + }, + { + "epoch": 0.14401920256034137, + "grad_norm": 0.2313682585954666, + "learning_rate": 8.0975e-05, + "loss": 0.2865, + "step": 3240 + }, + { + "epoch": 0.14446370627194738, + "grad_norm": 0.27160003781318665, + "learning_rate": 8.122500000000001e-05, + "loss": 0.2888, + "step": 3250 + }, + { + "epoch": 0.14490820998355336, + "grad_norm": 0.2589600384235382, + "learning_rate": 8.1475e-05, + "loss": 0.2878, + "step": 3260 + }, + { + "epoch": 0.14535271369515934, + "grad_norm": 0.2312152087688446, + "learning_rate": 8.172500000000001e-05, + "loss": 0.287, + "step": 3270 + }, + { + "epoch": 0.14579721740676535, + "grad_norm": 0.24848471581935883, + "learning_rate": 8.1975e-05, + "loss": 0.2905, + "step": 3280 + }, + { + "epoch": 0.14624172111837133, + "grad_norm": 0.2515927851200104, + "learning_rate": 8.2225e-05, + "loss": 0.2871, + "step": 3290 + }, + { + "epoch": 0.14668622482997734, + "grad_norm": 0.275562584400177, + "learning_rate": 8.2475e-05, + "loss": 0.2875, + "step": 3300 + }, + { + "epoch": 0.14713072854158332, + "grad_norm": 0.22684700787067413, + "learning_rate": 8.2725e-05, + "loss": 0.2877, + "step": 3310 + }, + { + "epoch": 0.1475752322531893, + "grad_norm": 0.21145884692668915, + "learning_rate": 8.2975e-05, + "loss": 0.2899, + "step": 3320 + }, + { + "epoch": 0.1480197359647953, + "grad_norm": 0.2741777002811432, + "learning_rate": 8.3225e-05, + "loss": 0.2906, + "step": 3330 + }, + { + "epoch": 0.1484642396764013, + "grad_norm": 0.22289250791072845, + "learning_rate": 8.347500000000001e-05, + "loss": 0.2854, + "step": 3340 + }, + { + "epoch": 0.1489087433880073, + "grad_norm": 0.24108561873435974, + "learning_rate": 8.3725e-05, + "loss": 0.2903, + "step": 3350 + }, + { + "epoch": 0.14935324709961328, + "grad_norm": 0.2345762550830841, + "learning_rate": 8.397500000000001e-05, + "loss": 0.2868, + "step": 3360 + }, + { + "epoch": 0.14979775081121927, + "grad_norm": 0.26703619956970215, + "learning_rate": 8.422500000000001e-05, + "loss": 0.2923, + "step": 3370 + }, + { + "epoch": 0.15024225452282527, + "grad_norm": 0.2551576495170593, + "learning_rate": 8.447500000000001e-05, + "loss": 0.2881, + "step": 3380 + }, + { + "epoch": 0.15068675823443126, + "grad_norm": 0.26091471314430237, + "learning_rate": 8.4725e-05, + "loss": 0.2882, + "step": 3390 + }, + { + "epoch": 0.15113126194603724, + "grad_norm": 0.27166566252708435, + "learning_rate": 8.4975e-05, + "loss": 0.291, + "step": 3400 + }, + { + "epoch": 0.15157576565764325, + "grad_norm": 0.24071428179740906, + "learning_rate": 8.5225e-05, + "loss": 0.2888, + "step": 3410 + }, + { + "epoch": 0.15202026936924923, + "grad_norm": 0.276619553565979, + "learning_rate": 8.5475e-05, + "loss": 0.289, + "step": 3420 + }, + { + "epoch": 0.15246477308085524, + "grad_norm": 0.23294131457805634, + "learning_rate": 8.5725e-05, + "loss": 0.2869, + "step": 3430 + }, + { + "epoch": 0.15290927679246122, + "grad_norm": 0.2821224629878998, + "learning_rate": 8.5975e-05, + "loss": 0.2888, + "step": 3440 + }, + { + "epoch": 0.1533537805040672, + "grad_norm": 0.24131058156490326, + "learning_rate": 8.6225e-05, + "loss": 0.2871, + "step": 3450 + }, + { + "epoch": 0.1537982842156732, + "grad_norm": 0.2556333541870117, + "learning_rate": 8.6475e-05, + "loss": 0.2925, + "step": 3460 + }, + { + "epoch": 0.1542427879272792, + "grad_norm": 0.2480180561542511, + "learning_rate": 8.672500000000001e-05, + "loss": 0.288, + "step": 3470 + }, + { + "epoch": 0.15468729163888517, + "grad_norm": 0.23742569983005524, + "learning_rate": 8.6975e-05, + "loss": 0.2888, + "step": 3480 + }, + { + "epoch": 0.15513179535049118, + "grad_norm": 0.2392365038394928, + "learning_rate": 8.7225e-05, + "loss": 0.283, + "step": 3490 + }, + { + "epoch": 0.15557629906209716, + "grad_norm": 0.28150442242622375, + "learning_rate": 8.747500000000001e-05, + "loss": 0.2926, + "step": 3500 + }, + { + "epoch": 0.15602080277370317, + "grad_norm": 0.2292432188987732, + "learning_rate": 8.7725e-05, + "loss": 0.2872, + "step": 3510 + }, + { + "epoch": 0.15646530648530915, + "grad_norm": 0.24382424354553223, + "learning_rate": 8.797500000000001e-05, + "loss": 0.2898, + "step": 3520 + }, + { + "epoch": 0.15690981019691513, + "grad_norm": 0.22255957126617432, + "learning_rate": 8.8225e-05, + "loss": 0.2894, + "step": 3530 + }, + { + "epoch": 0.15735431390852114, + "grad_norm": 0.2513941824436188, + "learning_rate": 8.847500000000001e-05, + "loss": 0.2896, + "step": 3540 + }, + { + "epoch": 0.15779881762012712, + "grad_norm": 0.2544025480747223, + "learning_rate": 8.8725e-05, + "loss": 0.2894, + "step": 3550 + }, + { + "epoch": 0.15824332133173313, + "grad_norm": 0.2397886961698532, + "learning_rate": 8.897500000000001e-05, + "loss": 0.292, + "step": 3560 + }, + { + "epoch": 0.15868782504333911, + "grad_norm": 0.27642199397087097, + "learning_rate": 8.9225e-05, + "loss": 0.2895, + "step": 3570 + }, + { + "epoch": 0.1591323287549451, + "grad_norm": 0.2722671926021576, + "learning_rate": 8.9475e-05, + "loss": 0.2894, + "step": 3580 + }, + { + "epoch": 0.1595768324665511, + "grad_norm": 0.2218194603919983, + "learning_rate": 8.9725e-05, + "loss": 0.2874, + "step": 3590 + }, + { + "epoch": 0.1600213361781571, + "grad_norm": 0.2515266537666321, + "learning_rate": 8.9975e-05, + "loss": 0.2933, + "step": 3600 + }, + { + "epoch": 0.16046583988976307, + "grad_norm": 0.2120060920715332, + "learning_rate": 9.0225e-05, + "loss": 0.2899, + "step": 3610 + }, + { + "epoch": 0.16091034360136908, + "grad_norm": 0.2494654804468155, + "learning_rate": 9.0475e-05, + "loss": 0.2884, + "step": 3620 + }, + { + "epoch": 0.16135484731297506, + "grad_norm": 0.24775126576423645, + "learning_rate": 9.072500000000001e-05, + "loss": 0.2917, + "step": 3630 + }, + { + "epoch": 0.16179935102458107, + "grad_norm": 0.26205533742904663, + "learning_rate": 9.0975e-05, + "loss": 0.2876, + "step": 3640 + }, + { + "epoch": 0.16224385473618705, + "grad_norm": 0.266767293214798, + "learning_rate": 9.122500000000001e-05, + "loss": 0.2927, + "step": 3650 + }, + { + "epoch": 0.16268835844779303, + "grad_norm": 0.2321104109287262, + "learning_rate": 9.1475e-05, + "loss": 0.2877, + "step": 3660 + }, + { + "epoch": 0.16313286215939904, + "grad_norm": 0.23294489085674286, + "learning_rate": 9.172500000000001e-05, + "loss": 0.2888, + "step": 3670 + }, + { + "epoch": 0.16357736587100502, + "grad_norm": 0.27284887433052063, + "learning_rate": 9.1975e-05, + "loss": 0.2877, + "step": 3680 + }, + { + "epoch": 0.164021869582611, + "grad_norm": 0.26084384322166443, + "learning_rate": 9.2225e-05, + "loss": 0.2916, + "step": 3690 + }, + { + "epoch": 0.164466373294217, + "grad_norm": 0.2988293468952179, + "learning_rate": 9.2475e-05, + "loss": 0.2895, + "step": 3700 + }, + { + "epoch": 0.164910877005823, + "grad_norm": 0.2196889966726303, + "learning_rate": 9.2725e-05, + "loss": 0.2925, + "step": 3710 + }, + { + "epoch": 0.165355380717429, + "grad_norm": 0.2718175947666168, + "learning_rate": 9.2975e-05, + "loss": 0.2895, + "step": 3720 + }, + { + "epoch": 0.16579988442903498, + "grad_norm": 0.24285270273685455, + "learning_rate": 9.3225e-05, + "loss": 0.2912, + "step": 3730 + }, + { + "epoch": 0.16624438814064096, + "grad_norm": 0.2716880142688751, + "learning_rate": 9.3475e-05, + "loss": 0.2925, + "step": 3740 + }, + { + "epoch": 0.16668889185224697, + "grad_norm": 0.2597470283508301, + "learning_rate": 9.3725e-05, + "loss": 0.2887, + "step": 3750 + }, + { + "epoch": 0.16713339556385295, + "grad_norm": 0.20825420320034027, + "learning_rate": 9.397500000000001e-05, + "loss": 0.2902, + "step": 3760 + }, + { + "epoch": 0.16757789927545896, + "grad_norm": 0.2190062403678894, + "learning_rate": 9.422500000000001e-05, + "loss": 0.2919, + "step": 3770 + }, + { + "epoch": 0.16802240298706494, + "grad_norm": 0.2717248797416687, + "learning_rate": 9.4475e-05, + "loss": 0.2912, + "step": 3780 + }, + { + "epoch": 0.16846690669867093, + "grad_norm": 0.28763964772224426, + "learning_rate": 9.472500000000001e-05, + "loss": 0.2884, + "step": 3790 + }, + { + "epoch": 0.16891141041027694, + "grad_norm": 0.2973826229572296, + "learning_rate": 9.4975e-05, + "loss": 0.2895, + "step": 3800 + }, + { + "epoch": 0.16935591412188292, + "grad_norm": 0.2195618897676468, + "learning_rate": 9.522500000000001e-05, + "loss": 0.2928, + "step": 3810 + }, + { + "epoch": 0.1698004178334889, + "grad_norm": 0.23491276800632477, + "learning_rate": 9.5475e-05, + "loss": 0.2925, + "step": 3820 + }, + { + "epoch": 0.1702449215450949, + "grad_norm": 0.2512952983379364, + "learning_rate": 9.572500000000001e-05, + "loss": 0.2939, + "step": 3830 + }, + { + "epoch": 0.1706894252567009, + "grad_norm": 0.33727601170539856, + "learning_rate": 9.5975e-05, + "loss": 0.2939, + "step": 3840 + }, + { + "epoch": 0.1711339289683069, + "grad_norm": 0.24449588358402252, + "learning_rate": 9.622500000000001e-05, + "loss": 0.2916, + "step": 3850 + }, + { + "epoch": 0.17157843267991288, + "grad_norm": 0.29182058572769165, + "learning_rate": 9.6475e-05, + "loss": 0.2913, + "step": 3860 + }, + { + "epoch": 0.17202293639151886, + "grad_norm": 0.2313079982995987, + "learning_rate": 9.6725e-05, + "loss": 0.2958, + "step": 3870 + }, + { + "epoch": 0.17246744010312487, + "grad_norm": 0.23651671409606934, + "learning_rate": 9.6975e-05, + "loss": 0.2928, + "step": 3880 + }, + { + "epoch": 0.17291194381473085, + "grad_norm": 0.28313735127449036, + "learning_rate": 9.7225e-05, + "loss": 0.2926, + "step": 3890 + }, + { + "epoch": 0.17335644752633683, + "grad_norm": 0.2726205289363861, + "learning_rate": 9.747500000000001e-05, + "loss": 0.2925, + "step": 3900 + }, + { + "epoch": 0.17380095123794284, + "grad_norm": 0.22940541803836823, + "learning_rate": 9.7725e-05, + "loss": 0.2913, + "step": 3910 + }, + { + "epoch": 0.17424545494954882, + "grad_norm": 0.28850653767585754, + "learning_rate": 9.797500000000001e-05, + "loss": 0.292, + "step": 3920 + }, + { + "epoch": 0.17468995866115483, + "grad_norm": 0.26994752883911133, + "learning_rate": 9.8225e-05, + "loss": 0.2923, + "step": 3930 + }, + { + "epoch": 0.1751344623727608, + "grad_norm": 0.28330543637275696, + "learning_rate": 9.847500000000001e-05, + "loss": 0.288, + "step": 3940 + }, + { + "epoch": 0.1755789660843668, + "grad_norm": 0.24815787374973297, + "learning_rate": 9.8725e-05, + "loss": 0.2927, + "step": 3950 + }, + { + "epoch": 0.1760234697959728, + "grad_norm": 0.2786967158317566, + "learning_rate": 9.897500000000001e-05, + "loss": 0.2941, + "step": 3960 + }, + { + "epoch": 0.17646797350757878, + "grad_norm": 0.2803199887275696, + "learning_rate": 9.9225e-05, + "loss": 0.2929, + "step": 3970 + }, + { + "epoch": 0.1769124772191848, + "grad_norm": 0.26548582315444946, + "learning_rate": 9.9475e-05, + "loss": 0.292, + "step": 3980 + }, + { + "epoch": 0.17735698093079078, + "grad_norm": 0.2625095546245575, + "learning_rate": 9.9725e-05, + "loss": 0.2945, + "step": 3990 + }, + { + "epoch": 0.17780148464239676, + "grad_norm": 0.25194576382637024, + "learning_rate": 9.9975e-05, + "loss": 0.2933, + "step": 4000 + }, + { + "epoch": 0.17824598835400277, + "grad_norm": 0.2569403052330017, + "learning_rate": 9.999999653982884e-05, + "loss": 0.2921, + "step": 4010 + }, + { + "epoch": 0.17869049206560875, + "grad_norm": 0.2589402496814728, + "learning_rate": 9.999998457874392e-05, + "loss": 0.2894, + "step": 4020 + }, + { + "epoch": 0.17913499577721473, + "grad_norm": 0.24898768961429596, + "learning_rate": 9.999996407402913e-05, + "loss": 0.2933, + "step": 4030 + }, + { + "epoch": 0.17957949948882074, + "grad_norm": 0.23194114863872528, + "learning_rate": 9.999993502568801e-05, + "loss": 0.2932, + "step": 4040 + }, + { + "epoch": 0.18002400320042672, + "grad_norm": 0.23102961480617523, + "learning_rate": 9.999989743372548e-05, + "loss": 0.2898, + "step": 4050 + }, + { + "epoch": 0.18046850691203273, + "grad_norm": 0.2545153796672821, + "learning_rate": 9.999985129814798e-05, + "loss": 0.2973, + "step": 4060 + }, + { + "epoch": 0.1809130106236387, + "grad_norm": 0.2681058347225189, + "learning_rate": 9.99997966189634e-05, + "loss": 0.2962, + "step": 4070 + }, + { + "epoch": 0.1813575143352447, + "grad_norm": 0.2400905191898346, + "learning_rate": 9.999973339618107e-05, + "loss": 0.2964, + "step": 4080 + }, + { + "epoch": 0.1818020180468507, + "grad_norm": 0.2740291953086853, + "learning_rate": 9.999966162981179e-05, + "loss": 0.2943, + "step": 4090 + }, + { + "epoch": 0.18224652175845668, + "grad_norm": 0.2577119469642639, + "learning_rate": 9.999958131986784e-05, + "loss": 0.2936, + "step": 4100 + }, + { + "epoch": 0.18269102547006266, + "grad_norm": 0.2590191066265106, + "learning_rate": 9.999949246636293e-05, + "loss": 0.2931, + "step": 4110 + }, + { + "epoch": 0.18313552918166867, + "grad_norm": 0.2748775780200958, + "learning_rate": 9.999939506931224e-05, + "loss": 0.293, + "step": 4120 + }, + { + "epoch": 0.18358003289327465, + "grad_norm": 0.30414530634880066, + "learning_rate": 9.999928912873243e-05, + "loss": 0.2943, + "step": 4130 + }, + { + "epoch": 0.18402453660488066, + "grad_norm": 0.2544001638889313, + "learning_rate": 9.999917464464159e-05, + "loss": 0.2982, + "step": 4140 + }, + { + "epoch": 0.18446904031648664, + "grad_norm": 0.26015883684158325, + "learning_rate": 9.999905161705929e-05, + "loss": 0.2972, + "step": 4150 + }, + { + "epoch": 0.18491354402809262, + "grad_norm": 0.2274996042251587, + "learning_rate": 9.999892004600653e-05, + "loss": 0.2943, + "step": 4160 + }, + { + "epoch": 0.18535804773969863, + "grad_norm": 0.2543921172618866, + "learning_rate": 9.999877993150581e-05, + "loss": 0.2993, + "step": 4170 + }, + { + "epoch": 0.18580255145130462, + "grad_norm": 0.3177953064441681, + "learning_rate": 9.999863127358108e-05, + "loss": 0.295, + "step": 4180 + }, + { + "epoch": 0.18624705516291062, + "grad_norm": 0.2618386149406433, + "learning_rate": 9.999847407225773e-05, + "loss": 0.2951, + "step": 4190 + }, + { + "epoch": 0.1866915588745166, + "grad_norm": 0.24441750347614288, + "learning_rate": 9.999830832756262e-05, + "loss": 0.2943, + "step": 4200 + }, + { + "epoch": 0.1871360625861226, + "grad_norm": 0.2673989236354828, + "learning_rate": 9.999813403952407e-05, + "loss": 0.2927, + "step": 4210 + }, + { + "epoch": 0.1875805662977286, + "grad_norm": 0.3371955156326294, + "learning_rate": 9.999795120817187e-05, + "loss": 0.2934, + "step": 4220 + }, + { + "epoch": 0.18802507000933458, + "grad_norm": 0.2313065230846405, + "learning_rate": 9.999775983353725e-05, + "loss": 0.2947, + "step": 4230 + }, + { + "epoch": 0.18846957372094056, + "grad_norm": 0.2351427972316742, + "learning_rate": 9.999755991565292e-05, + "loss": 0.2961, + "step": 4240 + }, + { + "epoch": 0.18891407743254657, + "grad_norm": 0.2864508032798767, + "learning_rate": 9.999735145455303e-05, + "loss": 0.2938, + "step": 4250 + }, + { + "epoch": 0.18935858114415255, + "grad_norm": 0.2298264354467392, + "learning_rate": 9.99971344502732e-05, + "loss": 0.2977, + "step": 4260 + }, + { + "epoch": 0.18980308485575856, + "grad_norm": 0.2625798285007477, + "learning_rate": 9.999690890285053e-05, + "loss": 0.294, + "step": 4270 + }, + { + "epoch": 0.19024758856736454, + "grad_norm": 0.2943839132785797, + "learning_rate": 9.999667481232356e-05, + "loss": 0.2929, + "step": 4280 + }, + { + "epoch": 0.19069209227897052, + "grad_norm": 0.2379462569952011, + "learning_rate": 9.999643217873225e-05, + "loss": 0.2929, + "step": 4290 + }, + { + "epoch": 0.19113659599057653, + "grad_norm": 0.2618236243724823, + "learning_rate": 9.999618100211809e-05, + "loss": 0.2932, + "step": 4300 + }, + { + "epoch": 0.1915810997021825, + "grad_norm": 0.23761165142059326, + "learning_rate": 9.999592128252402e-05, + "loss": 0.2927, + "step": 4310 + }, + { + "epoch": 0.1920256034137885, + "grad_norm": 0.2167397141456604, + "learning_rate": 9.999565301999437e-05, + "loss": 0.2936, + "step": 4320 + }, + { + "epoch": 0.1924701071253945, + "grad_norm": 0.2835066616535187, + "learning_rate": 9.999537621457502e-05, + "loss": 0.2937, + "step": 4330 + }, + { + "epoch": 0.19291461083700048, + "grad_norm": 0.27259114384651184, + "learning_rate": 9.999509086631323e-05, + "loss": 0.2925, + "step": 4340 + }, + { + "epoch": 0.1933591145486065, + "grad_norm": 0.2529068887233734, + "learning_rate": 9.99947969752578e-05, + "loss": 0.291, + "step": 4350 + }, + { + "epoch": 0.19380361826021247, + "grad_norm": 0.2605750858783722, + "learning_rate": 9.999449454145891e-05, + "loss": 0.295, + "step": 4360 + }, + { + "epoch": 0.19424812197181845, + "grad_norm": 0.20354653894901276, + "learning_rate": 9.999418356496827e-05, + "loss": 0.2934, + "step": 4370 + }, + { + "epoch": 0.19469262568342446, + "grad_norm": 0.2218426614999771, + "learning_rate": 9.999386404583899e-05, + "loss": 0.2955, + "step": 4380 + }, + { + "epoch": 0.19513712939503045, + "grad_norm": 0.2478162795305252, + "learning_rate": 9.999353598412568e-05, + "loss": 0.2931, + "step": 4390 + }, + { + "epoch": 0.19558163310663643, + "grad_norm": 0.28984007239341736, + "learning_rate": 9.999319937988442e-05, + "loss": 0.2959, + "step": 4400 + }, + { + "epoch": 0.19602613681824244, + "grad_norm": 0.26256638765335083, + "learning_rate": 9.999285423317268e-05, + "loss": 0.2928, + "step": 4410 + }, + { + "epoch": 0.19647064052984842, + "grad_norm": 0.25076764822006226, + "learning_rate": 9.999250054404947e-05, + "loss": 0.2962, + "step": 4420 + }, + { + "epoch": 0.19691514424145443, + "grad_norm": 0.23122021555900574, + "learning_rate": 9.99921383125752e-05, + "loss": 0.2933, + "step": 4430 + }, + { + "epoch": 0.1973596479530604, + "grad_norm": 0.2487145960330963, + "learning_rate": 9.99917675388118e-05, + "loss": 0.2967, + "step": 4440 + }, + { + "epoch": 0.1978041516646664, + "grad_norm": 0.22873981297016144, + "learning_rate": 9.99913882228226e-05, + "loss": 0.2903, + "step": 4450 + }, + { + "epoch": 0.1982486553762724, + "grad_norm": 0.26742124557495117, + "learning_rate": 9.999100036467242e-05, + "loss": 0.2934, + "step": 4460 + }, + { + "epoch": 0.19869315908787838, + "grad_norm": 0.24630218744277954, + "learning_rate": 9.999060396442753e-05, + "loss": 0.2925, + "step": 4470 + }, + { + "epoch": 0.1991376627994844, + "grad_norm": 0.22956933081150055, + "learning_rate": 9.999019902215566e-05, + "loss": 0.2939, + "step": 4480 + }, + { + "epoch": 0.19958216651109037, + "grad_norm": 0.2376791536808014, + "learning_rate": 9.998978553792602e-05, + "loss": 0.2928, + "step": 4490 + }, + { + "epoch": 0.20002667022269635, + "grad_norm": 0.23815643787384033, + "learning_rate": 9.998936351180926e-05, + "loss": 0.2963, + "step": 4500 + }, + { + "epoch": 0.20047117393430236, + "grad_norm": 0.2701186537742615, + "learning_rate": 9.998893294387747e-05, + "loss": 0.2961, + "step": 4510 + }, + { + "epoch": 0.20091567764590834, + "grad_norm": 0.2659163475036621, + "learning_rate": 9.998849383420426e-05, + "loss": 0.2924, + "step": 4520 + }, + { + "epoch": 0.20136018135751432, + "grad_norm": 0.25392401218414307, + "learning_rate": 9.998804618286465e-05, + "loss": 0.2899, + "step": 4530 + }, + { + "epoch": 0.20180468506912033, + "grad_norm": 0.22939877212047577, + "learning_rate": 9.99875899899351e-05, + "loss": 0.2946, + "step": 4540 + }, + { + "epoch": 0.2022491887807263, + "grad_norm": 0.19711053371429443, + "learning_rate": 9.99871252554936e-05, + "loss": 0.2933, + "step": 4550 + }, + { + "epoch": 0.20269369249233232, + "grad_norm": 0.2505910098552704, + "learning_rate": 9.998665197961955e-05, + "loss": 0.2904, + "step": 4560 + }, + { + "epoch": 0.2031381962039383, + "grad_norm": 0.2680283486843109, + "learning_rate": 9.998617016239379e-05, + "loss": 0.2941, + "step": 4570 + }, + { + "epoch": 0.20358269991554429, + "grad_norm": 0.2589718699455261, + "learning_rate": 9.998567980389869e-05, + "loss": 0.2941, + "step": 4580 + }, + { + "epoch": 0.2040272036271503, + "grad_norm": 0.23942498862743378, + "learning_rate": 9.998518090421802e-05, + "loss": 0.294, + "step": 4590 + }, + { + "epoch": 0.20447170733875628, + "grad_norm": 0.2142358422279358, + "learning_rate": 9.998467346343703e-05, + "loss": 0.2962, + "step": 4600 + }, + { + "epoch": 0.20491621105036226, + "grad_norm": 0.2655090093612671, + "learning_rate": 9.998415748164243e-05, + "loss": 0.2963, + "step": 4610 + }, + { + "epoch": 0.20536071476196827, + "grad_norm": 0.25839459896087646, + "learning_rate": 9.998363295892238e-05, + "loss": 0.2951, + "step": 4620 + }, + { + "epoch": 0.20580521847357425, + "grad_norm": 0.2514372169971466, + "learning_rate": 9.998309989536652e-05, + "loss": 0.2945, + "step": 4630 + }, + { + "epoch": 0.20624972218518026, + "grad_norm": 0.3192076086997986, + "learning_rate": 9.998255829106593e-05, + "loss": 0.2955, + "step": 4640 + }, + { + "epoch": 0.20669422589678624, + "grad_norm": 0.2845157980918884, + "learning_rate": 9.998200814611316e-05, + "loss": 0.2963, + "step": 4650 + }, + { + "epoch": 0.20713872960839222, + "grad_norm": 0.24879880249500275, + "learning_rate": 9.998144946060219e-05, + "loss": 0.2982, + "step": 4660 + }, + { + "epoch": 0.20758323331999823, + "grad_norm": 0.22740480303764343, + "learning_rate": 9.998088223462852e-05, + "loss": 0.295, + "step": 4670 + }, + { + "epoch": 0.2080277370316042, + "grad_norm": 0.274922251701355, + "learning_rate": 9.998030646828905e-05, + "loss": 0.2938, + "step": 4680 + }, + { + "epoch": 0.20847224074321022, + "grad_norm": 0.24819044768810272, + "learning_rate": 9.997972216168217e-05, + "loss": 0.2952, + "step": 4690 + }, + { + "epoch": 0.2089167444548162, + "grad_norm": 0.23860077559947968, + "learning_rate": 9.997912931490771e-05, + "loss": 0.2997, + "step": 4700 + }, + { + "epoch": 0.20936124816642218, + "grad_norm": 0.22749757766723633, + "learning_rate": 9.9978527928067e-05, + "loss": 0.2983, + "step": 4710 + }, + { + "epoch": 0.2098057518780282, + "grad_norm": 0.2271840125322342, + "learning_rate": 9.997791800126277e-05, + "loss": 0.2969, + "step": 4720 + }, + { + "epoch": 0.21025025558963417, + "grad_norm": 0.240873321890831, + "learning_rate": 9.997729953459927e-05, + "loss": 0.2971, + "step": 4730 + }, + { + "epoch": 0.21069475930124015, + "grad_norm": 0.24473753571510315, + "learning_rate": 9.997667252818214e-05, + "loss": 0.2964, + "step": 4740 + }, + { + "epoch": 0.21113926301284616, + "grad_norm": 0.24351654946804047, + "learning_rate": 9.997603698211855e-05, + "loss": 0.2952, + "step": 4750 + }, + { + "epoch": 0.21158376672445214, + "grad_norm": 0.21601586043834686, + "learning_rate": 9.99753928965171e-05, + "loss": 0.2961, + "step": 4760 + }, + { + "epoch": 0.21202827043605815, + "grad_norm": 0.22949309647083282, + "learning_rate": 9.997474027148781e-05, + "loss": 0.2953, + "step": 4770 + }, + { + "epoch": 0.21247277414766413, + "grad_norm": 0.2353469878435135, + "learning_rate": 9.997407910714223e-05, + "loss": 0.2965, + "step": 4780 + }, + { + "epoch": 0.21291727785927012, + "grad_norm": 0.2781222462654114, + "learning_rate": 9.997340940359332e-05, + "loss": 0.2975, + "step": 4790 + }, + { + "epoch": 0.21336178157087612, + "grad_norm": 0.25190937519073486, + "learning_rate": 9.997273116095552e-05, + "loss": 0.2967, + "step": 4800 + }, + { + "epoch": 0.2138062852824821, + "grad_norm": 0.2653065323829651, + "learning_rate": 9.997204437934473e-05, + "loss": 0.2989, + "step": 4810 + }, + { + "epoch": 0.2142507889940881, + "grad_norm": 0.2416587769985199, + "learning_rate": 9.997134905887829e-05, + "loss": 0.2954, + "step": 4820 + }, + { + "epoch": 0.2146952927056941, + "grad_norm": 0.23849238455295563, + "learning_rate": 9.997064519967501e-05, + "loss": 0.2944, + "step": 4830 + }, + { + "epoch": 0.21513979641730008, + "grad_norm": 0.24432197213172913, + "learning_rate": 9.996993280185517e-05, + "loss": 0.3002, + "step": 4840 + }, + { + "epoch": 0.2155843001289061, + "grad_norm": 0.25008976459503174, + "learning_rate": 9.99692118655405e-05, + "loss": 0.296, + "step": 4850 + }, + { + "epoch": 0.21602880384051207, + "grad_norm": 0.2597600817680359, + "learning_rate": 9.996848239085417e-05, + "loss": 0.297, + "step": 4860 + }, + { + "epoch": 0.21647330755211805, + "grad_norm": 0.2891717255115509, + "learning_rate": 9.996774437792085e-05, + "loss": 0.2937, + "step": 4870 + }, + { + "epoch": 0.21691781126372406, + "grad_norm": 0.2474713772535324, + "learning_rate": 9.996699782686664e-05, + "loss": 0.2979, + "step": 4880 + }, + { + "epoch": 0.21736231497533004, + "grad_norm": 0.2520633637905121, + "learning_rate": 9.996624273781909e-05, + "loss": 0.292, + "step": 4890 + }, + { + "epoch": 0.21780681868693605, + "grad_norm": 0.24183107912540436, + "learning_rate": 9.996547911090725e-05, + "loss": 0.2924, + "step": 4900 + }, + { + "epoch": 0.21825132239854203, + "grad_norm": 0.22584061324596405, + "learning_rate": 9.996470694626157e-05, + "loss": 0.2943, + "step": 4910 + }, + { + "epoch": 0.218695826110148, + "grad_norm": 0.24537430703639984, + "learning_rate": 9.996392624401403e-05, + "loss": 0.293, + "step": 4920 + }, + { + "epoch": 0.21914032982175402, + "grad_norm": 0.2710023522377014, + "learning_rate": 9.996313700429801e-05, + "loss": 0.2909, + "step": 4930 + }, + { + "epoch": 0.21958483353336, + "grad_norm": 0.27631792426109314, + "learning_rate": 9.996233922724836e-05, + "loss": 0.2929, + "step": 4940 + }, + { + "epoch": 0.22002933724496598, + "grad_norm": 0.24580226838588715, + "learning_rate": 9.996153291300141e-05, + "loss": 0.2924, + "step": 4950 + }, + { + "epoch": 0.220473840956572, + "grad_norm": 0.2265775203704834, + "learning_rate": 9.996071806169494e-05, + "loss": 0.2916, + "step": 4960 + }, + { + "epoch": 0.22091834466817797, + "grad_norm": 0.26412782073020935, + "learning_rate": 9.995989467346817e-05, + "loss": 0.2994, + "step": 4970 + }, + { + "epoch": 0.22136284837978398, + "grad_norm": 0.2636505365371704, + "learning_rate": 9.995906274846183e-05, + "loss": 0.2953, + "step": 4980 + }, + { + "epoch": 0.22180735209138996, + "grad_norm": 0.2577556073665619, + "learning_rate": 9.995822228681803e-05, + "loss": 0.2939, + "step": 4990 + }, + { + "epoch": 0.22225185580299595, + "grad_norm": 0.25387096405029297, + "learning_rate": 9.99573732886804e-05, + "loss": 0.2973, + "step": 5000 + }, + { + "epoch": 0.22269635951460195, + "grad_norm": 0.2741299569606781, + "learning_rate": 9.995651575419402e-05, + "loss": 0.2988, + "step": 5010 + }, + { + "epoch": 0.22314086322620794, + "grad_norm": 0.22055159509181976, + "learning_rate": 9.995564968350541e-05, + "loss": 0.2971, + "step": 5020 + }, + { + "epoch": 0.22358536693781392, + "grad_norm": 0.2665863633155823, + "learning_rate": 9.995477507676256e-05, + "loss": 0.2959, + "step": 5030 + }, + { + "epoch": 0.22402987064941993, + "grad_norm": 0.2666335701942444, + "learning_rate": 9.995389193411493e-05, + "loss": 0.2948, + "step": 5040 + }, + { + "epoch": 0.2244743743610259, + "grad_norm": 0.27468276023864746, + "learning_rate": 9.995300025571339e-05, + "loss": 0.2958, + "step": 5050 + }, + { + "epoch": 0.22491887807263192, + "grad_norm": 0.27699676156044006, + "learning_rate": 9.995210004171034e-05, + "loss": 0.2953, + "step": 5060 + }, + { + "epoch": 0.2253633817842379, + "grad_norm": 0.24338053166866302, + "learning_rate": 9.995119129225956e-05, + "loss": 0.2952, + "step": 5070 + }, + { + "epoch": 0.22580788549584388, + "grad_norm": 0.25887176394462585, + "learning_rate": 9.995027400751637e-05, + "loss": 0.2963, + "step": 5080 + }, + { + "epoch": 0.2262523892074499, + "grad_norm": 0.2796298563480377, + "learning_rate": 9.994934818763751e-05, + "loss": 0.296, + "step": 5090 + }, + { + "epoch": 0.22669689291905587, + "grad_norm": 0.2255997657775879, + "learning_rate": 9.994841383278115e-05, + "loss": 0.297, + "step": 5100 + }, + { + "epoch": 0.22714139663066188, + "grad_norm": 0.22560493648052216, + "learning_rate": 9.994747094310695e-05, + "loss": 0.2971, + "step": 5110 + }, + { + "epoch": 0.22758590034226786, + "grad_norm": 0.2519712448120117, + "learning_rate": 9.994651951877604e-05, + "loss": 0.298, + "step": 5120 + }, + { + "epoch": 0.22803040405387384, + "grad_norm": 0.2672719657421112, + "learning_rate": 9.994555955995099e-05, + "loss": 0.2938, + "step": 5130 + }, + { + "epoch": 0.22847490776547985, + "grad_norm": 0.32869860529899597, + "learning_rate": 9.994459106679581e-05, + "loss": 0.2981, + "step": 5140 + }, + { + "epoch": 0.22891941147708583, + "grad_norm": 0.29771918058395386, + "learning_rate": 9.994361403947603e-05, + "loss": 0.2964, + "step": 5150 + }, + { + "epoch": 0.2293639151886918, + "grad_norm": 0.29055529832839966, + "learning_rate": 9.994262847815854e-05, + "loss": 0.299, + "step": 5160 + }, + { + "epoch": 0.22980841890029782, + "grad_norm": 0.24260424077510834, + "learning_rate": 9.99416343830118e-05, + "loss": 0.297, + "step": 5170 + }, + { + "epoch": 0.2302529226119038, + "grad_norm": 0.260137677192688, + "learning_rate": 9.994063175420565e-05, + "loss": 0.296, + "step": 5180 + }, + { + "epoch": 0.2306974263235098, + "grad_norm": 0.286685973405838, + "learning_rate": 9.99396205919114e-05, + "loss": 0.2986, + "step": 5190 + }, + { + "epoch": 0.2311419300351158, + "grad_norm": 0.2468431442975998, + "learning_rate": 9.993860089630185e-05, + "loss": 0.2963, + "step": 5200 + }, + { + "epoch": 0.23158643374672178, + "grad_norm": 0.2236059606075287, + "learning_rate": 9.993757266755123e-05, + "loss": 0.2976, + "step": 5210 + }, + { + "epoch": 0.23203093745832779, + "grad_norm": 0.29381510615348816, + "learning_rate": 9.993653590583522e-05, + "loss": 0.2952, + "step": 5220 + }, + { + "epoch": 0.23247544116993377, + "grad_norm": 0.25302526354789734, + "learning_rate": 9.993549061133102e-05, + "loss": 0.2967, + "step": 5230 + }, + { + "epoch": 0.23291994488153975, + "grad_norm": 0.2711452841758728, + "learning_rate": 9.993443678421719e-05, + "loss": 0.2922, + "step": 5240 + }, + { + "epoch": 0.23336444859314576, + "grad_norm": 0.2699531018733978, + "learning_rate": 9.993337442467384e-05, + "loss": 0.2936, + "step": 5250 + }, + { + "epoch": 0.23380895230475174, + "grad_norm": 0.2807251214981079, + "learning_rate": 9.993230353288248e-05, + "loss": 0.2966, + "step": 5260 + }, + { + "epoch": 0.23425345601635775, + "grad_norm": 0.24542687833309174, + "learning_rate": 9.993122410902608e-05, + "loss": 0.2967, + "step": 5270 + }, + { + "epoch": 0.23469795972796373, + "grad_norm": 0.2786318361759186, + "learning_rate": 9.993013615328912e-05, + "loss": 0.2946, + "step": 5280 + }, + { + "epoch": 0.2351424634395697, + "grad_norm": 0.23871219158172607, + "learning_rate": 9.992903966585747e-05, + "loss": 0.2961, + "step": 5290 + }, + { + "epoch": 0.23558696715117572, + "grad_norm": 0.22332586348056793, + "learning_rate": 9.992793464691852e-05, + "loss": 0.2958, + "step": 5300 + }, + { + "epoch": 0.2360314708627817, + "grad_norm": 0.2956766188144684, + "learning_rate": 9.992682109666105e-05, + "loss": 0.2935, + "step": 5310 + }, + { + "epoch": 0.2364759745743877, + "grad_norm": 0.21131952106952667, + "learning_rate": 9.992569901527538e-05, + "loss": 0.2961, + "step": 5320 + }, + { + "epoch": 0.2369204782859937, + "grad_norm": 0.2617781162261963, + "learning_rate": 9.99245684029532e-05, + "loss": 0.291, + "step": 5330 + }, + { + "epoch": 0.23736498199759967, + "grad_norm": 0.22272878885269165, + "learning_rate": 9.992342925988774e-05, + "loss": 0.2959, + "step": 5340 + }, + { + "epoch": 0.23780948570920568, + "grad_norm": 0.2162347137928009, + "learning_rate": 9.992228158627361e-05, + "loss": 0.2924, + "step": 5350 + }, + { + "epoch": 0.23825398942081166, + "grad_norm": 0.27073681354522705, + "learning_rate": 9.992112538230693e-05, + "loss": 0.2958, + "step": 5360 + }, + { + "epoch": 0.23869849313241764, + "grad_norm": 0.2385716289281845, + "learning_rate": 9.991996064818527e-05, + "loss": 0.2955, + "step": 5370 + }, + { + "epoch": 0.23914299684402365, + "grad_norm": 0.24530945718288422, + "learning_rate": 9.991878738410768e-05, + "loss": 0.2967, + "step": 5380 + }, + { + "epoch": 0.23958750055562963, + "grad_norm": 0.23313157260417938, + "learning_rate": 9.991760559027457e-05, + "loss": 0.2953, + "step": 5390 + }, + { + "epoch": 0.24003200426723564, + "grad_norm": 0.22315561771392822, + "learning_rate": 9.991641526688793e-05, + "loss": 0.2938, + "step": 5400 + }, + { + "epoch": 0.24047650797884162, + "grad_norm": 0.2582952678203583, + "learning_rate": 9.991521641415113e-05, + "loss": 0.2987, + "step": 5410 + }, + { + "epoch": 0.2409210116904476, + "grad_norm": 0.22180667519569397, + "learning_rate": 9.991400903226904e-05, + "loss": 0.291, + "step": 5420 + }, + { + "epoch": 0.24136551540205362, + "grad_norm": 0.23129193484783173, + "learning_rate": 9.991279312144794e-05, + "loss": 0.2971, + "step": 5430 + }, + { + "epoch": 0.2418100191136596, + "grad_norm": 0.22916343808174133, + "learning_rate": 9.991156868189564e-05, + "loss": 0.297, + "step": 5440 + }, + { + "epoch": 0.24225452282526558, + "grad_norm": 0.2982069253921509, + "learning_rate": 9.991033571382131e-05, + "loss": 0.2961, + "step": 5450 + }, + { + "epoch": 0.2426990265368716, + "grad_norm": 0.2984744608402252, + "learning_rate": 9.990909421743569e-05, + "loss": 0.2954, + "step": 5460 + }, + { + "epoch": 0.24314353024847757, + "grad_norm": 0.2814599871635437, + "learning_rate": 9.990784419295085e-05, + "loss": 0.2939, + "step": 5470 + }, + { + "epoch": 0.24358803396008358, + "grad_norm": 0.22538314759731293, + "learning_rate": 9.990658564058044e-05, + "loss": 0.296, + "step": 5480 + }, + { + "epoch": 0.24403253767168956, + "grad_norm": 0.2226548194885254, + "learning_rate": 9.990531856053948e-05, + "loss": 0.2911, + "step": 5490 + }, + { + "epoch": 0.24447704138329554, + "grad_norm": 0.2563781142234802, + "learning_rate": 9.99040429530445e-05, + "loss": 0.2953, + "step": 5500 + }, + { + "epoch": 0.24492154509490155, + "grad_norm": 0.24151112139225006, + "learning_rate": 9.990275881831346e-05, + "loss": 0.2939, + "step": 5510 + }, + { + "epoch": 0.24536604880650753, + "grad_norm": 0.23078025877475739, + "learning_rate": 9.990146615656577e-05, + "loss": 0.2949, + "step": 5520 + }, + { + "epoch": 0.24581055251811354, + "grad_norm": 0.23867231607437134, + "learning_rate": 9.990016496802233e-05, + "loss": 0.2942, + "step": 5530 + }, + { + "epoch": 0.24625505622971952, + "grad_norm": 0.24654915928840637, + "learning_rate": 9.989885525290548e-05, + "loss": 0.296, + "step": 5540 + }, + { + "epoch": 0.2466995599413255, + "grad_norm": 0.2521542012691498, + "learning_rate": 9.989753701143897e-05, + "loss": 0.2936, + "step": 5550 + }, + { + "epoch": 0.2471440636529315, + "grad_norm": 0.25592637062072754, + "learning_rate": 9.989621024384812e-05, + "loss": 0.2908, + "step": 5560 + }, + { + "epoch": 0.2475885673645375, + "grad_norm": 0.24364013969898224, + "learning_rate": 9.989487495035959e-05, + "loss": 0.2941, + "step": 5570 + }, + { + "epoch": 0.24803307107614347, + "grad_norm": 0.31049844622612, + "learning_rate": 9.989353113120156e-05, + "loss": 0.298, + "step": 5580 + }, + { + "epoch": 0.24847757478774948, + "grad_norm": 0.24416477978229523, + "learning_rate": 9.989217878660366e-05, + "loss": 0.296, + "step": 5590 + }, + { + "epoch": 0.24892207849935546, + "grad_norm": 0.2600790858268738, + "learning_rate": 9.989081791679695e-05, + "loss": 0.2996, + "step": 5600 + }, + { + "epoch": 0.24936658221096147, + "grad_norm": 0.2486354410648346, + "learning_rate": 9.988944852201397e-05, + "loss": 0.2964, + "step": 5610 + }, + { + "epoch": 0.24981108592256746, + "grad_norm": 0.2710673213005066, + "learning_rate": 9.988807060248873e-05, + "loss": 0.2973, + "step": 5620 + }, + { + "epoch": 0.25025558963417344, + "grad_norm": 0.26060178875923157, + "learning_rate": 9.988668415845665e-05, + "loss": 0.2954, + "step": 5630 + }, + { + "epoch": 0.2507000933457794, + "grad_norm": 0.2589991092681885, + "learning_rate": 9.988528919015466e-05, + "loss": 0.2989, + "step": 5640 + }, + { + "epoch": 0.25114459705738545, + "grad_norm": 0.2607569992542267, + "learning_rate": 9.988388569782112e-05, + "loss": 0.2939, + "step": 5650 + }, + { + "epoch": 0.25158910076899144, + "grad_norm": 0.22537016868591309, + "learning_rate": 9.988247368169583e-05, + "loss": 0.2982, + "step": 5660 + }, + { + "epoch": 0.2520336044805974, + "grad_norm": 0.25840187072753906, + "learning_rate": 9.988105314202007e-05, + "loss": 0.2954, + "step": 5670 + }, + { + "epoch": 0.2524781081922034, + "grad_norm": 0.22425690293312073, + "learning_rate": 9.987962407903659e-05, + "loss": 0.2945, + "step": 5680 + }, + { + "epoch": 0.2529226119038094, + "grad_norm": 0.2726045846939087, + "learning_rate": 9.987818649298957e-05, + "loss": 0.2948, + "step": 5690 + }, + { + "epoch": 0.25336711561541536, + "grad_norm": 0.2733224928379059, + "learning_rate": 9.987674038412465e-05, + "loss": 0.2929, + "step": 5700 + }, + { + "epoch": 0.2538116193270214, + "grad_norm": 0.27956482768058777, + "learning_rate": 9.987528575268891e-05, + "loss": 0.3001, + "step": 5710 + }, + { + "epoch": 0.2542561230386274, + "grad_norm": 0.2616364359855652, + "learning_rate": 9.987382259893095e-05, + "loss": 0.299, + "step": 5720 + }, + { + "epoch": 0.25470062675023336, + "grad_norm": 0.22214923799037933, + "learning_rate": 9.987235092310074e-05, + "loss": 0.2964, + "step": 5730 + }, + { + "epoch": 0.25514513046183934, + "grad_norm": 0.2628353536128998, + "learning_rate": 9.987087072544978e-05, + "loss": 0.2999, + "step": 5740 + }, + { + "epoch": 0.2555896341734453, + "grad_norm": 0.23768246173858643, + "learning_rate": 9.9869382006231e-05, + "loss": 0.2947, + "step": 5750 + }, + { + "epoch": 0.25603413788505136, + "grad_norm": 0.20992518961429596, + "learning_rate": 9.986788476569875e-05, + "loss": 0.2954, + "step": 5760 + }, + { + "epoch": 0.25647864159665734, + "grad_norm": 0.24173763394355774, + "learning_rate": 9.986637900410887e-05, + "loss": 0.2962, + "step": 5770 + }, + { + "epoch": 0.2569231453082633, + "grad_norm": 0.38859671354293823, + "learning_rate": 9.986486472171869e-05, + "loss": 0.2967, + "step": 5780 + }, + { + "epoch": 0.2573676490198693, + "grad_norm": 0.2557801306247711, + "learning_rate": 9.986334191878692e-05, + "loss": 0.2952, + "step": 5790 + }, + { + "epoch": 0.2578121527314753, + "grad_norm": 0.22884570062160492, + "learning_rate": 9.986181059557378e-05, + "loss": 0.2938, + "step": 5800 + }, + { + "epoch": 0.2582566564430813, + "grad_norm": 0.23953980207443237, + "learning_rate": 9.986027075234094e-05, + "loss": 0.294, + "step": 5810 + }, + { + "epoch": 0.2587011601546873, + "grad_norm": 0.23657606542110443, + "learning_rate": 9.985872238935152e-05, + "loss": 0.293, + "step": 5820 + }, + { + "epoch": 0.2591456638662933, + "grad_norm": 0.23418377339839935, + "learning_rate": 9.985716550687008e-05, + "loss": 0.2981, + "step": 5830 + }, + { + "epoch": 0.25959016757789927, + "grad_norm": 0.2641599774360657, + "learning_rate": 9.985560010516264e-05, + "loss": 0.297, + "step": 5840 + }, + { + "epoch": 0.26003467128950525, + "grad_norm": 0.2435406595468521, + "learning_rate": 9.985402618449668e-05, + "loss": 0.2984, + "step": 5850 + }, + { + "epoch": 0.2604791750011113, + "grad_norm": 0.30665692687034607, + "learning_rate": 9.985244374514118e-05, + "loss": 0.2978, + "step": 5860 + }, + { + "epoch": 0.26092367871271727, + "grad_norm": 0.2827000319957733, + "learning_rate": 9.985085278736651e-05, + "loss": 0.2947, + "step": 5870 + }, + { + "epoch": 0.26136818242432325, + "grad_norm": 0.26703259348869324, + "learning_rate": 9.984925331144452e-05, + "loss": 0.2989, + "step": 5880 + }, + { + "epoch": 0.26181268613592923, + "grad_norm": 0.2570643424987793, + "learning_rate": 9.984764531764851e-05, + "loss": 0.2921, + "step": 5890 + }, + { + "epoch": 0.2622571898475352, + "grad_norm": 0.22544541954994202, + "learning_rate": 9.984602880625326e-05, + "loss": 0.2966, + "step": 5900 + }, + { + "epoch": 0.2627016935591412, + "grad_norm": 0.22135256230831146, + "learning_rate": 9.9844403777535e-05, + "loss": 0.2969, + "step": 5910 + }, + { + "epoch": 0.26314619727074723, + "grad_norm": 0.2571542263031006, + "learning_rate": 9.984277023177135e-05, + "loss": 0.2935, + "step": 5920 + }, + { + "epoch": 0.2635907009823532, + "grad_norm": 0.2784907817840576, + "learning_rate": 9.984112816924148e-05, + "loss": 0.294, + "step": 5930 + }, + { + "epoch": 0.2640352046939592, + "grad_norm": 0.29060617089271545, + "learning_rate": 9.983947759022596e-05, + "loss": 0.2966, + "step": 5940 + }, + { + "epoch": 0.2644797084055652, + "grad_norm": 0.2316971868276596, + "learning_rate": 9.983781849500682e-05, + "loss": 0.2943, + "step": 5950 + }, + { + "epoch": 0.26492421211717115, + "grad_norm": 0.2567489445209503, + "learning_rate": 9.98361508838676e-05, + "loss": 0.2945, + "step": 5960 + }, + { + "epoch": 0.2653687158287772, + "grad_norm": 0.2279815524816513, + "learning_rate": 9.98344747570932e-05, + "loss": 0.2927, + "step": 5970 + }, + { + "epoch": 0.26581321954038317, + "grad_norm": 0.2601902186870575, + "learning_rate": 9.983279011497004e-05, + "loss": 0.2961, + "step": 5980 + }, + { + "epoch": 0.26625772325198915, + "grad_norm": 0.2410673201084137, + "learning_rate": 9.983109695778596e-05, + "loss": 0.295, + "step": 5990 + }, + { + "epoch": 0.26670222696359513, + "grad_norm": 0.23212289810180664, + "learning_rate": 9.982939528583032e-05, + "loss": 0.2956, + "step": 6000 + }, + { + "epoch": 0.2671467306752011, + "grad_norm": 0.2599615752696991, + "learning_rate": 9.982768509939385e-05, + "loss": 0.2919, + "step": 6010 + }, + { + "epoch": 0.26759123438680715, + "grad_norm": 0.21388792991638184, + "learning_rate": 9.982596639876879e-05, + "loss": 0.2933, + "step": 6020 + }, + { + "epoch": 0.26803573809841313, + "grad_norm": 0.2424013763666153, + "learning_rate": 9.982423918424881e-05, + "loss": 0.2943, + "step": 6030 + }, + { + "epoch": 0.2684802418100191, + "grad_norm": 0.23005414009094238, + "learning_rate": 9.982250345612908e-05, + "loss": 0.2972, + "step": 6040 + }, + { + "epoch": 0.2689247455216251, + "grad_norm": 0.23896101117134094, + "learning_rate": 9.982075921470611e-05, + "loss": 0.2927, + "step": 6050 + }, + { + "epoch": 0.2693692492332311, + "grad_norm": 0.23790036141872406, + "learning_rate": 9.981900646027802e-05, + "loss": 0.2984, + "step": 6060 + }, + { + "epoch": 0.2698137529448371, + "grad_norm": 0.21532975137233734, + "learning_rate": 9.981724519314425e-05, + "loss": 0.2961, + "step": 6070 + }, + { + "epoch": 0.2702582566564431, + "grad_norm": 0.26561468839645386, + "learning_rate": 9.981547541360581e-05, + "loss": 0.3007, + "step": 6080 + }, + { + "epoch": 0.2707027603680491, + "grad_norm": 0.25612571835517883, + "learning_rate": 9.981369712196508e-05, + "loss": 0.297, + "step": 6090 + }, + { + "epoch": 0.27114726407965506, + "grad_norm": 0.274915874004364, + "learning_rate": 9.981191031852592e-05, + "loss": 0.2954, + "step": 6100 + }, + { + "epoch": 0.27159176779126104, + "grad_norm": 0.2409527748823166, + "learning_rate": 9.981011500359362e-05, + "loss": 0.2959, + "step": 6110 + }, + { + "epoch": 0.272036271502867, + "grad_norm": 0.24075402319431305, + "learning_rate": 9.9808311177475e-05, + "loss": 0.2967, + "step": 6120 + }, + { + "epoch": 0.27248077521447306, + "grad_norm": 0.21700476109981537, + "learning_rate": 9.980649884047826e-05, + "loss": 0.2926, + "step": 6130 + }, + { + "epoch": 0.27292527892607904, + "grad_norm": 0.2445088028907776, + "learning_rate": 9.980467799291307e-05, + "loss": 0.2934, + "step": 6140 + }, + { + "epoch": 0.273369782637685, + "grad_norm": 0.2463339865207672, + "learning_rate": 9.980284863509058e-05, + "loss": 0.2967, + "step": 6150 + }, + { + "epoch": 0.273814286349291, + "grad_norm": 0.2224964052438736, + "learning_rate": 9.980101076732334e-05, + "loss": 0.2967, + "step": 6160 + }, + { + "epoch": 0.274258790060897, + "grad_norm": 0.20258945226669312, + "learning_rate": 9.979916438992544e-05, + "loss": 0.2957, + "step": 6170 + }, + { + "epoch": 0.274703293772503, + "grad_norm": 0.23148855566978455, + "learning_rate": 9.979730950321237e-05, + "loss": 0.2951, + "step": 6180 + }, + { + "epoch": 0.275147797484109, + "grad_norm": 0.2383941411972046, + "learning_rate": 9.979544610750104e-05, + "loss": 0.2971, + "step": 6190 + }, + { + "epoch": 0.275592301195715, + "grad_norm": 0.2555932104587555, + "learning_rate": 9.97935742031099e-05, + "loss": 0.2969, + "step": 6200 + }, + { + "epoch": 0.27603680490732097, + "grad_norm": 0.2421383410692215, + "learning_rate": 9.979169379035878e-05, + "loss": 0.294, + "step": 6210 + }, + { + "epoch": 0.27648130861892695, + "grad_norm": 0.2276301085948944, + "learning_rate": 9.978980486956899e-05, + "loss": 0.2954, + "step": 6220 + }, + { + "epoch": 0.276925812330533, + "grad_norm": 0.21330302953720093, + "learning_rate": 9.978790744106332e-05, + "loss": 0.2942, + "step": 6230 + }, + { + "epoch": 0.27737031604213896, + "grad_norm": 0.23944738507270813, + "learning_rate": 9.978600150516594e-05, + "loss": 0.2954, + "step": 6240 + }, + { + "epoch": 0.27781481975374495, + "grad_norm": 0.2744289040565491, + "learning_rate": 9.978408706220259e-05, + "loss": 0.2962, + "step": 6250 + }, + { + "epoch": 0.2782593234653509, + "grad_norm": 0.21848206222057343, + "learning_rate": 9.978216411250032e-05, + "loss": 0.2972, + "step": 6260 + }, + { + "epoch": 0.2787038271769569, + "grad_norm": 0.2202863097190857, + "learning_rate": 9.978023265638778e-05, + "loss": 0.2973, + "step": 6270 + }, + { + "epoch": 0.27914833088856295, + "grad_norm": 0.25768113136291504, + "learning_rate": 9.977829269419495e-05, + "loss": 0.2999, + "step": 6280 + }, + { + "epoch": 0.2795928346001689, + "grad_norm": 0.2628850042819977, + "learning_rate": 9.977634422625335e-05, + "loss": 0.2987, + "step": 6290 + }, + { + "epoch": 0.2800373383117749, + "grad_norm": 0.21435067057609558, + "learning_rate": 9.97743872528959e-05, + "loss": 0.2943, + "step": 6300 + }, + { + "epoch": 0.2804818420233809, + "grad_norm": 0.2599984109401703, + "learning_rate": 9.9772421774457e-05, + "loss": 0.295, + "step": 6310 + }, + { + "epoch": 0.28092634573498687, + "grad_norm": 0.2365584373474121, + "learning_rate": 9.977044779127252e-05, + "loss": 0.2931, + "step": 6320 + }, + { + "epoch": 0.28137084944659285, + "grad_norm": 0.23116160929203033, + "learning_rate": 9.976846530367971e-05, + "loss": 0.2963, + "step": 6330 + }, + { + "epoch": 0.2818153531581989, + "grad_norm": 0.33690717816352844, + "learning_rate": 9.976647431201735e-05, + "loss": 0.2959, + "step": 6340 + }, + { + "epoch": 0.28225985686980487, + "grad_norm": 0.22915393114089966, + "learning_rate": 9.976447481662568e-05, + "loss": 0.2924, + "step": 6350 + }, + { + "epoch": 0.28270436058141085, + "grad_norm": 0.2603790760040283, + "learning_rate": 9.976246681784629e-05, + "loss": 0.2951, + "step": 6360 + }, + { + "epoch": 0.28314886429301683, + "grad_norm": 0.22880372405052185, + "learning_rate": 9.976045031602234e-05, + "loss": 0.2959, + "step": 6370 + }, + { + "epoch": 0.2835933680046228, + "grad_norm": 0.24076321721076965, + "learning_rate": 9.975842531149837e-05, + "loss": 0.2944, + "step": 6380 + }, + { + "epoch": 0.28403787171622885, + "grad_norm": 0.2471006065607071, + "learning_rate": 9.975639180462043e-05, + "loss": 0.2925, + "step": 6390 + }, + { + "epoch": 0.28448237542783483, + "grad_norm": 0.2560718059539795, + "learning_rate": 9.975434979573596e-05, + "loss": 0.2941, + "step": 6400 + }, + { + "epoch": 0.2849268791394408, + "grad_norm": 0.2845134735107422, + "learning_rate": 9.97522992851939e-05, + "loss": 0.2957, + "step": 6410 + }, + { + "epoch": 0.2853713828510468, + "grad_norm": 0.25485002994537354, + "learning_rate": 9.975024027334461e-05, + "loss": 0.2974, + "step": 6420 + }, + { + "epoch": 0.2858158865626528, + "grad_norm": 0.23550960421562195, + "learning_rate": 9.974817276053993e-05, + "loss": 0.2955, + "step": 6430 + }, + { + "epoch": 0.2862603902742588, + "grad_norm": 0.2541617453098297, + "learning_rate": 9.974609674713315e-05, + "loss": 0.2934, + "step": 6440 + }, + { + "epoch": 0.2867048939858648, + "grad_norm": 0.251323401927948, + "learning_rate": 9.9744012233479e-05, + "loss": 0.2936, + "step": 6450 + }, + { + "epoch": 0.2871493976974708, + "grad_norm": 0.2643023729324341, + "learning_rate": 9.974191921993366e-05, + "loss": 0.295, + "step": 6460 + }, + { + "epoch": 0.28759390140907676, + "grad_norm": 0.28170910477638245, + "learning_rate": 9.973981770685474e-05, + "loss": 0.2941, + "step": 6470 + }, + { + "epoch": 0.28803840512068274, + "grad_norm": 0.22239089012145996, + "learning_rate": 9.97377076946014e-05, + "loss": 0.2937, + "step": 6480 + }, + { + "epoch": 0.2884829088322888, + "grad_norm": 0.2664618492126465, + "learning_rate": 9.973558918353412e-05, + "loss": 0.2955, + "step": 6490 + }, + { + "epoch": 0.28892741254389476, + "grad_norm": 0.30961599946022034, + "learning_rate": 9.973346217401494e-05, + "loss": 0.2955, + "step": 6500 + }, + { + "epoch": 0.28937191625550074, + "grad_norm": 0.24032534658908844, + "learning_rate": 9.973132666640726e-05, + "loss": 0.2964, + "step": 6510 + }, + { + "epoch": 0.2898164199671067, + "grad_norm": 0.22726471722126007, + "learning_rate": 9.972918266107602e-05, + "loss": 0.2994, + "step": 6520 + }, + { + "epoch": 0.2902609236787127, + "grad_norm": 0.27349185943603516, + "learning_rate": 9.972703015838756e-05, + "loss": 0.2967, + "step": 6530 + }, + { + "epoch": 0.2907054273903187, + "grad_norm": 0.2205624133348465, + "learning_rate": 9.97248691587097e-05, + "loss": 0.2985, + "step": 6540 + }, + { + "epoch": 0.2911499311019247, + "grad_norm": 0.24621236324310303, + "learning_rate": 9.972269966241166e-05, + "loss": 0.296, + "step": 6550 + }, + { + "epoch": 0.2915944348135307, + "grad_norm": 0.22082248330116272, + "learning_rate": 9.972052166986417e-05, + "loss": 0.2952, + "step": 6560 + }, + { + "epoch": 0.2920389385251367, + "grad_norm": 0.2535746991634369, + "learning_rate": 9.971833518143938e-05, + "loss": 0.3003, + "step": 6570 + }, + { + "epoch": 0.29248344223674266, + "grad_norm": 0.25163403153419495, + "learning_rate": 9.971614019751093e-05, + "loss": 0.2987, + "step": 6580 + }, + { + "epoch": 0.29292794594834864, + "grad_norm": 0.24991901218891144, + "learning_rate": 9.971393671845383e-05, + "loss": 0.2971, + "step": 6590 + }, + { + "epoch": 0.2933724496599547, + "grad_norm": 0.22676822543144226, + "learning_rate": 9.971172474464464e-05, + "loss": 0.2973, + "step": 6600 + }, + { + "epoch": 0.29381695337156066, + "grad_norm": 0.2891048192977905, + "learning_rate": 9.97095042764613e-05, + "loss": 0.2955, + "step": 6610 + }, + { + "epoch": 0.29426145708316664, + "grad_norm": 0.2585970461368561, + "learning_rate": 9.970727531428324e-05, + "loss": 0.2963, + "step": 6620 + }, + { + "epoch": 0.2947059607947726, + "grad_norm": 0.2104913741350174, + "learning_rate": 9.970503785849132e-05, + "loss": 0.298, + "step": 6630 + }, + { + "epoch": 0.2951504645063786, + "grad_norm": 0.21189534664154053, + "learning_rate": 9.970279190946788e-05, + "loss": 0.2946, + "step": 6640 + }, + { + "epoch": 0.29559496821798464, + "grad_norm": 0.25835126638412476, + "learning_rate": 9.970053746759667e-05, + "loss": 0.2955, + "step": 6650 + }, + { + "epoch": 0.2960394719295906, + "grad_norm": 0.2770098149776459, + "learning_rate": 9.969827453326292e-05, + "loss": 0.2954, + "step": 6660 + }, + { + "epoch": 0.2964839756411966, + "grad_norm": 0.26953887939453125, + "learning_rate": 9.969600310685332e-05, + "loss": 0.2951, + "step": 6670 + }, + { + "epoch": 0.2969284793528026, + "grad_norm": 0.2638455927371979, + "learning_rate": 9.969372318875596e-05, + "loss": 0.2956, + "step": 6680 + }, + { + "epoch": 0.29737298306440857, + "grad_norm": 0.30646222829818726, + "learning_rate": 9.969143477936043e-05, + "loss": 0.2957, + "step": 6690 + }, + { + "epoch": 0.2978174867760146, + "grad_norm": 0.22912345826625824, + "learning_rate": 9.968913787905775e-05, + "loss": 0.2956, + "step": 6700 + }, + { + "epoch": 0.2982619904876206, + "grad_norm": 0.2290901243686676, + "learning_rate": 9.968683248824045e-05, + "loss": 0.2947, + "step": 6710 + }, + { + "epoch": 0.29870649419922657, + "grad_norm": 0.25622397661209106, + "learning_rate": 9.968451860730238e-05, + "loss": 0.2925, + "step": 6720 + }, + { + "epoch": 0.29915099791083255, + "grad_norm": 0.2561624050140381, + "learning_rate": 9.968219623663896e-05, + "loss": 0.2966, + "step": 6730 + }, + { + "epoch": 0.29959550162243853, + "grad_norm": 0.27322959899902344, + "learning_rate": 9.967986537664702e-05, + "loss": 0.2956, + "step": 6740 + }, + { + "epoch": 0.3000400053340445, + "grad_norm": 0.22692246735095978, + "learning_rate": 9.967752602772483e-05, + "loss": 0.2935, + "step": 6750 + }, + { + "epoch": 0.30048450904565055, + "grad_norm": 0.24597331881523132, + "learning_rate": 9.967517819027212e-05, + "loss": 0.2956, + "step": 6760 + }, + { + "epoch": 0.30092901275725653, + "grad_norm": 0.22953423857688904, + "learning_rate": 9.967282186469009e-05, + "loss": 0.2922, + "step": 6770 + }, + { + "epoch": 0.3013735164688625, + "grad_norm": 0.2242104411125183, + "learning_rate": 9.967045705138135e-05, + "loss": 0.2942, + "step": 6780 + }, + { + "epoch": 0.3018180201804685, + "grad_norm": 0.1898539960384369, + "learning_rate": 9.966808375074998e-05, + "loss": 0.2948, + "step": 6790 + }, + { + "epoch": 0.3022625238920745, + "grad_norm": 0.2871861755847931, + "learning_rate": 9.966570196320154e-05, + "loss": 0.2967, + "step": 6800 + }, + { + "epoch": 0.3027070276036805, + "grad_norm": 0.22656647861003876, + "learning_rate": 9.966331168914299e-05, + "loss": 0.2985, + "step": 6810 + }, + { + "epoch": 0.3031515313152865, + "grad_norm": 0.2592788338661194, + "learning_rate": 9.966091292898277e-05, + "loss": 0.2962, + "step": 6820 + }, + { + "epoch": 0.3035960350268925, + "grad_norm": 0.24374200403690338, + "learning_rate": 9.965850568313076e-05, + "loss": 0.2967, + "step": 6830 + }, + { + "epoch": 0.30404053873849846, + "grad_norm": 0.2666373550891876, + "learning_rate": 9.965608995199827e-05, + "loss": 0.2967, + "step": 6840 + }, + { + "epoch": 0.30448504245010444, + "grad_norm": 0.24239900708198547, + "learning_rate": 9.965366573599812e-05, + "loss": 0.294, + "step": 6850 + }, + { + "epoch": 0.3049295461617105, + "grad_norm": 0.27754148840904236, + "learning_rate": 9.965123303554453e-05, + "loss": 0.2962, + "step": 6860 + }, + { + "epoch": 0.30537404987331646, + "grad_norm": 0.2512037456035614, + "learning_rate": 9.964879185105317e-05, + "loss": 0.2995, + "step": 6870 + }, + { + "epoch": 0.30581855358492244, + "grad_norm": 0.26160356402397156, + "learning_rate": 9.964634218294119e-05, + "loss": 0.2962, + "step": 6880 + }, + { + "epoch": 0.3062630572965284, + "grad_norm": 0.27509286999702454, + "learning_rate": 9.964388403162714e-05, + "loss": 0.2978, + "step": 6890 + }, + { + "epoch": 0.3067075610081344, + "grad_norm": 0.26586297154426575, + "learning_rate": 9.96414173975311e-05, + "loss": 0.2938, + "step": 6900 + }, + { + "epoch": 0.30715206471974044, + "grad_norm": 0.24167679250240326, + "learning_rate": 9.963894228107451e-05, + "loss": 0.2928, + "step": 6910 + }, + { + "epoch": 0.3075965684313464, + "grad_norm": 0.23740267753601074, + "learning_rate": 9.963645868268032e-05, + "loss": 0.2999, + "step": 6920 + }, + { + "epoch": 0.3080410721429524, + "grad_norm": 0.25324392318725586, + "learning_rate": 9.963396660277289e-05, + "loss": 0.2976, + "step": 6930 + }, + { + "epoch": 0.3084855758545584, + "grad_norm": 0.2249986082315445, + "learning_rate": 9.963146604177807e-05, + "loss": 0.295, + "step": 6940 + }, + { + "epoch": 0.30893007956616436, + "grad_norm": 0.27964961528778076, + "learning_rate": 9.962895700012311e-05, + "loss": 0.2956, + "step": 6950 + }, + { + "epoch": 0.30937458327777034, + "grad_norm": 0.25618645548820496, + "learning_rate": 9.962643947823677e-05, + "loss": 0.2992, + "step": 6960 + }, + { + "epoch": 0.3098190869893764, + "grad_norm": 0.20892058312892914, + "learning_rate": 9.962391347654921e-05, + "loss": 0.2966, + "step": 6970 + }, + { + "epoch": 0.31026359070098236, + "grad_norm": 0.21048887073993683, + "learning_rate": 9.962137899549204e-05, + "loss": 0.2943, + "step": 6980 + }, + { + "epoch": 0.31070809441258834, + "grad_norm": 0.20708401501178741, + "learning_rate": 9.961883603549835e-05, + "loss": 0.298, + "step": 6990 + }, + { + "epoch": 0.3111525981241943, + "grad_norm": 0.19395826756954193, + "learning_rate": 9.961628459700267e-05, + "loss": 0.2928, + "step": 7000 + }, + { + "epoch": 0.3115971018358003, + "grad_norm": 0.17996107041835785, + "learning_rate": 9.961372468044095e-05, + "loss": 0.2932, + "step": 7010 + }, + { + "epoch": 0.31204160554740634, + "grad_norm": 0.21276812255382538, + "learning_rate": 9.961115628625062e-05, + "loss": 0.2925, + "step": 7020 + }, + { + "epoch": 0.3124861092590123, + "grad_norm": 0.24968264997005463, + "learning_rate": 9.960857941487056e-05, + "loss": 0.293, + "step": 7030 + }, + { + "epoch": 0.3129306129706183, + "grad_norm": 0.20816880464553833, + "learning_rate": 9.960599406674106e-05, + "loss": 0.291, + "step": 7040 + }, + { + "epoch": 0.3133751166822243, + "grad_norm": 0.29066213965415955, + "learning_rate": 9.960340024230393e-05, + "loss": 0.2967, + "step": 7050 + }, + { + "epoch": 0.31381962039383027, + "grad_norm": 0.24801217019557953, + "learning_rate": 9.960079794200232e-05, + "loss": 0.2934, + "step": 7060 + }, + { + "epoch": 0.3142641241054363, + "grad_norm": 0.19691598415374756, + "learning_rate": 9.959818716628096e-05, + "loss": 0.2967, + "step": 7070 + }, + { + "epoch": 0.3147086278170423, + "grad_norm": 0.24273979663848877, + "learning_rate": 9.95955679155859e-05, + "loss": 0.2951, + "step": 7080 + }, + { + "epoch": 0.31515313152864827, + "grad_norm": 0.23824471235275269, + "learning_rate": 9.959294019036472e-05, + "loss": 0.2948, + "step": 7090 + }, + { + "epoch": 0.31559763524025425, + "grad_norm": 0.25608935952186584, + "learning_rate": 9.959030399106646e-05, + "loss": 0.2979, + "step": 7100 + }, + { + "epoch": 0.31604213895186023, + "grad_norm": 0.26114651560783386, + "learning_rate": 9.958765931814153e-05, + "loss": 0.2968, + "step": 7110 + }, + { + "epoch": 0.31648664266346627, + "grad_norm": 0.25164902210235596, + "learning_rate": 9.958500617204184e-05, + "loss": 0.2943, + "step": 7120 + }, + { + "epoch": 0.31693114637507225, + "grad_norm": 0.24660594761371613, + "learning_rate": 9.958234455322075e-05, + "loss": 0.2974, + "step": 7130 + }, + { + "epoch": 0.31737565008667823, + "grad_norm": 0.2313624769449234, + "learning_rate": 9.957967446213308e-05, + "loss": 0.3002, + "step": 7140 + }, + { + "epoch": 0.3178201537982842, + "grad_norm": 0.2545800507068634, + "learning_rate": 9.957699589923501e-05, + "loss": 0.2998, + "step": 7150 + }, + { + "epoch": 0.3182646575098902, + "grad_norm": 0.2486312836408615, + "learning_rate": 9.957430886498431e-05, + "loss": 0.2998, + "step": 7160 + }, + { + "epoch": 0.3187091612214962, + "grad_norm": 0.23389972746372223, + "learning_rate": 9.957161335984008e-05, + "loss": 0.296, + "step": 7170 + }, + { + "epoch": 0.3191536649331022, + "grad_norm": 0.2318969964981079, + "learning_rate": 9.956890938426291e-05, + "loss": 0.2976, + "step": 7180 + }, + { + "epoch": 0.3195981686447082, + "grad_norm": 0.29072049260139465, + "learning_rate": 9.956619693871482e-05, + "loss": 0.3011, + "step": 7190 + }, + { + "epoch": 0.3200426723563142, + "grad_norm": 0.2134065330028534, + "learning_rate": 9.956347602365934e-05, + "loss": 0.2943, + "step": 7200 + }, + { + "epoch": 0.32048717606792015, + "grad_norm": 0.2557726800441742, + "learning_rate": 9.956074663956135e-05, + "loss": 0.294, + "step": 7210 + }, + { + "epoch": 0.32093167977952614, + "grad_norm": 0.22199155390262604, + "learning_rate": 9.955800878688726e-05, + "loss": 0.2952, + "step": 7220 + }, + { + "epoch": 0.3213761834911322, + "grad_norm": 0.20092397928237915, + "learning_rate": 9.955526246610489e-05, + "loss": 0.2959, + "step": 7230 + }, + { + "epoch": 0.32182068720273815, + "grad_norm": 0.27788737416267395, + "learning_rate": 9.955250767768349e-05, + "loss": 0.294, + "step": 7240 + }, + { + "epoch": 0.32226519091434414, + "grad_norm": 0.24640432000160217, + "learning_rate": 9.95497444220938e-05, + "loss": 0.2944, + "step": 7250 + }, + { + "epoch": 0.3227096946259501, + "grad_norm": 0.27660974860191345, + "learning_rate": 9.954697269980797e-05, + "loss": 0.2974, + "step": 7260 + }, + { + "epoch": 0.3231541983375561, + "grad_norm": 0.2618093192577362, + "learning_rate": 9.954419251129962e-05, + "loss": 0.2948, + "step": 7270 + }, + { + "epoch": 0.32359870204916213, + "grad_norm": 0.22985273599624634, + "learning_rate": 9.95414038570438e-05, + "loss": 0.2975, + "step": 7280 + }, + { + "epoch": 0.3240432057607681, + "grad_norm": 0.2703131437301636, + "learning_rate": 9.953860673751703e-05, + "loss": 0.2978, + "step": 7290 + }, + { + "epoch": 0.3244877094723741, + "grad_norm": 0.21820689737796783, + "learning_rate": 9.953580115319725e-05, + "loss": 0.2986, + "step": 7300 + }, + { + "epoch": 0.3249322131839801, + "grad_norm": 0.2621048092842102, + "learning_rate": 9.953298710456387e-05, + "loss": 0.3037, + "step": 7310 + }, + { + "epoch": 0.32537671689558606, + "grad_norm": 0.27477186918258667, + "learning_rate": 9.953016459209771e-05, + "loss": 0.3004, + "step": 7320 + }, + { + "epoch": 0.3258212206071921, + "grad_norm": 0.2728326618671417, + "learning_rate": 9.952733361628108e-05, + "loss": 0.2999, + "step": 7330 + }, + { + "epoch": 0.3262657243187981, + "grad_norm": 0.240944504737854, + "learning_rate": 9.952449417759772e-05, + "loss": 0.2954, + "step": 7340 + }, + { + "epoch": 0.32671022803040406, + "grad_norm": 0.2495718002319336, + "learning_rate": 9.952164627653279e-05, + "loss": 0.2938, + "step": 7350 + }, + { + "epoch": 0.32715473174201004, + "grad_norm": 0.2506386935710907, + "learning_rate": 9.951878991357292e-05, + "loss": 0.2961, + "step": 7360 + }, + { + "epoch": 0.327599235453616, + "grad_norm": 0.2416956126689911, + "learning_rate": 9.951592508920622e-05, + "loss": 0.2962, + "step": 7370 + }, + { + "epoch": 0.328043739165222, + "grad_norm": 0.23689508438110352, + "learning_rate": 9.951305180392219e-05, + "loss": 0.2958, + "step": 7380 + }, + { + "epoch": 0.32848824287682804, + "grad_norm": 0.2626291811466217, + "learning_rate": 9.951017005821178e-05, + "loss": 0.2952, + "step": 7390 + }, + { + "epoch": 0.328932746588434, + "grad_norm": 0.23636841773986816, + "learning_rate": 9.95072798525674e-05, + "loss": 0.2939, + "step": 7400 + }, + { + "epoch": 0.32937725030004, + "grad_norm": 0.2797069251537323, + "learning_rate": 9.950438118748293e-05, + "loss": 0.295, + "step": 7410 + }, + { + "epoch": 0.329821754011646, + "grad_norm": 0.2300334870815277, + "learning_rate": 9.950147406345366e-05, + "loss": 0.2974, + "step": 7420 + }, + { + "epoch": 0.33026625772325197, + "grad_norm": 0.23128914833068848, + "learning_rate": 9.949855848097635e-05, + "loss": 0.2941, + "step": 7430 + }, + { + "epoch": 0.330710761434858, + "grad_norm": 0.2508425712585449, + "learning_rate": 9.949563444054916e-05, + "loss": 0.2972, + "step": 7440 + }, + { + "epoch": 0.331155265146464, + "grad_norm": 0.24694323539733887, + "learning_rate": 9.949270194267178e-05, + "loss": 0.2943, + "step": 7450 + }, + { + "epoch": 0.33159976885806997, + "grad_norm": 0.2541545331478119, + "learning_rate": 9.948976098784526e-05, + "loss": 0.2974, + "step": 7460 + }, + { + "epoch": 0.33204427256967595, + "grad_norm": 0.263493150472641, + "learning_rate": 9.948681157657213e-05, + "loss": 0.2953, + "step": 7470 + }, + { + "epoch": 0.33248877628128193, + "grad_norm": 0.231534942984581, + "learning_rate": 9.948385370935638e-05, + "loss": 0.2957, + "step": 7480 + }, + { + "epoch": 0.33293327999288796, + "grad_norm": 0.31382110714912415, + "learning_rate": 9.94808873867034e-05, + "loss": 0.2945, + "step": 7490 + }, + { + "epoch": 0.33337778370449395, + "grad_norm": 0.22553153336048126, + "learning_rate": 9.947791260912009e-05, + "loss": 0.297, + "step": 7500 + }, + { + "epoch": 0.3338222874160999, + "grad_norm": 0.23832003772258759, + "learning_rate": 9.947492937711474e-05, + "loss": 0.293, + "step": 7510 + }, + { + "epoch": 0.3342667911277059, + "grad_norm": 0.24727420508861542, + "learning_rate": 9.947193769119707e-05, + "loss": 0.2964, + "step": 7520 + }, + { + "epoch": 0.3347112948393119, + "grad_norm": 0.19346021115779877, + "learning_rate": 9.946893755187834e-05, + "loss": 0.2958, + "step": 7530 + }, + { + "epoch": 0.3351557985509179, + "grad_norm": 0.23962393403053284, + "learning_rate": 9.946592895967115e-05, + "loss": 0.2963, + "step": 7540 + }, + { + "epoch": 0.3356003022625239, + "grad_norm": 0.2524322271347046, + "learning_rate": 9.94629119150896e-05, + "loss": 0.2953, + "step": 7550 + }, + { + "epoch": 0.3360448059741299, + "grad_norm": 0.23499521613121033, + "learning_rate": 9.94598864186492e-05, + "loss": 0.2941, + "step": 7560 + }, + { + "epoch": 0.33648930968573587, + "grad_norm": 0.21963977813720703, + "learning_rate": 9.945685247086696e-05, + "loss": 0.2997, + "step": 7570 + }, + { + "epoch": 0.33693381339734185, + "grad_norm": 0.259088933467865, + "learning_rate": 9.945381007226129e-05, + "loss": 0.2952, + "step": 7580 + }, + { + "epoch": 0.33737831710894783, + "grad_norm": 0.24146702885627747, + "learning_rate": 9.945075922335203e-05, + "loss": 0.2961, + "step": 7590 + }, + { + "epoch": 0.33782282082055387, + "grad_norm": 0.22286154329776764, + "learning_rate": 9.944769992466049e-05, + "loss": 0.2953, + "step": 7600 + }, + { + "epoch": 0.33826732453215985, + "grad_norm": 0.21716991066932678, + "learning_rate": 9.944463217670945e-05, + "loss": 0.296, + "step": 7610 + }, + { + "epoch": 0.33871182824376583, + "grad_norm": 0.2481161206960678, + "learning_rate": 9.944155598002307e-05, + "loss": 0.295, + "step": 7620 + }, + { + "epoch": 0.3391563319553718, + "grad_norm": 0.23381270468235016, + "learning_rate": 9.943847133512701e-05, + "loss": 0.2964, + "step": 7630 + }, + { + "epoch": 0.3396008356669778, + "grad_norm": 0.2222212553024292, + "learning_rate": 9.943537824254834e-05, + "loss": 0.2982, + "step": 7640 + }, + { + "epoch": 0.34004533937858383, + "grad_norm": 0.259227454662323, + "learning_rate": 9.943227670281559e-05, + "loss": 0.3011, + "step": 7650 + }, + { + "epoch": 0.3404898430901898, + "grad_norm": 0.26512154936790466, + "learning_rate": 9.942916671645873e-05, + "loss": 0.2961, + "step": 7660 + }, + { + "epoch": 0.3409343468017958, + "grad_norm": 0.28011447191238403, + "learning_rate": 9.942604828400916e-05, + "loss": 0.2951, + "step": 7670 + }, + { + "epoch": 0.3413788505134018, + "grad_norm": 0.2377437949180603, + "learning_rate": 9.942292140599975e-05, + "loss": 0.2978, + "step": 7680 + }, + { + "epoch": 0.34182335422500776, + "grad_norm": 0.29051005840301514, + "learning_rate": 9.94197860829648e-05, + "loss": 0.2963, + "step": 7690 + }, + { + "epoch": 0.3422678579366138, + "grad_norm": 0.2663821876049042, + "learning_rate": 9.941664231544004e-05, + "loss": 0.2986, + "step": 7700 + }, + { + "epoch": 0.3427123616482198, + "grad_norm": 0.2519840598106384, + "learning_rate": 9.941349010396264e-05, + "loss": 0.2973, + "step": 7710 + }, + { + "epoch": 0.34315686535982576, + "grad_norm": 0.2216961830854416, + "learning_rate": 9.941032944907125e-05, + "loss": 0.2961, + "step": 7720 + }, + { + "epoch": 0.34360136907143174, + "grad_norm": 0.23641309142112732, + "learning_rate": 9.940716035130596e-05, + "loss": 0.295, + "step": 7730 + }, + { + "epoch": 0.3440458727830377, + "grad_norm": 0.23110248148441315, + "learning_rate": 9.940398281120821e-05, + "loss": 0.2993, + "step": 7740 + }, + { + "epoch": 0.34449037649464376, + "grad_norm": 0.22104412317276, + "learning_rate": 9.940079682932102e-05, + "loss": 0.2959, + "step": 7750 + }, + { + "epoch": 0.34493488020624974, + "grad_norm": 0.26124686002731323, + "learning_rate": 9.939760240618877e-05, + "loss": 0.2972, + "step": 7760 + }, + { + "epoch": 0.3453793839178557, + "grad_norm": 0.20993158221244812, + "learning_rate": 9.939439954235729e-05, + "loss": 0.2945, + "step": 7770 + }, + { + "epoch": 0.3458238876294617, + "grad_norm": 0.21574686467647552, + "learning_rate": 9.939118823837387e-05, + "loss": 0.2962, + "step": 7780 + }, + { + "epoch": 0.3462683913410677, + "grad_norm": 0.21633441746234894, + "learning_rate": 9.938796849478725e-05, + "loss": 0.293, + "step": 7790 + }, + { + "epoch": 0.34671289505267366, + "grad_norm": 0.2170843482017517, + "learning_rate": 9.938474031214755e-05, + "loss": 0.2974, + "step": 7800 + }, + { + "epoch": 0.3471573987642797, + "grad_norm": 0.2217618077993393, + "learning_rate": 9.938150369100643e-05, + "loss": 0.2963, + "step": 7810 + }, + { + "epoch": 0.3476019024758857, + "grad_norm": 0.23633089661598206, + "learning_rate": 9.93782586319169e-05, + "loss": 0.2969, + "step": 7820 + }, + { + "epoch": 0.34804640618749166, + "grad_norm": 0.23970630764961243, + "learning_rate": 9.937500513543348e-05, + "loss": 0.2994, + "step": 7830 + }, + { + "epoch": 0.34849090989909765, + "grad_norm": 0.24662673473358154, + "learning_rate": 9.937174320211207e-05, + "loss": 0.2952, + "step": 7840 + }, + { + "epoch": 0.3489354136107036, + "grad_norm": 0.24938321113586426, + "learning_rate": 9.936847283251009e-05, + "loss": 0.2967, + "step": 7850 + }, + { + "epoch": 0.34937991732230966, + "grad_norm": 0.2461121678352356, + "learning_rate": 9.936519402718632e-05, + "loss": 0.2952, + "step": 7860 + }, + { + "epoch": 0.34982442103391564, + "grad_norm": 0.2338074892759323, + "learning_rate": 9.936190678670102e-05, + "loss": 0.2949, + "step": 7870 + }, + { + "epoch": 0.3502689247455216, + "grad_norm": 0.25867515802383423, + "learning_rate": 9.935861111161593e-05, + "loss": 0.2958, + "step": 7880 + }, + { + "epoch": 0.3507134284571276, + "grad_norm": 0.25486722588539124, + "learning_rate": 9.935530700249416e-05, + "loss": 0.2956, + "step": 7890 + }, + { + "epoch": 0.3511579321687336, + "grad_norm": 0.25938931107521057, + "learning_rate": 9.935199445990028e-05, + "loss": 0.2994, + "step": 7900 + }, + { + "epoch": 0.3516024358803396, + "grad_norm": 0.24503326416015625, + "learning_rate": 9.934867348440033e-05, + "loss": 0.2972, + "step": 7910 + }, + { + "epoch": 0.3520469395919456, + "grad_norm": 0.23161081969738007, + "learning_rate": 9.934534407656176e-05, + "loss": 0.2979, + "step": 7920 + }, + { + "epoch": 0.3524914433035516, + "grad_norm": 0.2222600132226944, + "learning_rate": 9.93420062369535e-05, + "loss": 0.2974, + "step": 7930 + }, + { + "epoch": 0.35293594701515757, + "grad_norm": 0.23026257753372192, + "learning_rate": 9.933865996614589e-05, + "loss": 0.2961, + "step": 7940 + }, + { + "epoch": 0.35338045072676355, + "grad_norm": 0.20446012914180756, + "learning_rate": 9.933530526471068e-05, + "loss": 0.3003, + "step": 7950 + }, + { + "epoch": 0.3538249544383696, + "grad_norm": 0.25517839193344116, + "learning_rate": 9.933194213322114e-05, + "loss": 0.2981, + "step": 7960 + }, + { + "epoch": 0.35426945814997557, + "grad_norm": 0.23717305064201355, + "learning_rate": 9.932857057225192e-05, + "loss": 0.2952, + "step": 7970 + }, + { + "epoch": 0.35471396186158155, + "grad_norm": 0.2518792450428009, + "learning_rate": 9.932519058237912e-05, + "loss": 0.2984, + "step": 7980 + }, + { + "epoch": 0.35515846557318753, + "grad_norm": 0.2209206074476242, + "learning_rate": 9.932180216418032e-05, + "loss": 0.2993, + "step": 7990 + }, + { + "epoch": 0.3556029692847935, + "grad_norm": 0.22813130915164948, + "learning_rate": 9.931840531823446e-05, + "loss": 0.293, + "step": 8000 + }, + { + "epoch": 0.3560474729963995, + "grad_norm": 0.26665934920310974, + "learning_rate": 9.9315000045122e-05, + "loss": 0.2984, + "step": 8010 + }, + { + "epoch": 0.35649197670800553, + "grad_norm": 0.21717184782028198, + "learning_rate": 9.931158634542481e-05, + "loss": 0.2974, + "step": 8020 + }, + { + "epoch": 0.3569364804196115, + "grad_norm": 0.2562563419342041, + "learning_rate": 9.930816421972617e-05, + "loss": 0.2959, + "step": 8030 + }, + { + "epoch": 0.3573809841312175, + "grad_norm": 0.2261233627796173, + "learning_rate": 9.930473366861086e-05, + "loss": 0.295, + "step": 8040 + }, + { + "epoch": 0.3578254878428235, + "grad_norm": 0.24868839979171753, + "learning_rate": 9.930129469266505e-05, + "loss": 0.2951, + "step": 8050 + }, + { + "epoch": 0.35826999155442946, + "grad_norm": 0.26825541257858276, + "learning_rate": 9.929784729247638e-05, + "loss": 0.2956, + "step": 8060 + }, + { + "epoch": 0.3587144952660355, + "grad_norm": 0.24683429300785065, + "learning_rate": 9.929439146863389e-05, + "loss": 0.2945, + "step": 8070 + }, + { + "epoch": 0.3591589989776415, + "grad_norm": 0.26059144735336304, + "learning_rate": 9.92909272217281e-05, + "loss": 0.2931, + "step": 8080 + }, + { + "epoch": 0.35960350268924746, + "grad_norm": 0.27940186858177185, + "learning_rate": 9.928745455235097e-05, + "loss": 0.2938, + "step": 8090 + }, + { + "epoch": 0.36004800640085344, + "grad_norm": 0.20798762142658234, + "learning_rate": 9.928397346109588e-05, + "loss": 0.2924, + "step": 8100 + }, + { + "epoch": 0.3604925101124594, + "grad_norm": 0.2080305963754654, + "learning_rate": 9.928048394855762e-05, + "loss": 0.2941, + "step": 8110 + }, + { + "epoch": 0.36093701382406546, + "grad_norm": 0.22089770436286926, + "learning_rate": 9.92769860153325e-05, + "loss": 0.2922, + "step": 8120 + }, + { + "epoch": 0.36138151753567144, + "grad_norm": 0.19879615306854248, + "learning_rate": 9.927347966201819e-05, + "loss": 0.2902, + "step": 8130 + }, + { + "epoch": 0.3618260212472774, + "grad_norm": 0.23493602871894836, + "learning_rate": 9.926996488921383e-05, + "loss": 0.2951, + "step": 8140 + }, + { + "epoch": 0.3622705249588834, + "grad_norm": 0.2691687047481537, + "learning_rate": 9.926644169752001e-05, + "loss": 0.2952, + "step": 8150 + }, + { + "epoch": 0.3627150286704894, + "grad_norm": 0.26538777351379395, + "learning_rate": 9.926291008753875e-05, + "loss": 0.2936, + "step": 8160 + }, + { + "epoch": 0.3631595323820954, + "grad_norm": 0.2512625753879547, + "learning_rate": 9.92593700598735e-05, + "loss": 0.2938, + "step": 8170 + }, + { + "epoch": 0.3636040360937014, + "grad_norm": 0.25277674198150635, + "learning_rate": 9.925582161512915e-05, + "loss": 0.2942, + "step": 8180 + }, + { + "epoch": 0.3640485398053074, + "grad_norm": 0.21416813135147095, + "learning_rate": 9.925226475391205e-05, + "loss": 0.2949, + "step": 8190 + }, + { + "epoch": 0.36449304351691336, + "grad_norm": 0.23263727128505707, + "learning_rate": 9.924869947682993e-05, + "loss": 0.2933, + "step": 8200 + }, + { + "epoch": 0.36493754722851934, + "grad_norm": 0.23450766503810883, + "learning_rate": 9.924512578449204e-05, + "loss": 0.2931, + "step": 8210 + }, + { + "epoch": 0.3653820509401253, + "grad_norm": 0.24066881835460663, + "learning_rate": 9.924154367750901e-05, + "loss": 0.2947, + "step": 8220 + }, + { + "epoch": 0.36582655465173136, + "grad_norm": 0.24345733225345612, + "learning_rate": 9.923795315649293e-05, + "loss": 0.2955, + "step": 8230 + }, + { + "epoch": 0.36627105836333734, + "grad_norm": 0.3156566619873047, + "learning_rate": 9.92343542220573e-05, + "loss": 0.2893, + "step": 8240 + }, + { + "epoch": 0.3667155620749433, + "grad_norm": 0.25202012062072754, + "learning_rate": 9.92307468748171e-05, + "loss": 0.2949, + "step": 8250 + }, + { + "epoch": 0.3671600657865493, + "grad_norm": 0.2266129106283188, + "learning_rate": 9.922713111538873e-05, + "loss": 0.2929, + "step": 8260 + }, + { + "epoch": 0.3676045694981553, + "grad_norm": 0.23394598066806793, + "learning_rate": 9.922350694439003e-05, + "loss": 0.3005, + "step": 8270 + }, + { + "epoch": 0.3680490732097613, + "grad_norm": 0.2203267216682434, + "learning_rate": 9.921987436244024e-05, + "loss": 0.2966, + "step": 8280 + }, + { + "epoch": 0.3684935769213673, + "grad_norm": 0.25210192799568176, + "learning_rate": 9.921623337016008e-05, + "loss": 0.297, + "step": 8290 + }, + { + "epoch": 0.3689380806329733, + "grad_norm": 0.20324492454528809, + "learning_rate": 9.921258396817172e-05, + "loss": 0.2928, + "step": 8300 + }, + { + "epoch": 0.36938258434457927, + "grad_norm": 0.225251704454422, + "learning_rate": 9.920892615709874e-05, + "loss": 0.2978, + "step": 8310 + }, + { + "epoch": 0.36982708805618525, + "grad_norm": 0.2769891023635864, + "learning_rate": 9.920525993756612e-05, + "loss": 0.2951, + "step": 8320 + }, + { + "epoch": 0.3702715917677913, + "grad_norm": 0.24811118841171265, + "learning_rate": 9.920158531020036e-05, + "loss": 0.2969, + "step": 8330 + }, + { + "epoch": 0.37071609547939727, + "grad_norm": 0.2626021206378937, + "learning_rate": 9.919790227562933e-05, + "loss": 0.2969, + "step": 8340 + }, + { + "epoch": 0.37116059919100325, + "grad_norm": 0.24622151255607605, + "learning_rate": 9.919421083448237e-05, + "loss": 0.2952, + "step": 8350 + }, + { + "epoch": 0.37160510290260923, + "grad_norm": 0.2683497369289398, + "learning_rate": 9.919051098739022e-05, + "loss": 0.2933, + "step": 8360 + }, + { + "epoch": 0.3720496066142152, + "grad_norm": 0.24660643935203552, + "learning_rate": 9.918680273498514e-05, + "loss": 0.2943, + "step": 8370 + }, + { + "epoch": 0.37249411032582125, + "grad_norm": 0.2116711586713791, + "learning_rate": 9.918308607790072e-05, + "loss": 0.2929, + "step": 8380 + }, + { + "epoch": 0.37293861403742723, + "grad_norm": 0.2306990772485733, + "learning_rate": 9.917936101677205e-05, + "loss": 0.2943, + "step": 8390 + }, + { + "epoch": 0.3733831177490332, + "grad_norm": 0.23810146749019623, + "learning_rate": 9.917562755223564e-05, + "loss": 0.2952, + "step": 8400 + }, + { + "epoch": 0.3738276214606392, + "grad_norm": 0.21044881641864777, + "learning_rate": 9.917188568492944e-05, + "loss": 0.2939, + "step": 8410 + }, + { + "epoch": 0.3742721251722452, + "grad_norm": 0.26659658551216125, + "learning_rate": 9.916813541549283e-05, + "loss": 0.295, + "step": 8420 + }, + { + "epoch": 0.37471662888385115, + "grad_norm": 0.2812531590461731, + "learning_rate": 9.916437674456663e-05, + "loss": 0.2941, + "step": 8430 + }, + { + "epoch": 0.3751611325954572, + "grad_norm": 0.2539752721786499, + "learning_rate": 9.916060967279308e-05, + "loss": 0.2955, + "step": 8440 + }, + { + "epoch": 0.3756056363070632, + "grad_norm": 0.28012770414352417, + "learning_rate": 9.91568342008159e-05, + "loss": 0.2952, + "step": 8450 + }, + { + "epoch": 0.37605014001866915, + "grad_norm": 0.25735580921173096, + "learning_rate": 9.915305032928019e-05, + "loss": 0.2937, + "step": 8460 + }, + { + "epoch": 0.37649464373027514, + "grad_norm": 0.2246946543455124, + "learning_rate": 9.914925805883253e-05, + "loss": 0.2928, + "step": 8470 + }, + { + "epoch": 0.3769391474418811, + "grad_norm": 0.2579336166381836, + "learning_rate": 9.914545739012088e-05, + "loss": 0.2978, + "step": 8480 + }, + { + "epoch": 0.37738365115348715, + "grad_norm": 0.2855905294418335, + "learning_rate": 9.91416483237947e-05, + "loss": 0.2975, + "step": 8490 + }, + { + "epoch": 0.37782815486509314, + "grad_norm": 0.23888668417930603, + "learning_rate": 9.913783086050485e-05, + "loss": 0.2945, + "step": 8500 + }, + { + "epoch": 0.3782726585766991, + "grad_norm": 0.2260417491197586, + "learning_rate": 9.913400500090364e-05, + "loss": 0.293, + "step": 8510 + }, + { + "epoch": 0.3787171622883051, + "grad_norm": 0.24287539720535278, + "learning_rate": 9.913017074564479e-05, + "loss": 0.2957, + "step": 8520 + }, + { + "epoch": 0.3791616659999111, + "grad_norm": 0.1923934370279312, + "learning_rate": 9.912632809538348e-05, + "loss": 0.2936, + "step": 8530 + }, + { + "epoch": 0.3796061697115171, + "grad_norm": 0.20934845507144928, + "learning_rate": 9.912247705077629e-05, + "loss": 0.2995, + "step": 8540 + }, + { + "epoch": 0.3800506734231231, + "grad_norm": 0.2503669261932373, + "learning_rate": 9.911861761248127e-05, + "loss": 0.2932, + "step": 8550 + }, + { + "epoch": 0.3804951771347291, + "grad_norm": 0.20755788683891296, + "learning_rate": 9.91147497811579e-05, + "loss": 0.2964, + "step": 8560 + }, + { + "epoch": 0.38093968084633506, + "grad_norm": 0.28797653317451477, + "learning_rate": 9.911087355746709e-05, + "loss": 0.2945, + "step": 8570 + }, + { + "epoch": 0.38138418455794104, + "grad_norm": 0.25434765219688416, + "learning_rate": 9.910698894207117e-05, + "loss": 0.2935, + "step": 8580 + }, + { + "epoch": 0.381828688269547, + "grad_norm": 0.22537842392921448, + "learning_rate": 9.910309593563392e-05, + "loss": 0.292, + "step": 8590 + }, + { + "epoch": 0.38227319198115306, + "grad_norm": 0.24856440722942352, + "learning_rate": 9.909919453882057e-05, + "loss": 0.2956, + "step": 8600 + }, + { + "epoch": 0.38271769569275904, + "grad_norm": 0.21269504725933075, + "learning_rate": 9.90952847522977e-05, + "loss": 0.2967, + "step": 8610 + }, + { + "epoch": 0.383162199404365, + "grad_norm": 0.235601544380188, + "learning_rate": 9.909136657673346e-05, + "loss": 0.297, + "step": 8620 + }, + { + "epoch": 0.383606703115971, + "grad_norm": 0.2588299512863159, + "learning_rate": 9.908744001279731e-05, + "loss": 0.2945, + "step": 8630 + }, + { + "epoch": 0.384051206827577, + "grad_norm": 0.2450632005929947, + "learning_rate": 9.90835050611602e-05, + "loss": 0.2927, + "step": 8640 + }, + { + "epoch": 0.384495710539183, + "grad_norm": 0.23928236961364746, + "learning_rate": 9.90795617224945e-05, + "loss": 0.2949, + "step": 8650 + }, + { + "epoch": 0.384940214250789, + "grad_norm": 0.22600045800209045, + "learning_rate": 9.907560999747405e-05, + "loss": 0.2955, + "step": 8660 + }, + { + "epoch": 0.385384717962395, + "grad_norm": 0.24144971370697021, + "learning_rate": 9.907164988677408e-05, + "loss": 0.2947, + "step": 8670 + }, + { + "epoch": 0.38582922167400097, + "grad_norm": 0.23996804654598236, + "learning_rate": 9.906768139107124e-05, + "loss": 0.2945, + "step": 8680 + }, + { + "epoch": 0.38627372538560695, + "grad_norm": 0.2525530457496643, + "learning_rate": 9.906370451104367e-05, + "loss": 0.2963, + "step": 8690 + }, + { + "epoch": 0.386718229097213, + "grad_norm": 0.21789361536502838, + "learning_rate": 9.905971924737088e-05, + "loss": 0.2981, + "step": 8700 + }, + { + "epoch": 0.38716273280881897, + "grad_norm": 0.2527002990245819, + "learning_rate": 9.905572560073387e-05, + "loss": 0.2935, + "step": 8710 + }, + { + "epoch": 0.38760723652042495, + "grad_norm": 0.24655990302562714, + "learning_rate": 9.905172357181501e-05, + "loss": 0.2929, + "step": 8720 + }, + { + "epoch": 0.38805174023203093, + "grad_norm": 0.20771031081676483, + "learning_rate": 9.904771316129817e-05, + "loss": 0.2953, + "step": 8730 + }, + { + "epoch": 0.3884962439436369, + "grad_norm": 0.23272596299648285, + "learning_rate": 9.904369436986862e-05, + "loss": 0.2991, + "step": 8740 + }, + { + "epoch": 0.38894074765524295, + "grad_norm": 0.2604868412017822, + "learning_rate": 9.903966719821303e-05, + "loss": 0.2967, + "step": 8750 + }, + { + "epoch": 0.38938525136684893, + "grad_norm": 0.2639451026916504, + "learning_rate": 9.903563164701956e-05, + "loss": 0.2949, + "step": 8760 + }, + { + "epoch": 0.3898297550784549, + "grad_norm": 0.22899667918682098, + "learning_rate": 9.903158771697778e-05, + "loss": 0.2927, + "step": 8770 + }, + { + "epoch": 0.3902742587900609, + "grad_norm": 0.2858216166496277, + "learning_rate": 9.902753540877867e-05, + "loss": 0.2981, + "step": 8780 + }, + { + "epoch": 0.39071876250166687, + "grad_norm": 0.23586192727088928, + "learning_rate": 9.902347472311466e-05, + "loss": 0.2939, + "step": 8790 + }, + { + "epoch": 0.39116326621327285, + "grad_norm": 0.21068041026592255, + "learning_rate": 9.901940566067962e-05, + "loss": 0.2949, + "step": 8800 + }, + { + "epoch": 0.3916077699248789, + "grad_norm": 0.2214769423007965, + "learning_rate": 9.901532822216883e-05, + "loss": 0.2959, + "step": 8810 + }, + { + "epoch": 0.39205227363648487, + "grad_norm": 0.23349007964134216, + "learning_rate": 9.901124240827904e-05, + "loss": 0.2946, + "step": 8820 + }, + { + "epoch": 0.39249677734809085, + "grad_norm": 0.23428601026535034, + "learning_rate": 9.900714821970835e-05, + "loss": 0.2956, + "step": 8830 + }, + { + "epoch": 0.39294128105969683, + "grad_norm": 0.2550523579120636, + "learning_rate": 9.900304565715641e-05, + "loss": 0.2909, + "step": 8840 + }, + { + "epoch": 0.3933857847713028, + "grad_norm": 0.22863608598709106, + "learning_rate": 9.899893472132419e-05, + "loss": 0.2923, + "step": 8850 + }, + { + "epoch": 0.39383028848290885, + "grad_norm": 0.2232176810503006, + "learning_rate": 9.899481541291415e-05, + "loss": 0.2936, + "step": 8860 + }, + { + "epoch": 0.39427479219451483, + "grad_norm": 0.23931196331977844, + "learning_rate": 9.899068773263016e-05, + "loss": 0.2938, + "step": 8870 + }, + { + "epoch": 0.3947192959061208, + "grad_norm": 0.20907829701900482, + "learning_rate": 9.898655168117754e-05, + "loss": 0.2938, + "step": 8880 + }, + { + "epoch": 0.3951637996177268, + "grad_norm": 0.20787157118320465, + "learning_rate": 9.898240725926302e-05, + "loss": 0.2935, + "step": 8890 + }, + { + "epoch": 0.3956083033293328, + "grad_norm": 0.24575477838516235, + "learning_rate": 9.897825446759478e-05, + "loss": 0.2993, + "step": 8900 + }, + { + "epoch": 0.3960528070409388, + "grad_norm": 0.23241446912288666, + "learning_rate": 9.897409330688241e-05, + "loss": 0.2927, + "step": 8910 + }, + { + "epoch": 0.3964973107525448, + "grad_norm": 0.24330967664718628, + "learning_rate": 9.896992377783692e-05, + "loss": 0.2978, + "step": 8920 + }, + { + "epoch": 0.3969418144641508, + "grad_norm": 0.20007412135601044, + "learning_rate": 9.89657458811708e-05, + "loss": 0.2952, + "step": 8930 + }, + { + "epoch": 0.39738631817575676, + "grad_norm": 0.22073306143283844, + "learning_rate": 9.896155961759792e-05, + "loss": 0.2951, + "step": 8940 + }, + { + "epoch": 0.39783082188736274, + "grad_norm": 0.2226456105709076, + "learning_rate": 9.895736498783361e-05, + "loss": 0.2958, + "step": 8950 + }, + { + "epoch": 0.3982753255989688, + "grad_norm": 0.24782058596611023, + "learning_rate": 9.895316199259462e-05, + "loss": 0.2993, + "step": 8960 + }, + { + "epoch": 0.39871982931057476, + "grad_norm": 0.2122953236103058, + "learning_rate": 9.894895063259909e-05, + "loss": 0.2973, + "step": 8970 + }, + { + "epoch": 0.39916433302218074, + "grad_norm": 0.22516083717346191, + "learning_rate": 9.894473090856667e-05, + "loss": 0.2973, + "step": 8980 + }, + { + "epoch": 0.3996088367337867, + "grad_norm": 0.26836472749710083, + "learning_rate": 9.894050282121839e-05, + "loss": 0.2977, + "step": 8990 + }, + { + "epoch": 0.4000533404453927, + "grad_norm": 0.2501492202281952, + "learning_rate": 9.893626637127668e-05, + "loss": 0.297, + "step": 9000 + }, + { + "epoch": 0.4004978441569987, + "grad_norm": 0.2499515414237976, + "learning_rate": 9.893202155946546e-05, + "loss": 0.2948, + "step": 9010 + }, + { + "epoch": 0.4009423478686047, + "grad_norm": 0.2501410245895386, + "learning_rate": 9.892776838651006e-05, + "loss": 0.2931, + "step": 9020 + }, + { + "epoch": 0.4013868515802107, + "grad_norm": 0.25628289580345154, + "learning_rate": 9.892350685313722e-05, + "loss": 0.2966, + "step": 9030 + }, + { + "epoch": 0.4018313552918167, + "grad_norm": 0.23378407955169678, + "learning_rate": 9.891923696007513e-05, + "loss": 0.2915, + "step": 9040 + }, + { + "epoch": 0.40227585900342266, + "grad_norm": 0.2373848706483841, + "learning_rate": 9.891495870805336e-05, + "loss": 0.2935, + "step": 9050 + }, + { + "epoch": 0.40272036271502865, + "grad_norm": 0.2539753317832947, + "learning_rate": 9.891067209780298e-05, + "loss": 0.2937, + "step": 9060 + }, + { + "epoch": 0.4031648664266347, + "grad_norm": 0.27249622344970703, + "learning_rate": 9.890637713005646e-05, + "loss": 0.2942, + "step": 9070 + }, + { + "epoch": 0.40360937013824066, + "grad_norm": 0.263069748878479, + "learning_rate": 9.890207380554767e-05, + "loss": 0.3001, + "step": 9080 + }, + { + "epoch": 0.40405387384984665, + "grad_norm": 0.23047958314418793, + "learning_rate": 9.889776212501196e-05, + "loss": 0.3045, + "step": 9090 + }, + { + "epoch": 0.4044983775614526, + "grad_norm": 0.2829137146472931, + "learning_rate": 9.889344208918605e-05, + "loss": 0.3004, + "step": 9100 + }, + { + "epoch": 0.4049428812730586, + "grad_norm": 0.24912528693675995, + "learning_rate": 9.888911369880812e-05, + "loss": 0.2985, + "step": 9110 + }, + { + "epoch": 0.40538738498466464, + "grad_norm": 0.22682690620422363, + "learning_rate": 9.888477695461777e-05, + "loss": 0.299, + "step": 9120 + }, + { + "epoch": 0.4058318886962706, + "grad_norm": 0.2826825976371765, + "learning_rate": 9.888043185735607e-05, + "loss": 0.2959, + "step": 9130 + }, + { + "epoch": 0.4062763924078766, + "grad_norm": 0.26070117950439453, + "learning_rate": 9.887607840776542e-05, + "loss": 0.2956, + "step": 9140 + }, + { + "epoch": 0.4067208961194826, + "grad_norm": 0.2455427199602127, + "learning_rate": 9.887171660658975e-05, + "loss": 0.2985, + "step": 9150 + }, + { + "epoch": 0.40716539983108857, + "grad_norm": 0.261774480342865, + "learning_rate": 9.886734645457435e-05, + "loss": 0.2944, + "step": 9160 + }, + { + "epoch": 0.4076099035426946, + "grad_norm": 0.2588505744934082, + "learning_rate": 9.886296795246597e-05, + "loss": 0.2952, + "step": 9170 + }, + { + "epoch": 0.4080544072543006, + "grad_norm": 0.4580049514770508, + "learning_rate": 9.885858110101276e-05, + "loss": 0.2979, + "step": 9180 + }, + { + "epoch": 0.40849891096590657, + "grad_norm": 0.22897885739803314, + "learning_rate": 9.885418590096434e-05, + "loss": 0.2959, + "step": 9190 + }, + { + "epoch": 0.40894341467751255, + "grad_norm": 0.2352449744939804, + "learning_rate": 9.88497823530717e-05, + "loss": 0.2983, + "step": 9200 + }, + { + "epoch": 0.40938791838911853, + "grad_norm": 0.2370125949382782, + "learning_rate": 9.884537045808732e-05, + "loss": 0.2965, + "step": 9210 + }, + { + "epoch": 0.4098324221007245, + "grad_norm": 0.24353675544261932, + "learning_rate": 9.884095021676502e-05, + "loss": 0.301, + "step": 9220 + }, + { + "epoch": 0.41027692581233055, + "grad_norm": 0.21786464750766754, + "learning_rate": 9.883652162986017e-05, + "loss": 0.299, + "step": 9230 + }, + { + "epoch": 0.41072142952393653, + "grad_norm": 0.2282354235649109, + "learning_rate": 9.883208469812943e-05, + "loss": 0.2955, + "step": 9240 + }, + { + "epoch": 0.4111659332355425, + "grad_norm": 0.23842491209506989, + "learning_rate": 9.882763942233098e-05, + "loss": 0.2967, + "step": 9250 + }, + { + "epoch": 0.4116104369471485, + "grad_norm": 0.221390500664711, + "learning_rate": 9.882318580322441e-05, + "loss": 0.2936, + "step": 9260 + }, + { + "epoch": 0.4120549406587545, + "grad_norm": 0.1764024943113327, + "learning_rate": 9.881872384157067e-05, + "loss": 0.2913, + "step": 9270 + }, + { + "epoch": 0.4124994443703605, + "grad_norm": 0.2109171599149704, + "learning_rate": 9.881425353813225e-05, + "loss": 0.2936, + "step": 9280 + }, + { + "epoch": 0.4129439480819665, + "grad_norm": 0.23425090312957764, + "learning_rate": 9.880977489367296e-05, + "loss": 0.2933, + "step": 9290 + }, + { + "epoch": 0.4133884517935725, + "grad_norm": 0.28033486008644104, + "learning_rate": 9.88052879089581e-05, + "loss": 0.2959, + "step": 9300 + }, + { + "epoch": 0.41383295550517846, + "grad_norm": 0.25915855169296265, + "learning_rate": 9.880079258475434e-05, + "loss": 0.2953, + "step": 9310 + }, + { + "epoch": 0.41427745921678444, + "grad_norm": 0.19738055765628815, + "learning_rate": 9.879628892182985e-05, + "loss": 0.2948, + "step": 9320 + }, + { + "epoch": 0.4147219629283905, + "grad_norm": 0.28652581572532654, + "learning_rate": 9.879177692095416e-05, + "loss": 0.2949, + "step": 9330 + }, + { + "epoch": 0.41516646663999646, + "grad_norm": 0.274352490901947, + "learning_rate": 9.878725658289825e-05, + "loss": 0.2935, + "step": 9340 + }, + { + "epoch": 0.41561097035160244, + "grad_norm": 0.2672279179096222, + "learning_rate": 9.878272790843454e-05, + "loss": 0.2919, + "step": 9350 + }, + { + "epoch": 0.4160554740632084, + "grad_norm": 0.23173914849758148, + "learning_rate": 9.877819089833682e-05, + "loss": 0.297, + "step": 9360 + }, + { + "epoch": 0.4164999777748144, + "grad_norm": 0.2163747400045395, + "learning_rate": 9.877364555338038e-05, + "loss": 0.2969, + "step": 9370 + }, + { + "epoch": 0.41694448148642044, + "grad_norm": 0.2859513461589813, + "learning_rate": 9.876909187434186e-05, + "loss": 0.2979, + "step": 9380 + }, + { + "epoch": 0.4173889851980264, + "grad_norm": 0.25320783257484436, + "learning_rate": 9.876452986199939e-05, + "loss": 0.295, + "step": 9390 + }, + { + "epoch": 0.4178334889096324, + "grad_norm": 0.22319287061691284, + "learning_rate": 9.875995951713248e-05, + "loss": 0.2942, + "step": 9400 + }, + { + "epoch": 0.4182779926212384, + "grad_norm": 0.19482530653476715, + "learning_rate": 9.875538084052207e-05, + "loss": 0.2936, + "step": 9410 + }, + { + "epoch": 0.41872249633284436, + "grad_norm": 0.23366771638393402, + "learning_rate": 9.875079383295053e-05, + "loss": 0.2975, + "step": 9420 + }, + { + "epoch": 0.41916700004445034, + "grad_norm": 0.23778359591960907, + "learning_rate": 9.874619849520167e-05, + "loss": 0.2971, + "step": 9430 + }, + { + "epoch": 0.4196115037560564, + "grad_norm": 0.31569644808769226, + "learning_rate": 9.874159482806069e-05, + "loss": 0.2965, + "step": 9440 + }, + { + "epoch": 0.42005600746766236, + "grad_norm": 0.2657634913921356, + "learning_rate": 9.873698283231426e-05, + "loss": 0.2952, + "step": 9450 + }, + { + "epoch": 0.42050051117926834, + "grad_norm": 0.24504265189170837, + "learning_rate": 9.87323625087504e-05, + "loss": 0.2934, + "step": 9460 + }, + { + "epoch": 0.4209450148908743, + "grad_norm": 0.2258148491382599, + "learning_rate": 9.872773385815863e-05, + "loss": 0.2955, + "step": 9470 + }, + { + "epoch": 0.4213895186024803, + "grad_norm": 0.2545129358768463, + "learning_rate": 9.872309688132986e-05, + "loss": 0.2965, + "step": 9480 + }, + { + "epoch": 0.42183402231408634, + "grad_norm": 0.25034403800964355, + "learning_rate": 9.871845157905639e-05, + "loss": 0.299, + "step": 9490 + }, + { + "epoch": 0.4222785260256923, + "grad_norm": 0.25880762934684753, + "learning_rate": 9.871379795213201e-05, + "loss": 0.2962, + "step": 9500 + }, + { + "epoch": 0.4227230297372983, + "grad_norm": 0.2395322471857071, + "learning_rate": 9.87091360013519e-05, + "loss": 0.2933, + "step": 9510 + }, + { + "epoch": 0.4231675334489043, + "grad_norm": 0.24164320528507233, + "learning_rate": 9.870446572751262e-05, + "loss": 0.2966, + "step": 9520 + }, + { + "epoch": 0.42361203716051027, + "grad_norm": 0.2059490978717804, + "learning_rate": 9.869978713141224e-05, + "loss": 0.2951, + "step": 9530 + }, + { + "epoch": 0.4240565408721163, + "grad_norm": 0.22260276973247528, + "learning_rate": 9.869510021385016e-05, + "loss": 0.2931, + "step": 9540 + }, + { + "epoch": 0.4245010445837223, + "grad_norm": 0.2690472900867462, + "learning_rate": 9.869040497562727e-05, + "loss": 0.2953, + "step": 9550 + }, + { + "epoch": 0.42494554829532827, + "grad_norm": 0.22747136652469635, + "learning_rate": 9.868570141754587e-05, + "loss": 0.2925, + "step": 9560 + }, + { + "epoch": 0.42539005200693425, + "grad_norm": 0.2005576193332672, + "learning_rate": 9.868098954040965e-05, + "loss": 0.2933, + "step": 9570 + }, + { + "epoch": 0.42583455571854023, + "grad_norm": 0.2108849287033081, + "learning_rate": 9.867626934502374e-05, + "loss": 0.2944, + "step": 9580 + }, + { + "epoch": 0.42627905943014627, + "grad_norm": 0.18030965328216553, + "learning_rate": 9.86715408321947e-05, + "loss": 0.2953, + "step": 9590 + }, + { + "epoch": 0.42672356314175225, + "grad_norm": 0.24452196061611176, + "learning_rate": 9.86668040027305e-05, + "loss": 0.2961, + "step": 9600 + }, + { + "epoch": 0.42716806685335823, + "grad_norm": 0.2056668996810913, + "learning_rate": 9.866205885744053e-05, + "loss": 0.2982, + "step": 9610 + }, + { + "epoch": 0.4276125705649642, + "grad_norm": 0.266294002532959, + "learning_rate": 9.865730539713563e-05, + "loss": 0.2959, + "step": 9620 + }, + { + "epoch": 0.4280570742765702, + "grad_norm": 0.2938980758190155, + "learning_rate": 9.8652543622628e-05, + "loss": 0.2921, + "step": 9630 + }, + { + "epoch": 0.4285015779881762, + "grad_norm": 0.2142695188522339, + "learning_rate": 9.864777353473132e-05, + "loss": 0.2946, + "step": 9640 + }, + { + "epoch": 0.4289460816997822, + "grad_norm": 0.2230747789144516, + "learning_rate": 9.864299513426068e-05, + "loss": 0.2953, + "step": 9650 + }, + { + "epoch": 0.4293905854113882, + "grad_norm": 0.21606941521167755, + "learning_rate": 9.863820842203254e-05, + "loss": 0.2925, + "step": 9660 + }, + { + "epoch": 0.4298350891229942, + "grad_norm": 0.2355862557888031, + "learning_rate": 9.863341339886483e-05, + "loss": 0.2913, + "step": 9670 + }, + { + "epoch": 0.43027959283460016, + "grad_norm": 0.2457069754600525, + "learning_rate": 9.86286100655769e-05, + "loss": 0.2955, + "step": 9680 + }, + { + "epoch": 0.43072409654620614, + "grad_norm": 0.21925196051597595, + "learning_rate": 9.862379842298953e-05, + "loss": 0.2908, + "step": 9690 + }, + { + "epoch": 0.4311686002578122, + "grad_norm": 0.24717478454113007, + "learning_rate": 9.861897847192485e-05, + "loss": 0.293, + "step": 9700 + }, + { + "epoch": 0.43161310396941815, + "grad_norm": 0.2188500165939331, + "learning_rate": 9.86141502132065e-05, + "loss": 0.295, + "step": 9710 + }, + { + "epoch": 0.43205760768102414, + "grad_norm": 0.19906699657440186, + "learning_rate": 9.860931364765946e-05, + "loss": 0.2942, + "step": 9720 + }, + { + "epoch": 0.4325021113926301, + "grad_norm": 0.2379574477672577, + "learning_rate": 9.860446877611021e-05, + "loss": 0.2964, + "step": 9730 + }, + { + "epoch": 0.4329466151042361, + "grad_norm": 0.21540623903274536, + "learning_rate": 9.859961559938655e-05, + "loss": 0.2945, + "step": 9740 + }, + { + "epoch": 0.43339111881584214, + "grad_norm": 0.2200712412595749, + "learning_rate": 9.85947541183178e-05, + "loss": 0.2953, + "step": 9750 + }, + { + "epoch": 0.4338356225274481, + "grad_norm": 0.2651131749153137, + "learning_rate": 9.858988433373463e-05, + "loss": 0.2956, + "step": 9760 + }, + { + "epoch": 0.4342801262390541, + "grad_norm": 0.27376243472099304, + "learning_rate": 9.858500624646918e-05, + "loss": 0.2951, + "step": 9770 + }, + { + "epoch": 0.4347246299506601, + "grad_norm": 0.2360897809267044, + "learning_rate": 9.858011985735497e-05, + "loss": 0.2935, + "step": 9780 + }, + { + "epoch": 0.43516913366226606, + "grad_norm": 0.25001880526542664, + "learning_rate": 9.857522516722693e-05, + "loss": 0.2958, + "step": 9790 + }, + { + "epoch": 0.4356136373738721, + "grad_norm": 0.2230762094259262, + "learning_rate": 9.857032217692145e-05, + "loss": 0.2915, + "step": 9800 + }, + { + "epoch": 0.4360581410854781, + "grad_norm": 0.1900751143693924, + "learning_rate": 9.856541088727631e-05, + "loss": 0.2932, + "step": 9810 + }, + { + "epoch": 0.43650264479708406, + "grad_norm": 0.23318246006965637, + "learning_rate": 9.856049129913072e-05, + "loss": 0.2937, + "step": 9820 + }, + { + "epoch": 0.43694714850869004, + "grad_norm": 0.26074400544166565, + "learning_rate": 9.85555634133253e-05, + "loss": 0.2994, + "step": 9830 + }, + { + "epoch": 0.437391652220296, + "grad_norm": 0.2135591208934784, + "learning_rate": 9.855062723070208e-05, + "loss": 0.2972, + "step": 9840 + }, + { + "epoch": 0.437836155931902, + "grad_norm": 0.22864973545074463, + "learning_rate": 9.854568275210454e-05, + "loss": 0.2924, + "step": 9850 + }, + { + "epoch": 0.43828065964350804, + "grad_norm": 0.2666264474391937, + "learning_rate": 9.854072997837754e-05, + "loss": 0.2972, + "step": 9860 + }, + { + "epoch": 0.438725163355114, + "grad_norm": 0.23538368940353394, + "learning_rate": 9.853576891036737e-05, + "loss": 0.2941, + "step": 9870 + }, + { + "epoch": 0.43916966706672, + "grad_norm": 0.25356289744377136, + "learning_rate": 9.853079954892177e-05, + "loss": 0.2959, + "step": 9880 + }, + { + "epoch": 0.439614170778326, + "grad_norm": 0.2678288221359253, + "learning_rate": 9.852582189488983e-05, + "loss": 0.3005, + "step": 9890 + }, + { + "epoch": 0.44005867448993197, + "grad_norm": 0.2504653036594391, + "learning_rate": 9.852083594912212e-05, + "loss": 0.2983, + "step": 9900 + }, + { + "epoch": 0.440503178201538, + "grad_norm": 0.27121952176094055, + "learning_rate": 9.851584171247058e-05, + "loss": 0.2985, + "step": 9910 + }, + { + "epoch": 0.440947681913144, + "grad_norm": 0.38068851828575134, + "learning_rate": 9.851083918578863e-05, + "loss": 0.2987, + "step": 9920 + }, + { + "epoch": 0.44139218562474997, + "grad_norm": 0.2581348419189453, + "learning_rate": 9.850582836993103e-05, + "loss": 0.2963, + "step": 9930 + }, + { + "epoch": 0.44183668933635595, + "grad_norm": 0.5460106134414673, + "learning_rate": 9.850080926575397e-05, + "loss": 0.2979, + "step": 9940 + }, + { + "epoch": 0.44228119304796193, + "grad_norm": 0.2757161557674408, + "learning_rate": 9.849578187411515e-05, + "loss": 0.2959, + "step": 9950 + }, + { + "epoch": 0.44272569675956797, + "grad_norm": 0.19535286724567413, + "learning_rate": 9.849074619587354e-05, + "loss": 0.2957, + "step": 9960 + }, + { + "epoch": 0.44317020047117395, + "grad_norm": 0.21491405367851257, + "learning_rate": 9.848570223188964e-05, + "loss": 0.298, + "step": 9970 + }, + { + "epoch": 0.44361470418277993, + "grad_norm": 0.23556338250637054, + "learning_rate": 9.848064998302531e-05, + "loss": 0.2995, + "step": 9980 + }, + { + "epoch": 0.4440592078943859, + "grad_norm": 0.2753199636936188, + "learning_rate": 9.847558945014386e-05, + "loss": 0.2967, + "step": 9990 + }, + { + "epoch": 0.4445037116059919, + "grad_norm": 0.26512858271598816, + "learning_rate": 9.847052063410996e-05, + "loss": 0.2975, + "step": 10000 + }, + { + "epoch": 0.44494821531759793, + "grad_norm": 0.24133633077144623, + "learning_rate": 9.846544353578977e-05, + "loss": 0.2998, + "step": 10010 + }, + { + "epoch": 0.4453927190292039, + "grad_norm": 0.21755650639533997, + "learning_rate": 9.846035815605081e-05, + "loss": 0.2965, + "step": 10020 + }, + { + "epoch": 0.4458372227408099, + "grad_norm": 0.20981888473033905, + "learning_rate": 9.845526449576204e-05, + "loss": 0.2933, + "step": 10030 + }, + { + "epoch": 0.44628172645241587, + "grad_norm": 0.20697090029716492, + "learning_rate": 9.845016255579383e-05, + "loss": 0.2979, + "step": 10040 + }, + { + "epoch": 0.44672623016402185, + "grad_norm": 0.2633682191371918, + "learning_rate": 9.844505233701794e-05, + "loss": 0.2969, + "step": 10050 + }, + { + "epoch": 0.44717073387562783, + "grad_norm": 0.2797996699810028, + "learning_rate": 9.843993384030757e-05, + "loss": 0.2969, + "step": 10060 + }, + { + "epoch": 0.44761523758723387, + "grad_norm": 0.24067126214504242, + "learning_rate": 9.843480706653737e-05, + "loss": 0.296, + "step": 10070 + }, + { + "epoch": 0.44805974129883985, + "grad_norm": 0.2290130853652954, + "learning_rate": 9.84296720165833e-05, + "loss": 0.296, + "step": 10080 + }, + { + "epoch": 0.44850424501044583, + "grad_norm": 0.2280808538198471, + "learning_rate": 9.842452869132286e-05, + "loss": 0.2935, + "step": 10090 + }, + { + "epoch": 0.4489487487220518, + "grad_norm": 0.24220983684062958, + "learning_rate": 9.841937709163489e-05, + "loss": 0.297, + "step": 10100 + }, + { + "epoch": 0.4493932524336578, + "grad_norm": 0.2499627023935318, + "learning_rate": 9.841421721839962e-05, + "loss": 0.2942, + "step": 10110 + }, + { + "epoch": 0.44983775614526383, + "grad_norm": 0.2503914535045624, + "learning_rate": 9.840904907249879e-05, + "loss": 0.2896, + "step": 10120 + }, + { + "epoch": 0.4502822598568698, + "grad_norm": 0.21410812437534332, + "learning_rate": 9.840387265481545e-05, + "loss": 0.2919, + "step": 10130 + }, + { + "epoch": 0.4507267635684758, + "grad_norm": 0.2204870730638504, + "learning_rate": 9.839868796623411e-05, + "loss": 0.2943, + "step": 10140 + }, + { + "epoch": 0.4511712672800818, + "grad_norm": 0.25411927700042725, + "learning_rate": 9.839349500764072e-05, + "loss": 0.2906, + "step": 10150 + }, + { + "epoch": 0.45161577099168776, + "grad_norm": 0.23713071644306183, + "learning_rate": 9.83882937799226e-05, + "loss": 0.2937, + "step": 10160 + }, + { + "epoch": 0.4520602747032938, + "grad_norm": 0.444322794675827, + "learning_rate": 9.838308428396849e-05, + "loss": 0.2945, + "step": 10170 + }, + { + "epoch": 0.4525047784148998, + "grad_norm": 0.2401573807001114, + "learning_rate": 9.837786652066854e-05, + "loss": 0.2935, + "step": 10180 + }, + { + "epoch": 0.45294928212650576, + "grad_norm": 0.23010678589344025, + "learning_rate": 9.837264049091437e-05, + "loss": 0.2953, + "step": 10190 + }, + { + "epoch": 0.45339378583811174, + "grad_norm": 0.25963127613067627, + "learning_rate": 9.836740619559893e-05, + "loss": 0.2962, + "step": 10200 + }, + { + "epoch": 0.4538382895497177, + "grad_norm": 0.31007254123687744, + "learning_rate": 9.836216363561659e-05, + "loss": 0.2959, + "step": 10210 + }, + { + "epoch": 0.45428279326132376, + "grad_norm": 0.24549594521522522, + "learning_rate": 9.835691281186322e-05, + "loss": 0.297, + "step": 10220 + }, + { + "epoch": 0.45472729697292974, + "grad_norm": 0.2220657914876938, + "learning_rate": 9.8351653725236e-05, + "loss": 0.2972, + "step": 10230 + }, + { + "epoch": 0.4551718006845357, + "grad_norm": 0.269227534532547, + "learning_rate": 9.83463863766336e-05, + "loss": 0.2957, + "step": 10240 + }, + { + "epoch": 0.4556163043961417, + "grad_norm": 0.250498503446579, + "learning_rate": 9.834111076695602e-05, + "loss": 0.2952, + "step": 10250 + }, + { + "epoch": 0.4560608081077477, + "grad_norm": 0.2743304371833801, + "learning_rate": 9.833582689710477e-05, + "loss": 0.2973, + "step": 10260 + }, + { + "epoch": 0.45650531181935367, + "grad_norm": 0.21870045363903046, + "learning_rate": 9.833053476798268e-05, + "loss": 0.2937, + "step": 10270 + }, + { + "epoch": 0.4569498155309597, + "grad_norm": 0.25012895464897156, + "learning_rate": 9.832523438049404e-05, + "loss": 0.2979, + "step": 10280 + }, + { + "epoch": 0.4573943192425657, + "grad_norm": 0.20420582592487335, + "learning_rate": 9.831992573554454e-05, + "loss": 0.2955, + "step": 10290 + }, + { + "epoch": 0.45783882295417166, + "grad_norm": 0.2819860577583313, + "learning_rate": 9.831460883404128e-05, + "loss": 0.2958, + "step": 10300 + }, + { + "epoch": 0.45828332666577765, + "grad_norm": 0.20092421770095825, + "learning_rate": 9.830928367689278e-05, + "loss": 0.2953, + "step": 10310 + }, + { + "epoch": 0.4587278303773836, + "grad_norm": 0.22265630960464478, + "learning_rate": 9.830395026500896e-05, + "loss": 0.2972, + "step": 10320 + }, + { + "epoch": 0.45917233408898966, + "grad_norm": 0.24595889449119568, + "learning_rate": 9.829860859930115e-05, + "loss": 0.2942, + "step": 10330 + }, + { + "epoch": 0.45961683780059565, + "grad_norm": 0.23609811067581177, + "learning_rate": 9.829325868068212e-05, + "loss": 0.2928, + "step": 10340 + }, + { + "epoch": 0.4600613415122016, + "grad_norm": 0.21633630990982056, + "learning_rate": 9.8287900510066e-05, + "loss": 0.2961, + "step": 10350 + }, + { + "epoch": 0.4605058452238076, + "grad_norm": 0.23080290853977203, + "learning_rate": 9.828253408836834e-05, + "loss": 0.2981, + "step": 10360 + }, + { + "epoch": 0.4609503489354136, + "grad_norm": 0.29179146885871887, + "learning_rate": 9.827715941650615e-05, + "loss": 0.2958, + "step": 10370 + }, + { + "epoch": 0.4613948526470196, + "grad_norm": 0.22186602652072906, + "learning_rate": 9.82717764953978e-05, + "loss": 0.2957, + "step": 10380 + }, + { + "epoch": 0.4618393563586256, + "grad_norm": 0.23127450048923492, + "learning_rate": 9.826638532596308e-05, + "loss": 0.2934, + "step": 10390 + }, + { + "epoch": 0.4622838600702316, + "grad_norm": 0.27256277203559875, + "learning_rate": 9.82609859091232e-05, + "loss": 0.2986, + "step": 10400 + }, + { + "epoch": 0.46272836378183757, + "grad_norm": 0.2086099237203598, + "learning_rate": 9.825557824580076e-05, + "loss": 0.2917, + "step": 10410 + }, + { + "epoch": 0.46317286749344355, + "grad_norm": 0.2194012850522995, + "learning_rate": 9.82501623369198e-05, + "loss": 0.2965, + "step": 10420 + }, + { + "epoch": 0.4636173712050496, + "grad_norm": 0.23475363850593567, + "learning_rate": 9.824473818340574e-05, + "loss": 0.2934, + "step": 10430 + }, + { + "epoch": 0.46406187491665557, + "grad_norm": 0.2511306703090668, + "learning_rate": 9.823930578618541e-05, + "loss": 0.2954, + "step": 10440 + }, + { + "epoch": 0.46450637862826155, + "grad_norm": 0.1884424090385437, + "learning_rate": 9.823386514618709e-05, + "loss": 0.2948, + "step": 10450 + }, + { + "epoch": 0.46495088233986753, + "grad_norm": 0.2632814645767212, + "learning_rate": 9.82284162643404e-05, + "loss": 0.2893, + "step": 10460 + }, + { + "epoch": 0.4653953860514735, + "grad_norm": 0.22260838747024536, + "learning_rate": 9.822295914157642e-05, + "loss": 0.2979, + "step": 10470 + }, + { + "epoch": 0.4658398897630795, + "grad_norm": 0.2334599792957306, + "learning_rate": 9.821749377882763e-05, + "loss": 0.2944, + "step": 10480 + }, + { + "epoch": 0.46628439347468553, + "grad_norm": 0.25994014739990234, + "learning_rate": 9.821202017702791e-05, + "loss": 0.2943, + "step": 10490 + }, + { + "epoch": 0.4667288971862915, + "grad_norm": 0.22435590624809265, + "learning_rate": 9.820653833711253e-05, + "loss": 0.2916, + "step": 10500 + }, + { + "epoch": 0.4671734008978975, + "grad_norm": 0.24047724902629852, + "learning_rate": 9.820104826001822e-05, + "loss": 0.2955, + "step": 10510 + }, + { + "epoch": 0.4676179046095035, + "grad_norm": 0.25112757086753845, + "learning_rate": 9.819554994668305e-05, + "loss": 0.2956, + "step": 10520 + }, + { + "epoch": 0.46806240832110946, + "grad_norm": 0.23760870099067688, + "learning_rate": 9.819004339804654e-05, + "loss": 0.2957, + "step": 10530 + }, + { + "epoch": 0.4685069120327155, + "grad_norm": 0.24006319046020508, + "learning_rate": 9.818452861504961e-05, + "loss": 0.2988, + "step": 10540 + }, + { + "epoch": 0.4689514157443215, + "grad_norm": 0.294505774974823, + "learning_rate": 9.81790055986346e-05, + "loss": 0.2924, + "step": 10550 + }, + { + "epoch": 0.46939591945592746, + "grad_norm": 0.2046414017677307, + "learning_rate": 9.817347434974523e-05, + "loss": 0.296, + "step": 10560 + }, + { + "epoch": 0.46984042316753344, + "grad_norm": 0.21690921485424042, + "learning_rate": 9.816793486932664e-05, + "loss": 0.2935, + "step": 10570 + }, + { + "epoch": 0.4702849268791394, + "grad_norm": 0.22245211899280548, + "learning_rate": 9.816238715832538e-05, + "loss": 0.2921, + "step": 10580 + }, + { + "epoch": 0.47072943059074546, + "grad_norm": 0.20746003091335297, + "learning_rate": 9.815683121768939e-05, + "loss": 0.2944, + "step": 10590 + }, + { + "epoch": 0.47117393430235144, + "grad_norm": 0.21205401420593262, + "learning_rate": 9.815126704836804e-05, + "loss": 0.2945, + "step": 10600 + }, + { + "epoch": 0.4716184380139574, + "grad_norm": 0.21962383389472961, + "learning_rate": 9.81456946513121e-05, + "loss": 0.2936, + "step": 10610 + }, + { + "epoch": 0.4720629417255634, + "grad_norm": 0.2115192413330078, + "learning_rate": 9.814011402747373e-05, + "loss": 0.2958, + "step": 10620 + }, + { + "epoch": 0.4725074454371694, + "grad_norm": 0.2846689820289612, + "learning_rate": 9.813452517780651e-05, + "loss": 0.2961, + "step": 10630 + }, + { + "epoch": 0.4729519491487754, + "grad_norm": 0.2222760170698166, + "learning_rate": 9.81289281032654e-05, + "loss": 0.2956, + "step": 10640 + }, + { + "epoch": 0.4733964528603814, + "grad_norm": 0.2399580031633377, + "learning_rate": 9.812332280480683e-05, + "loss": 0.2977, + "step": 10650 + }, + { + "epoch": 0.4738409565719874, + "grad_norm": 0.2523430287837982, + "learning_rate": 9.811770928338854e-05, + "loss": 0.2951, + "step": 10660 + }, + { + "epoch": 0.47428546028359336, + "grad_norm": 0.2731485068798065, + "learning_rate": 9.811208753996979e-05, + "loss": 0.2932, + "step": 10670 + }, + { + "epoch": 0.47472996399519934, + "grad_norm": 0.22851277887821198, + "learning_rate": 9.810645757551113e-05, + "loss": 0.2973, + "step": 10680 + }, + { + "epoch": 0.4751744677068053, + "grad_norm": 0.23123015463352203, + "learning_rate": 9.810081939097459e-05, + "loss": 0.2912, + "step": 10690 + }, + { + "epoch": 0.47561897141841136, + "grad_norm": 0.22553972899913788, + "learning_rate": 9.809517298732356e-05, + "loss": 0.2934, + "step": 10700 + }, + { + "epoch": 0.47606347513001734, + "grad_norm": 0.252502977848053, + "learning_rate": 9.80895183655229e-05, + "loss": 0.2961, + "step": 10710 + }, + { + "epoch": 0.4765079788416233, + "grad_norm": 0.22616390883922577, + "learning_rate": 9.808385552653877e-05, + "loss": 0.2937, + "step": 10720 + }, + { + "epoch": 0.4769524825532293, + "grad_norm": 0.2010558545589447, + "learning_rate": 9.807818447133886e-05, + "loss": 0.2945, + "step": 10730 + }, + { + "epoch": 0.4773969862648353, + "grad_norm": 0.19391076266765594, + "learning_rate": 9.807250520089215e-05, + "loss": 0.2942, + "step": 10740 + }, + { + "epoch": 0.4778414899764413, + "grad_norm": 0.21219398081302643, + "learning_rate": 9.806681771616908e-05, + "loss": 0.2925, + "step": 10750 + }, + { + "epoch": 0.4782859936880473, + "grad_norm": 0.2706209421157837, + "learning_rate": 9.80611220181415e-05, + "loss": 0.2934, + "step": 10760 + }, + { + "epoch": 0.4787304973996533, + "grad_norm": 0.20238874852657318, + "learning_rate": 9.805541810778264e-05, + "loss": 0.2952, + "step": 10770 + }, + { + "epoch": 0.47917500111125927, + "grad_norm": 0.20736627280712128, + "learning_rate": 9.804970598606716e-05, + "loss": 0.2942, + "step": 10780 + }, + { + "epoch": 0.47961950482286525, + "grad_norm": 0.2106337696313858, + "learning_rate": 9.804398565397106e-05, + "loss": 0.2955, + "step": 10790 + }, + { + "epoch": 0.4800640085344713, + "grad_norm": 0.2164163887500763, + "learning_rate": 9.803825711247183e-05, + "loss": 0.296, + "step": 10800 + }, + { + "epoch": 0.48050851224607727, + "grad_norm": 0.25775399804115295, + "learning_rate": 9.803252036254831e-05, + "loss": 0.2936, + "step": 10810 + }, + { + "epoch": 0.48095301595768325, + "grad_norm": 0.24273976683616638, + "learning_rate": 9.802677540518076e-05, + "loss": 0.2977, + "step": 10820 + }, + { + "epoch": 0.48139751966928923, + "grad_norm": 0.221907839179039, + "learning_rate": 9.802102224135081e-05, + "loss": 0.2932, + "step": 10830 + }, + { + "epoch": 0.4818420233808952, + "grad_norm": 0.20878861844539642, + "learning_rate": 9.801526087204155e-05, + "loss": 0.2947, + "step": 10840 + }, + { + "epoch": 0.48228652709250125, + "grad_norm": 0.227243110537529, + "learning_rate": 9.800949129823743e-05, + "loss": 0.2962, + "step": 10850 + }, + { + "epoch": 0.48273103080410723, + "grad_norm": 0.2422817200422287, + "learning_rate": 9.80037135209243e-05, + "loss": 0.2933, + "step": 10860 + }, + { + "epoch": 0.4831755345157132, + "grad_norm": 0.22564850747585297, + "learning_rate": 9.799792754108946e-05, + "loss": 0.2924, + "step": 10870 + }, + { + "epoch": 0.4836200382273192, + "grad_norm": 0.21728861331939697, + "learning_rate": 9.799213335972152e-05, + "loss": 0.2918, + "step": 10880 + }, + { + "epoch": 0.4840645419389252, + "grad_norm": 0.2689211070537567, + "learning_rate": 9.798633097781058e-05, + "loss": 0.2934, + "step": 10890 + }, + { + "epoch": 0.48450904565053116, + "grad_norm": 0.22192668914794922, + "learning_rate": 9.79805203963481e-05, + "loss": 0.2955, + "step": 10900 + }, + { + "epoch": 0.4849535493621372, + "grad_norm": 0.2387600988149643, + "learning_rate": 9.797470161632697e-05, + "loss": 0.2941, + "step": 10910 + }, + { + "epoch": 0.4853980530737432, + "grad_norm": 0.21688814461231232, + "learning_rate": 9.796887463874145e-05, + "loss": 0.2908, + "step": 10920 + }, + { + "epoch": 0.48584255678534916, + "grad_norm": 0.23668760061264038, + "learning_rate": 9.796303946458718e-05, + "loss": 0.2975, + "step": 10930 + }, + { + "epoch": 0.48628706049695514, + "grad_norm": 0.2618896961212158, + "learning_rate": 9.795719609486127e-05, + "loss": 0.2903, + "step": 10940 + }, + { + "epoch": 0.4867315642085611, + "grad_norm": 0.47402632236480713, + "learning_rate": 9.795134453056219e-05, + "loss": 0.2936, + "step": 10950 + }, + { + "epoch": 0.48717606792016716, + "grad_norm": 0.19572043418884277, + "learning_rate": 9.794548477268979e-05, + "loss": 0.295, + "step": 10960 + }, + { + "epoch": 0.48762057163177314, + "grad_norm": 0.23275570571422577, + "learning_rate": 9.793961682224537e-05, + "loss": 0.2954, + "step": 10970 + }, + { + "epoch": 0.4880650753433791, + "grad_norm": 0.22733880579471588, + "learning_rate": 9.793374068023156e-05, + "loss": 0.2913, + "step": 10980 + }, + { + "epoch": 0.4885095790549851, + "grad_norm": 0.23567181825637817, + "learning_rate": 9.792785634765247e-05, + "loss": 0.2958, + "step": 10990 + }, + { + "epoch": 0.4889540827665911, + "grad_norm": 0.2001638263463974, + "learning_rate": 9.792196382551357e-05, + "loss": 0.294, + "step": 11000 + }, + { + "epoch": 0.4893985864781971, + "grad_norm": 0.2110067903995514, + "learning_rate": 9.791606311482171e-05, + "loss": 0.2956, + "step": 11010 + }, + { + "epoch": 0.4898430901898031, + "grad_norm": 0.19922035932540894, + "learning_rate": 9.791015421658518e-05, + "loss": 0.2915, + "step": 11020 + }, + { + "epoch": 0.4902875939014091, + "grad_norm": 0.21448294818401337, + "learning_rate": 9.790423713181362e-05, + "loss": 0.2954, + "step": 11030 + }, + { + "epoch": 0.49073209761301506, + "grad_norm": 0.2583896815776825, + "learning_rate": 9.789831186151814e-05, + "loss": 0.2935, + "step": 11040 + }, + { + "epoch": 0.49117660132462104, + "grad_norm": 0.30179768800735474, + "learning_rate": 9.789237840671118e-05, + "loss": 0.2914, + "step": 11050 + }, + { + "epoch": 0.4916211050362271, + "grad_norm": 0.2455320656299591, + "learning_rate": 9.78864367684066e-05, + "loss": 0.2906, + "step": 11060 + }, + { + "epoch": 0.49206560874783306, + "grad_norm": 0.27490609884262085, + "learning_rate": 9.788048694761968e-05, + "loss": 0.294, + "step": 11070 + }, + { + "epoch": 0.49251011245943904, + "grad_norm": 0.24293304979801178, + "learning_rate": 9.787452894536709e-05, + "loss": 0.2963, + "step": 11080 + }, + { + "epoch": 0.492954616171045, + "grad_norm": 0.6582195162773132, + "learning_rate": 9.786856276266685e-05, + "loss": 0.2961, + "step": 11090 + }, + { + "epoch": 0.493399119882651, + "grad_norm": 0.2192567139863968, + "learning_rate": 9.786258840053845e-05, + "loss": 0.2897, + "step": 11100 + }, + { + "epoch": 0.493843623594257, + "grad_norm": 0.2920767068862915, + "learning_rate": 9.785660586000273e-05, + "loss": 0.2967, + "step": 11110 + }, + { + "epoch": 0.494288127305863, + "grad_norm": 0.22989149391651154, + "learning_rate": 9.785061514208196e-05, + "loss": 0.2945, + "step": 11120 + }, + { + "epoch": 0.494732631017469, + "grad_norm": 0.49943768978118896, + "learning_rate": 9.784461624779977e-05, + "loss": 0.2939, + "step": 11130 + }, + { + "epoch": 0.495177134729075, + "grad_norm": 0.2727237641811371, + "learning_rate": 9.783860917818123e-05, + "loss": 0.2953, + "step": 11140 + }, + { + "epoch": 0.49562163844068097, + "grad_norm": 0.27978208661079407, + "learning_rate": 9.783259393425277e-05, + "loss": 0.3002, + "step": 11150 + }, + { + "epoch": 0.49606614215228695, + "grad_norm": 0.19746243953704834, + "learning_rate": 9.782657051704221e-05, + "loss": 0.2945, + "step": 11160 + }, + { + "epoch": 0.496510645863893, + "grad_norm": 0.26860859990119934, + "learning_rate": 9.782053892757883e-05, + "loss": 0.2955, + "step": 11170 + }, + { + "epoch": 0.49695514957549897, + "grad_norm": 0.23981134593486786, + "learning_rate": 9.781449916689324e-05, + "loss": 0.2909, + "step": 11180 + }, + { + "epoch": 0.49739965328710495, + "grad_norm": 0.26105737686157227, + "learning_rate": 9.780845123601746e-05, + "loss": 0.297, + "step": 11190 + }, + { + "epoch": 0.49784415699871093, + "grad_norm": 0.23563691973686218, + "learning_rate": 9.780239513598492e-05, + "loss": 0.2977, + "step": 11200 + }, + { + "epoch": 0.4982886607103169, + "grad_norm": 0.19978457689285278, + "learning_rate": 9.779633086783047e-05, + "loss": 0.2984, + "step": 11210 + }, + { + "epoch": 0.49873316442192295, + "grad_norm": 0.22951197624206543, + "learning_rate": 9.779025843259031e-05, + "loss": 0.2942, + "step": 11220 + }, + { + "epoch": 0.49917766813352893, + "grad_norm": 0.2395482212305069, + "learning_rate": 9.778417783130204e-05, + "loss": 0.2939, + "step": 11230 + }, + { + "epoch": 0.4996221718451349, + "grad_norm": 0.22924521565437317, + "learning_rate": 9.777808906500468e-05, + "loss": 0.2919, + "step": 11240 + }, + { + "epoch": 0.5000666755567409, + "grad_norm": 0.27864882349967957, + "learning_rate": 9.777199213473862e-05, + "loss": 0.3011, + "step": 11250 + }, + { + "epoch": 0.5005111792683469, + "grad_norm": 0.24003036320209503, + "learning_rate": 9.77658870415457e-05, + "loss": 0.2924, + "step": 11260 + }, + { + "epoch": 0.5009556829799529, + "grad_norm": 0.21221831440925598, + "learning_rate": 9.775977378646906e-05, + "loss": 0.2929, + "step": 11270 + }, + { + "epoch": 0.5014001866915588, + "grad_norm": 0.21045559644699097, + "learning_rate": 9.775365237055331e-05, + "loss": 0.2929, + "step": 11280 + }, + { + "epoch": 0.5018446904031648, + "grad_norm": 0.2678508162498474, + "learning_rate": 9.774752279484445e-05, + "loss": 0.2932, + "step": 11290 + }, + { + "epoch": 0.5022891941147709, + "grad_norm": 0.24539460241794586, + "learning_rate": 9.774138506038984e-05, + "loss": 0.2947, + "step": 11300 + }, + { + "epoch": 0.5027336978263769, + "grad_norm": 0.24623851478099823, + "learning_rate": 9.773523916823826e-05, + "loss": 0.2926, + "step": 11310 + }, + { + "epoch": 0.5031782015379829, + "grad_norm": 0.20394602417945862, + "learning_rate": 9.772908511943986e-05, + "loss": 0.2927, + "step": 11320 + }, + { + "epoch": 0.5036227052495889, + "grad_norm": 0.24629776179790497, + "learning_rate": 9.77229229150462e-05, + "loss": 0.2922, + "step": 11330 + }, + { + "epoch": 0.5040672089611948, + "grad_norm": 0.2514769434928894, + "learning_rate": 9.771675255611024e-05, + "loss": 0.2936, + "step": 11340 + }, + { + "epoch": 0.5045117126728008, + "grad_norm": 0.22139033675193787, + "learning_rate": 9.771057404368632e-05, + "loss": 0.2938, + "step": 11350 + }, + { + "epoch": 0.5049562163844068, + "grad_norm": 0.2521882951259613, + "learning_rate": 9.770438737883018e-05, + "loss": 0.2942, + "step": 11360 + }, + { + "epoch": 0.5054007200960128, + "grad_norm": 0.2455502152442932, + "learning_rate": 9.769819256259898e-05, + "loss": 0.291, + "step": 11370 + }, + { + "epoch": 0.5058452238076188, + "grad_norm": 0.2087896764278412, + "learning_rate": 9.769198959605119e-05, + "loss": 0.2944, + "step": 11380 + }, + { + "epoch": 0.5062897275192247, + "grad_norm": 0.21087972819805145, + "learning_rate": 9.768577848024678e-05, + "loss": 0.2948, + "step": 11390 + }, + { + "epoch": 0.5067342312308307, + "grad_norm": 0.23765185475349426, + "learning_rate": 9.767955921624702e-05, + "loss": 0.2942, + "step": 11400 + }, + { + "epoch": 0.5071787349424368, + "grad_norm": 0.21359926462173462, + "learning_rate": 9.767333180511465e-05, + "loss": 0.2896, + "step": 11410 + }, + { + "epoch": 0.5076232386540428, + "grad_norm": 0.19110387563705444, + "learning_rate": 9.766709624791373e-05, + "loss": 0.2939, + "step": 11420 + }, + { + "epoch": 0.5080677423656488, + "grad_norm": 0.2395125776529312, + "learning_rate": 9.766085254570975e-05, + "loss": 0.2929, + "step": 11430 + }, + { + "epoch": 0.5085122460772548, + "grad_norm": 0.23603588342666626, + "learning_rate": 9.76546006995696e-05, + "loss": 0.2982, + "step": 11440 + }, + { + "epoch": 0.5089567497888607, + "grad_norm": 0.24561293423175812, + "learning_rate": 9.764834071056155e-05, + "loss": 0.2958, + "step": 11450 + }, + { + "epoch": 0.5094012535004667, + "grad_norm": 0.2492629587650299, + "learning_rate": 9.764207257975526e-05, + "loss": 0.2963, + "step": 11460 + }, + { + "epoch": 0.5098457572120727, + "grad_norm": 0.2262098640203476, + "learning_rate": 9.763579630822179e-05, + "loss": 0.2929, + "step": 11470 + }, + { + "epoch": 0.5102902609236787, + "grad_norm": 0.22878770530223846, + "learning_rate": 9.762951189703356e-05, + "loss": 0.2951, + "step": 11480 + }, + { + "epoch": 0.5107347646352847, + "grad_norm": 0.2367551028728485, + "learning_rate": 9.762321934726442e-05, + "loss": 0.2926, + "step": 11490 + }, + { + "epoch": 0.5111792683468906, + "grad_norm": 0.2598431408405304, + "learning_rate": 9.761691865998959e-05, + "loss": 0.2982, + "step": 11500 + }, + { + "epoch": 0.5116237720584967, + "grad_norm": 0.2507326304912567, + "learning_rate": 9.76106098362857e-05, + "loss": 0.2906, + "step": 11510 + }, + { + "epoch": 0.5120682757701027, + "grad_norm": 0.2271503061056137, + "learning_rate": 9.760429287723072e-05, + "loss": 0.2976, + "step": 11520 + }, + { + "epoch": 0.5125127794817087, + "grad_norm": 0.2715577185153961, + "learning_rate": 9.759796778390406e-05, + "loss": 0.2966, + "step": 11530 + }, + { + "epoch": 0.5129572831933147, + "grad_norm": 0.23049454391002655, + "learning_rate": 9.759163455738653e-05, + "loss": 0.2958, + "step": 11540 + }, + { + "epoch": 0.5134017869049207, + "grad_norm": 0.24338582158088684, + "learning_rate": 9.75852931987603e-05, + "loss": 0.2958, + "step": 11550 + }, + { + "epoch": 0.5138462906165266, + "grad_norm": 0.23084986209869385, + "learning_rate": 9.757894370910891e-05, + "loss": 0.2934, + "step": 11560 + }, + { + "epoch": 0.5142907943281326, + "grad_norm": 0.21860012412071228, + "learning_rate": 9.757258608951733e-05, + "loss": 0.2961, + "step": 11570 + }, + { + "epoch": 0.5147352980397386, + "grad_norm": 0.2024325132369995, + "learning_rate": 9.75662203410719e-05, + "loss": 0.2935, + "step": 11580 + }, + { + "epoch": 0.5151798017513446, + "grad_norm": 0.22600118815898895, + "learning_rate": 9.755984646486034e-05, + "loss": 0.2922, + "step": 11590 + }, + { + "epoch": 0.5156243054629506, + "grad_norm": 0.2138344943523407, + "learning_rate": 9.75534644619718e-05, + "loss": 0.2928, + "step": 11600 + }, + { + "epoch": 0.5160688091745566, + "grad_norm": 0.23928798735141754, + "learning_rate": 9.754707433349676e-05, + "loss": 0.2926, + "step": 11610 + }, + { + "epoch": 0.5165133128861626, + "grad_norm": 0.23305588960647583, + "learning_rate": 9.754067608052715e-05, + "loss": 0.2932, + "step": 11620 + }, + { + "epoch": 0.5169578165977686, + "grad_norm": 0.21017763018608093, + "learning_rate": 9.753426970415622e-05, + "loss": 0.2905, + "step": 11630 + }, + { + "epoch": 0.5174023203093746, + "grad_norm": 0.22377198934555054, + "learning_rate": 9.752785520547868e-05, + "loss": 0.2982, + "step": 11640 + }, + { + "epoch": 0.5178468240209806, + "grad_norm": 0.20734481513500214, + "learning_rate": 9.752143258559056e-05, + "loss": 0.2924, + "step": 11650 + }, + { + "epoch": 0.5182913277325866, + "grad_norm": 0.22205153107643127, + "learning_rate": 9.751500184558933e-05, + "loss": 0.2916, + "step": 11660 + }, + { + "epoch": 0.5187358314441926, + "grad_norm": 0.2053922861814499, + "learning_rate": 9.750856298657383e-05, + "loss": 0.2962, + "step": 11670 + }, + { + "epoch": 0.5191803351557985, + "grad_norm": 0.22297680377960205, + "learning_rate": 9.750211600964428e-05, + "loss": 0.2928, + "step": 11680 + }, + { + "epoch": 0.5196248388674045, + "grad_norm": 0.2506784498691559, + "learning_rate": 9.749566091590226e-05, + "loss": 0.293, + "step": 11690 + }, + { + "epoch": 0.5200693425790105, + "grad_norm": 0.24751503765583038, + "learning_rate": 9.748919770645083e-05, + "loss": 0.2909, + "step": 11700 + }, + { + "epoch": 0.5205138462906165, + "grad_norm": 0.17220091819763184, + "learning_rate": 9.748272638239432e-05, + "loss": 0.2963, + "step": 11710 + }, + { + "epoch": 0.5209583500022226, + "grad_norm": 0.23046647012233734, + "learning_rate": 9.747624694483855e-05, + "loss": 0.2909, + "step": 11720 + }, + { + "epoch": 0.5214028537138286, + "grad_norm": 0.20003828406333923, + "learning_rate": 9.746975939489065e-05, + "loss": 0.2917, + "step": 11730 + }, + { + "epoch": 0.5218473574254345, + "grad_norm": 0.229615718126297, + "learning_rate": 9.746326373365918e-05, + "loss": 0.2931, + "step": 11740 + }, + { + "epoch": 0.5222918611370405, + "grad_norm": 0.20692545175552368, + "learning_rate": 9.745675996225403e-05, + "loss": 0.2926, + "step": 11750 + }, + { + "epoch": 0.5227363648486465, + "grad_norm": 0.2609885334968567, + "learning_rate": 9.745024808178657e-05, + "loss": 0.2956, + "step": 11760 + }, + { + "epoch": 0.5231808685602525, + "grad_norm": 0.22919628024101257, + "learning_rate": 9.744372809336947e-05, + "loss": 0.2935, + "step": 11770 + }, + { + "epoch": 0.5236253722718585, + "grad_norm": 0.18576890230178833, + "learning_rate": 9.743719999811682e-05, + "loss": 0.2956, + "step": 11780 + }, + { + "epoch": 0.5240698759834644, + "grad_norm": 0.20750458538532257, + "learning_rate": 9.743066379714412e-05, + "loss": 0.291, + "step": 11790 + }, + { + "epoch": 0.5245143796950704, + "grad_norm": 0.22531236708164215, + "learning_rate": 9.74241194915682e-05, + "loss": 0.2915, + "step": 11800 + }, + { + "epoch": 0.5249588834066764, + "grad_norm": 0.21549701690673828, + "learning_rate": 9.741756708250731e-05, + "loss": 0.2911, + "step": 11810 + }, + { + "epoch": 0.5254033871182824, + "grad_norm": 0.23498950898647308, + "learning_rate": 9.741100657108109e-05, + "loss": 0.2918, + "step": 11820 + }, + { + "epoch": 0.5258478908298885, + "grad_norm": 0.25228703022003174, + "learning_rate": 9.740443795841054e-05, + "loss": 0.2921, + "step": 11830 + }, + { + "epoch": 0.5262923945414945, + "grad_norm": 0.23387277126312256, + "learning_rate": 9.739786124561805e-05, + "loss": 0.2922, + "step": 11840 + }, + { + "epoch": 0.5267368982531004, + "grad_norm": 0.2421509474515915, + "learning_rate": 9.73912764338274e-05, + "loss": 0.2967, + "step": 11850 + }, + { + "epoch": 0.5271814019647064, + "grad_norm": 0.2261589765548706, + "learning_rate": 9.738468352416377e-05, + "loss": 0.2947, + "step": 11860 + }, + { + "epoch": 0.5276259056763124, + "grad_norm": 0.2304808348417282, + "learning_rate": 9.737808251775369e-05, + "loss": 0.2926, + "step": 11870 + }, + { + "epoch": 0.5280704093879184, + "grad_norm": 0.21153883635997772, + "learning_rate": 9.737147341572512e-05, + "loss": 0.2958, + "step": 11880 + }, + { + "epoch": 0.5285149130995244, + "grad_norm": 0.22632355988025665, + "learning_rate": 9.736485621920735e-05, + "loss": 0.2916, + "step": 11890 + }, + { + "epoch": 0.5289594168111303, + "grad_norm": 0.21684083342552185, + "learning_rate": 9.735823092933108e-05, + "loss": 0.293, + "step": 11900 + }, + { + "epoch": 0.5294039205227363, + "grad_norm": 0.22780896723270416, + "learning_rate": 9.735159754722838e-05, + "loss": 0.2938, + "step": 11910 + }, + { + "epoch": 0.5298484242343423, + "grad_norm": 0.2379431575536728, + "learning_rate": 9.734495607403275e-05, + "loss": 0.2943, + "step": 11920 + }, + { + "epoch": 0.5302929279459484, + "grad_norm": 0.252768874168396, + "learning_rate": 9.733830651087901e-05, + "loss": 0.2913, + "step": 11930 + }, + { + "epoch": 0.5307374316575544, + "grad_norm": 0.2102116197347641, + "learning_rate": 9.733164885890338e-05, + "loss": 0.2913, + "step": 11940 + }, + { + "epoch": 0.5311819353691604, + "grad_norm": 0.2164151668548584, + "learning_rate": 9.732498311924349e-05, + "loss": 0.2975, + "step": 11950 + }, + { + "epoch": 0.5316264390807663, + "grad_norm": 0.20508065819740295, + "learning_rate": 9.731830929303833e-05, + "loss": 0.2968, + "step": 11960 + }, + { + "epoch": 0.5320709427923723, + "grad_norm": 0.21006551384925842, + "learning_rate": 9.731162738142827e-05, + "loss": 0.2941, + "step": 11970 + }, + { + "epoch": 0.5325154465039783, + "grad_norm": 0.2150406837463379, + "learning_rate": 9.730493738555506e-05, + "loss": 0.2945, + "step": 11980 + }, + { + "epoch": 0.5329599502155843, + "grad_norm": 0.25183483958244324, + "learning_rate": 9.729823930656186e-05, + "loss": 0.2909, + "step": 11990 + }, + { + "epoch": 0.5334044539271903, + "grad_norm": 0.22336570918560028, + "learning_rate": 9.729153314559316e-05, + "loss": 0.2903, + "step": 12000 + }, + { + "epoch": 0.5338489576387963, + "grad_norm": 0.21916691958904266, + "learning_rate": 9.728481890379486e-05, + "loss": 0.295, + "step": 12010 + }, + { + "epoch": 0.5342934613504022, + "grad_norm": 0.20905007421970367, + "learning_rate": 9.727809658231428e-05, + "loss": 0.2945, + "step": 12020 + }, + { + "epoch": 0.5347379650620082, + "grad_norm": 0.21331876516342163, + "learning_rate": 9.727136618230003e-05, + "loss": 0.294, + "step": 12030 + }, + { + "epoch": 0.5351824687736143, + "grad_norm": 0.215127095580101, + "learning_rate": 9.726462770490219e-05, + "loss": 0.2938, + "step": 12040 + }, + { + "epoch": 0.5356269724852203, + "grad_norm": 0.23360927402973175, + "learning_rate": 9.725788115127214e-05, + "loss": 0.2923, + "step": 12050 + }, + { + "epoch": 0.5360714761968263, + "grad_norm": 0.2553536295890808, + "learning_rate": 9.725112652256274e-05, + "loss": 0.2952, + "step": 12060 + }, + { + "epoch": 0.5365159799084323, + "grad_norm": 0.19060087203979492, + "learning_rate": 9.724436381992812e-05, + "loss": 0.2925, + "step": 12070 + }, + { + "epoch": 0.5369604836200382, + "grad_norm": 0.2450326532125473, + "learning_rate": 9.723759304452387e-05, + "loss": 0.2933, + "step": 12080 + }, + { + "epoch": 0.5374049873316442, + "grad_norm": 0.22048822045326233, + "learning_rate": 9.72308141975069e-05, + "loss": 0.2923, + "step": 12090 + }, + { + "epoch": 0.5378494910432502, + "grad_norm": 0.24524632096290588, + "learning_rate": 9.722402728003557e-05, + "loss": 0.2927, + "step": 12100 + }, + { + "epoch": 0.5382939947548562, + "grad_norm": 0.30574119091033936, + "learning_rate": 9.721723229326953e-05, + "loss": 0.2951, + "step": 12110 + }, + { + "epoch": 0.5387384984664622, + "grad_norm": 0.21001240611076355, + "learning_rate": 9.721042923836992e-05, + "loss": 0.2953, + "step": 12120 + }, + { + "epoch": 0.5391830021780681, + "grad_norm": 0.21412692964076996, + "learning_rate": 9.720361811649914e-05, + "loss": 0.2923, + "step": 12130 + }, + { + "epoch": 0.5396275058896742, + "grad_norm": 0.2206936627626419, + "learning_rate": 9.719679892882106e-05, + "loss": 0.2963, + "step": 12140 + }, + { + "epoch": 0.5400720096012802, + "grad_norm": 0.25109174847602844, + "learning_rate": 9.718997167650085e-05, + "loss": 0.294, + "step": 12150 + }, + { + "epoch": 0.5405165133128862, + "grad_norm": 0.24803973734378815, + "learning_rate": 9.718313636070515e-05, + "loss": 0.2972, + "step": 12160 + }, + { + "epoch": 0.5409610170244922, + "grad_norm": 0.24165327847003937, + "learning_rate": 9.717629298260192e-05, + "loss": 0.2948, + "step": 12170 + }, + { + "epoch": 0.5414055207360982, + "grad_norm": 0.21935458481311798, + "learning_rate": 9.716944154336047e-05, + "loss": 0.2951, + "step": 12180 + }, + { + "epoch": 0.5418500244477041, + "grad_norm": 0.2378547340631485, + "learning_rate": 9.716258204415157e-05, + "loss": 0.2926, + "step": 12190 + }, + { + "epoch": 0.5422945281593101, + "grad_norm": 0.25029000639915466, + "learning_rate": 9.715571448614728e-05, + "loss": 0.292, + "step": 12200 + }, + { + "epoch": 0.5427390318709161, + "grad_norm": 0.2555437982082367, + "learning_rate": 9.71488388705211e-05, + "loss": 0.2949, + "step": 12210 + }, + { + "epoch": 0.5431835355825221, + "grad_norm": 0.21294501423835754, + "learning_rate": 9.714195519844788e-05, + "loss": 0.2914, + "step": 12220 + }, + { + "epoch": 0.5436280392941281, + "grad_norm": 0.2384794056415558, + "learning_rate": 9.713506347110386e-05, + "loss": 0.2938, + "step": 12230 + }, + { + "epoch": 0.544072543005734, + "grad_norm": 0.22081489861011505, + "learning_rate": 9.712816368966662e-05, + "loss": 0.2929, + "step": 12240 + }, + { + "epoch": 0.5445170467173401, + "grad_norm": 0.20789815485477448, + "learning_rate": 9.712125585531517e-05, + "loss": 0.2932, + "step": 12250 + }, + { + "epoch": 0.5449615504289461, + "grad_norm": 0.2700440287590027, + "learning_rate": 9.711433996922988e-05, + "loss": 0.2931, + "step": 12260 + }, + { + "epoch": 0.5454060541405521, + "grad_norm": 0.24593977630138397, + "learning_rate": 9.710741603259245e-05, + "loss": 0.2935, + "step": 12270 + }, + { + "epoch": 0.5458505578521581, + "grad_norm": 0.2262805998325348, + "learning_rate": 9.710048404658603e-05, + "loss": 0.2937, + "step": 12280 + }, + { + "epoch": 0.5462950615637641, + "grad_norm": 0.2168828547000885, + "learning_rate": 9.709354401239508e-05, + "loss": 0.2919, + "step": 12290 + }, + { + "epoch": 0.54673956527537, + "grad_norm": 0.2541919946670532, + "learning_rate": 9.708659593120546e-05, + "loss": 0.2923, + "step": 12300 + }, + { + "epoch": 0.547184068986976, + "grad_norm": 0.2438151240348816, + "learning_rate": 9.707963980420443e-05, + "loss": 0.295, + "step": 12310 + }, + { + "epoch": 0.547628572698582, + "grad_norm": 0.19579701125621796, + "learning_rate": 9.707267563258058e-05, + "loss": 0.2898, + "step": 12320 + }, + { + "epoch": 0.548073076410188, + "grad_norm": 0.2841345965862274, + "learning_rate": 9.70657034175239e-05, + "loss": 0.291, + "step": 12330 + }, + { + "epoch": 0.548517580121794, + "grad_norm": 0.25003498792648315, + "learning_rate": 9.705872316022577e-05, + "loss": 0.2932, + "step": 12340 + }, + { + "epoch": 0.5489620838334001, + "grad_norm": 0.21327929198741913, + "learning_rate": 9.705173486187891e-05, + "loss": 0.2936, + "step": 12350 + }, + { + "epoch": 0.549406587545006, + "grad_norm": 0.21034300327301025, + "learning_rate": 9.704473852367741e-05, + "loss": 0.29, + "step": 12360 + }, + { + "epoch": 0.549851091256612, + "grad_norm": 0.24111104011535645, + "learning_rate": 9.70377341468168e-05, + "loss": 0.2926, + "step": 12370 + }, + { + "epoch": 0.550295594968218, + "grad_norm": 0.25966212153434753, + "learning_rate": 9.703072173249389e-05, + "loss": 0.2957, + "step": 12380 + }, + { + "epoch": 0.550740098679824, + "grad_norm": 0.21881182491779327, + "learning_rate": 9.702370128190693e-05, + "loss": 0.2884, + "step": 12390 + }, + { + "epoch": 0.55118460239143, + "grad_norm": 0.25685355067253113, + "learning_rate": 9.701667279625552e-05, + "loss": 0.2916, + "step": 12400 + }, + { + "epoch": 0.551629106103036, + "grad_norm": 0.24263980984687805, + "learning_rate": 9.700963627674065e-05, + "loss": 0.296, + "step": 12410 + }, + { + "epoch": 0.5520736098146419, + "grad_norm": 0.24528640508651733, + "learning_rate": 9.700259172456466e-05, + "loss": 0.2945, + "step": 12420 + }, + { + "epoch": 0.5525181135262479, + "grad_norm": 0.22516100108623505, + "learning_rate": 9.699553914093124e-05, + "loss": 0.2966, + "step": 12430 + }, + { + "epoch": 0.5529626172378539, + "grad_norm": 0.23158112168312073, + "learning_rate": 9.698847852704553e-05, + "loss": 0.2941, + "step": 12440 + }, + { + "epoch": 0.5534071209494599, + "grad_norm": 0.2101387232542038, + "learning_rate": 9.6981409884114e-05, + "loss": 0.2896, + "step": 12450 + }, + { + "epoch": 0.553851624661066, + "grad_norm": 0.2820063531398773, + "learning_rate": 9.697433321334443e-05, + "loss": 0.2951, + "step": 12460 + }, + { + "epoch": 0.554296128372672, + "grad_norm": 0.24684451520442963, + "learning_rate": 9.696724851594607e-05, + "loss": 0.2932, + "step": 12470 + }, + { + "epoch": 0.5547406320842779, + "grad_norm": 0.2418467253446579, + "learning_rate": 9.696015579312952e-05, + "loss": 0.2956, + "step": 12480 + }, + { + "epoch": 0.5551851357958839, + "grad_norm": 0.19561880826950073, + "learning_rate": 9.695305504610668e-05, + "loss": 0.2912, + "step": 12490 + }, + { + "epoch": 0.5556296395074899, + "grad_norm": 0.27593740820884705, + "learning_rate": 9.694594627609092e-05, + "loss": 0.291, + "step": 12500 + }, + { + "epoch": 0.5560741432190959, + "grad_norm": 0.25578129291534424, + "learning_rate": 9.693882948429691e-05, + "loss": 0.2956, + "step": 12510 + }, + { + "epoch": 0.5565186469307019, + "grad_norm": 0.2767714560031891, + "learning_rate": 9.693170467194071e-05, + "loss": 0.2927, + "step": 12520 + }, + { + "epoch": 0.5569631506423078, + "grad_norm": 0.21735799312591553, + "learning_rate": 9.692457184023977e-05, + "loss": 0.2918, + "step": 12530 + }, + { + "epoch": 0.5574076543539138, + "grad_norm": 0.22655624151229858, + "learning_rate": 9.691743099041291e-05, + "loss": 0.2941, + "step": 12540 + }, + { + "epoch": 0.5578521580655198, + "grad_norm": 0.19719785451889038, + "learning_rate": 9.691028212368027e-05, + "loss": 0.2955, + "step": 12550 + }, + { + "epoch": 0.5582966617771259, + "grad_norm": 0.18336382508277893, + "learning_rate": 9.690312524126342e-05, + "loss": 0.2898, + "step": 12560 + }, + { + "epoch": 0.5587411654887319, + "grad_norm": 0.23160527646541595, + "learning_rate": 9.689596034438527e-05, + "loss": 0.2949, + "step": 12570 + }, + { + "epoch": 0.5591856692003379, + "grad_norm": 0.2159404456615448, + "learning_rate": 9.688878743427012e-05, + "loss": 0.2952, + "step": 12580 + }, + { + "epoch": 0.5596301729119438, + "grad_norm": 0.251957505941391, + "learning_rate": 9.688160651214359e-05, + "loss": 0.2897, + "step": 12590 + }, + { + "epoch": 0.5600746766235498, + "grad_norm": 0.23546753823757172, + "learning_rate": 9.687441757923273e-05, + "loss": 0.2916, + "step": 12600 + }, + { + "epoch": 0.5605191803351558, + "grad_norm": 0.23205634951591492, + "learning_rate": 9.68672206367659e-05, + "loss": 0.2909, + "step": 12610 + }, + { + "epoch": 0.5609636840467618, + "grad_norm": 0.18839889764785767, + "learning_rate": 9.686001568597291e-05, + "loss": 0.2926, + "step": 12620 + }, + { + "epoch": 0.5614081877583678, + "grad_norm": 0.21483798325061798, + "learning_rate": 9.685280272808486e-05, + "loss": 0.2928, + "step": 12630 + }, + { + "epoch": 0.5618526914699737, + "grad_norm": 0.22099877893924713, + "learning_rate": 9.684558176433424e-05, + "loss": 0.2892, + "step": 12640 + }, + { + "epoch": 0.5622971951815797, + "grad_norm": 0.2757834196090698, + "learning_rate": 9.683835279595495e-05, + "loss": 0.2892, + "step": 12650 + }, + { + "epoch": 0.5627416988931857, + "grad_norm": 0.30728867650032043, + "learning_rate": 9.683111582418216e-05, + "loss": 0.2968, + "step": 12660 + }, + { + "epoch": 0.5631862026047918, + "grad_norm": 0.2396005243062973, + "learning_rate": 9.682387085025254e-05, + "loss": 0.2885, + "step": 12670 + }, + { + "epoch": 0.5636307063163978, + "grad_norm": 0.24524140357971191, + "learning_rate": 9.681661787540401e-05, + "loss": 0.2907, + "step": 12680 + }, + { + "epoch": 0.5640752100280038, + "grad_norm": 0.223764106631279, + "learning_rate": 9.680935690087593e-05, + "loss": 0.2942, + "step": 12690 + }, + { + "epoch": 0.5645197137396097, + "grad_norm": 0.21568983793258667, + "learning_rate": 9.680208792790901e-05, + "loss": 0.2921, + "step": 12700 + }, + { + "epoch": 0.5649642174512157, + "grad_norm": 0.233209490776062, + "learning_rate": 9.679481095774529e-05, + "loss": 0.2889, + "step": 12710 + }, + { + "epoch": 0.5654087211628217, + "grad_norm": 0.23102827370166779, + "learning_rate": 9.678752599162822e-05, + "loss": 0.2938, + "step": 12720 + }, + { + "epoch": 0.5658532248744277, + "grad_norm": 0.25606659054756165, + "learning_rate": 9.678023303080259e-05, + "loss": 0.2954, + "step": 12730 + }, + { + "epoch": 0.5662977285860337, + "grad_norm": 0.25866830348968506, + "learning_rate": 9.677293207651459e-05, + "loss": 0.2909, + "step": 12740 + }, + { + "epoch": 0.5667422322976396, + "grad_norm": 0.20850858092308044, + "learning_rate": 9.676562313001173e-05, + "loss": 0.291, + "step": 12750 + }, + { + "epoch": 0.5671867360092456, + "grad_norm": 0.2279546856880188, + "learning_rate": 9.675830619254293e-05, + "loss": 0.2928, + "step": 12760 + }, + { + "epoch": 0.5676312397208517, + "grad_norm": 0.20325301587581635, + "learning_rate": 9.675098126535843e-05, + "loss": 0.2896, + "step": 12770 + }, + { + "epoch": 0.5680757434324577, + "grad_norm": 0.24836884438991547, + "learning_rate": 9.674364834970988e-05, + "loss": 0.2959, + "step": 12780 + }, + { + "epoch": 0.5685202471440637, + "grad_norm": 0.2567458152770996, + "learning_rate": 9.673630744685028e-05, + "loss": 0.2956, + "step": 12790 + }, + { + "epoch": 0.5689647508556697, + "grad_norm": 0.24492938816547394, + "learning_rate": 9.672895855803397e-05, + "loss": 0.2953, + "step": 12800 + }, + { + "epoch": 0.5694092545672756, + "grad_norm": 0.2946721315383911, + "learning_rate": 9.672160168451667e-05, + "loss": 0.2955, + "step": 12810 + }, + { + "epoch": 0.5698537582788816, + "grad_norm": 0.300987184047699, + "learning_rate": 9.671423682755549e-05, + "loss": 0.2918, + "step": 12820 + }, + { + "epoch": 0.5702982619904876, + "grad_norm": 0.20916037261486053, + "learning_rate": 9.670686398840888e-05, + "loss": 0.2939, + "step": 12830 + }, + { + "epoch": 0.5707427657020936, + "grad_norm": 0.20441651344299316, + "learning_rate": 9.669948316833664e-05, + "loss": 0.2957, + "step": 12840 + }, + { + "epoch": 0.5711872694136996, + "grad_norm": 0.26659396290779114, + "learning_rate": 9.669209436859997e-05, + "loss": 0.2916, + "step": 12850 + }, + { + "epoch": 0.5716317731253056, + "grad_norm": 0.21438734233379364, + "learning_rate": 9.66846975904614e-05, + "loss": 0.2899, + "step": 12860 + }, + { + "epoch": 0.5720762768369115, + "grad_norm": 0.2145991176366806, + "learning_rate": 9.667729283518483e-05, + "loss": 0.2979, + "step": 12870 + }, + { + "epoch": 0.5725207805485176, + "grad_norm": 0.21771354973316193, + "learning_rate": 9.666988010403557e-05, + "loss": 0.2908, + "step": 12880 + }, + { + "epoch": 0.5729652842601236, + "grad_norm": 0.22167086601257324, + "learning_rate": 9.66624593982802e-05, + "loss": 0.2915, + "step": 12890 + }, + { + "epoch": 0.5734097879717296, + "grad_norm": 0.23611631989479065, + "learning_rate": 9.665503071918675e-05, + "loss": 0.2909, + "step": 12900 + }, + { + "epoch": 0.5738542916833356, + "grad_norm": 0.23636697232723236, + "learning_rate": 9.664759406802456e-05, + "loss": 0.2942, + "step": 12910 + }, + { + "epoch": 0.5742987953949416, + "grad_norm": 0.2369667887687683, + "learning_rate": 9.664014944606437e-05, + "loss": 0.2945, + "step": 12920 + }, + { + "epoch": 0.5747432991065475, + "grad_norm": 0.2105274796485901, + "learning_rate": 9.663269685457822e-05, + "loss": 0.2897, + "step": 12930 + }, + { + "epoch": 0.5751878028181535, + "grad_norm": 0.23276376724243164, + "learning_rate": 9.662523629483962e-05, + "loss": 0.292, + "step": 12940 + }, + { + "epoch": 0.5756323065297595, + "grad_norm": 0.24027150869369507, + "learning_rate": 9.661776776812333e-05, + "loss": 0.2909, + "step": 12950 + }, + { + "epoch": 0.5760768102413655, + "grad_norm": 0.26251834630966187, + "learning_rate": 9.661029127570553e-05, + "loss": 0.2903, + "step": 12960 + }, + { + "epoch": 0.5765213139529715, + "grad_norm": 0.23672805726528168, + "learning_rate": 9.660280681886373e-05, + "loss": 0.2933, + "step": 12970 + }, + { + "epoch": 0.5769658176645776, + "grad_norm": 0.22331362962722778, + "learning_rate": 9.659531439887685e-05, + "loss": 0.2937, + "step": 12980 + }, + { + "epoch": 0.5774103213761835, + "grad_norm": 0.24839992821216583, + "learning_rate": 9.658781401702511e-05, + "loss": 0.2921, + "step": 12990 + }, + { + "epoch": 0.5778548250877895, + "grad_norm": 0.21064649522304535, + "learning_rate": 9.658030567459015e-05, + "loss": 0.2926, + "step": 13000 + }, + { + "epoch": 0.5782993287993955, + "grad_norm": 0.22921045124530792, + "learning_rate": 9.65727893728549e-05, + "loss": 0.2929, + "step": 13010 + }, + { + "epoch": 0.5787438325110015, + "grad_norm": 0.22505544126033783, + "learning_rate": 9.656526511310375e-05, + "loss": 0.2925, + "step": 13020 + }, + { + "epoch": 0.5791883362226075, + "grad_norm": 0.2924664318561554, + "learning_rate": 9.655773289662233e-05, + "loss": 0.2916, + "step": 13030 + }, + { + "epoch": 0.5796328399342134, + "grad_norm": 0.24794597923755646, + "learning_rate": 9.655019272469772e-05, + "loss": 0.2933, + "step": 13040 + }, + { + "epoch": 0.5800773436458194, + "grad_norm": 0.21423020958900452, + "learning_rate": 9.654264459861832e-05, + "loss": 0.2925, + "step": 13050 + }, + { + "epoch": 0.5805218473574254, + "grad_norm": 0.24917946755886078, + "learning_rate": 9.653508851967391e-05, + "loss": 0.2915, + "step": 13060 + }, + { + "epoch": 0.5809663510690314, + "grad_norm": 0.18149971961975098, + "learning_rate": 9.65275244891556e-05, + "loss": 0.2924, + "step": 13070 + }, + { + "epoch": 0.5814108547806374, + "grad_norm": 0.2497694492340088, + "learning_rate": 9.651995250835591e-05, + "loss": 0.2911, + "step": 13080 + }, + { + "epoch": 0.5818553584922435, + "grad_norm": 0.2242593616247177, + "learning_rate": 9.651237257856862e-05, + "loss": 0.2955, + "step": 13090 + }, + { + "epoch": 0.5822998622038494, + "grad_norm": 0.22217150032520294, + "learning_rate": 9.6504784701089e-05, + "loss": 0.2922, + "step": 13100 + }, + { + "epoch": 0.5827443659154554, + "grad_norm": 0.22610501945018768, + "learning_rate": 9.649718887721357e-05, + "loss": 0.2937, + "step": 13110 + }, + { + "epoch": 0.5831888696270614, + "grad_norm": 0.20916247367858887, + "learning_rate": 9.648958510824028e-05, + "loss": 0.2947, + "step": 13120 + }, + { + "epoch": 0.5836333733386674, + "grad_norm": 0.22473081946372986, + "learning_rate": 9.648197339546837e-05, + "loss": 0.2951, + "step": 13130 + }, + { + "epoch": 0.5840778770502734, + "grad_norm": 0.20731133222579956, + "learning_rate": 9.647435374019851e-05, + "loss": 0.2907, + "step": 13140 + }, + { + "epoch": 0.5845223807618793, + "grad_norm": 0.23594151437282562, + "learning_rate": 9.646672614373266e-05, + "loss": 0.2939, + "step": 13150 + }, + { + "epoch": 0.5849668844734853, + "grad_norm": 0.22792035341262817, + "learning_rate": 9.645909060737418e-05, + "loss": 0.2923, + "step": 13160 + }, + { + "epoch": 0.5854113881850913, + "grad_norm": 0.17636287212371826, + "learning_rate": 9.645144713242778e-05, + "loss": 0.2913, + "step": 13170 + }, + { + "epoch": 0.5858558918966973, + "grad_norm": 0.22215153276920319, + "learning_rate": 9.64437957201995e-05, + "loss": 0.292, + "step": 13180 + }, + { + "epoch": 0.5863003956083034, + "grad_norm": 0.2073495239019394, + "learning_rate": 9.643613637199678e-05, + "loss": 0.2932, + "step": 13190 + }, + { + "epoch": 0.5867448993199094, + "grad_norm": 0.23554739356040955, + "learning_rate": 9.642846908912839e-05, + "loss": 0.2953, + "step": 13200 + }, + { + "epoch": 0.5871894030315153, + "grad_norm": 0.23514990508556366, + "learning_rate": 9.642079387290444e-05, + "loss": 0.2901, + "step": 13210 + }, + { + "epoch": 0.5876339067431213, + "grad_norm": 0.2045503854751587, + "learning_rate": 9.641311072463644e-05, + "loss": 0.2934, + "step": 13220 + }, + { + "epoch": 0.5880784104547273, + "grad_norm": 0.24406351149082184, + "learning_rate": 9.640541964563722e-05, + "loss": 0.2946, + "step": 13230 + }, + { + "epoch": 0.5885229141663333, + "grad_norm": 0.2146412581205368, + "learning_rate": 9.639772063722096e-05, + "loss": 0.2936, + "step": 13240 + }, + { + "epoch": 0.5889674178779393, + "grad_norm": 0.252351850271225, + "learning_rate": 9.639001370070324e-05, + "loss": 0.2924, + "step": 13250 + }, + { + "epoch": 0.5894119215895453, + "grad_norm": 0.21913199126720428, + "learning_rate": 9.638229883740095e-05, + "loss": 0.2923, + "step": 13260 + }, + { + "epoch": 0.5898564253011512, + "grad_norm": 0.29293736815452576, + "learning_rate": 9.637457604863233e-05, + "loss": 0.2956, + "step": 13270 + }, + { + "epoch": 0.5903009290127572, + "grad_norm": 0.24902328848838806, + "learning_rate": 9.636684533571703e-05, + "loss": 0.2929, + "step": 13280 + }, + { + "epoch": 0.5907454327243632, + "grad_norm": 0.23993617296218872, + "learning_rate": 9.635910669997599e-05, + "loss": 0.2928, + "step": 13290 + }, + { + "epoch": 0.5911899364359693, + "grad_norm": 0.2508469521999359, + "learning_rate": 9.635136014273154e-05, + "loss": 0.2955, + "step": 13300 + }, + { + "epoch": 0.5916344401475753, + "grad_norm": 0.2342986762523651, + "learning_rate": 9.634360566530735e-05, + "loss": 0.2889, + "step": 13310 + }, + { + "epoch": 0.5920789438591813, + "grad_norm": 0.2091214954853058, + "learning_rate": 9.633584326902845e-05, + "loss": 0.2898, + "step": 13320 + }, + { + "epoch": 0.5925234475707872, + "grad_norm": 0.1979387104511261, + "learning_rate": 9.632807295522124e-05, + "loss": 0.2928, + "step": 13330 + }, + { + "epoch": 0.5929679512823932, + "grad_norm": 0.25069698691368103, + "learning_rate": 9.632029472521342e-05, + "loss": 0.2903, + "step": 13340 + }, + { + "epoch": 0.5934124549939992, + "grad_norm": 0.26270222663879395, + "learning_rate": 9.631250858033409e-05, + "loss": 0.2959, + "step": 13350 + }, + { + "epoch": 0.5938569587056052, + "grad_norm": 0.21734240651130676, + "learning_rate": 9.630471452191371e-05, + "loss": 0.2932, + "step": 13360 + }, + { + "epoch": 0.5943014624172112, + "grad_norm": 0.2515488266944885, + "learning_rate": 9.629691255128405e-05, + "loss": 0.2924, + "step": 13370 + }, + { + "epoch": 0.5947459661288171, + "grad_norm": 0.2586407959461212, + "learning_rate": 9.628910266977825e-05, + "loss": 0.2947, + "step": 13380 + }, + { + "epoch": 0.5951904698404231, + "grad_norm": 0.21888893842697144, + "learning_rate": 9.628128487873083e-05, + "loss": 0.2935, + "step": 13390 + }, + { + "epoch": 0.5956349735520292, + "grad_norm": 0.2218107134103775, + "learning_rate": 9.627345917947761e-05, + "loss": 0.2917, + "step": 13400 + }, + { + "epoch": 0.5960794772636352, + "grad_norm": 0.2722899317741394, + "learning_rate": 9.626562557335579e-05, + "loss": 0.295, + "step": 13410 + }, + { + "epoch": 0.5965239809752412, + "grad_norm": 0.2192695587873459, + "learning_rate": 9.625778406170393e-05, + "loss": 0.2918, + "step": 13420 + }, + { + "epoch": 0.5969684846868472, + "grad_norm": 0.28228551149368286, + "learning_rate": 9.624993464586193e-05, + "loss": 0.291, + "step": 13430 + }, + { + "epoch": 0.5974129883984531, + "grad_norm": 0.2721642553806305, + "learning_rate": 9.624207732717105e-05, + "loss": 0.2955, + "step": 13440 + }, + { + "epoch": 0.5978574921100591, + "grad_norm": 0.24658510088920593, + "learning_rate": 9.623421210697386e-05, + "loss": 0.2922, + "step": 13450 + }, + { + "epoch": 0.5983019958216651, + "grad_norm": 0.28427234292030334, + "learning_rate": 9.622633898661434e-05, + "loss": 0.2959, + "step": 13460 + }, + { + "epoch": 0.5987464995332711, + "grad_norm": 0.24708615243434906, + "learning_rate": 9.621845796743778e-05, + "loss": 0.2939, + "step": 13470 + }, + { + "epoch": 0.5991910032448771, + "grad_norm": 0.21128752827644348, + "learning_rate": 9.621056905079082e-05, + "loss": 0.2935, + "step": 13480 + }, + { + "epoch": 0.599635506956483, + "grad_norm": 0.26649633049964905, + "learning_rate": 9.620267223802149e-05, + "loss": 0.2975, + "step": 13490 + }, + { + "epoch": 0.600080010668089, + "grad_norm": 0.2462792545557022, + "learning_rate": 9.619476753047911e-05, + "loss": 0.2923, + "step": 13500 + }, + { + "epoch": 0.6005245143796951, + "grad_norm": 0.19200995564460754, + "learning_rate": 9.618685492951438e-05, + "loss": 0.2941, + "step": 13510 + }, + { + "epoch": 0.6009690180913011, + "grad_norm": 0.23851639032363892, + "learning_rate": 9.617893443647938e-05, + "loss": 0.2954, + "step": 13520 + }, + { + "epoch": 0.6014135218029071, + "grad_norm": 0.21056975424289703, + "learning_rate": 9.617100605272746e-05, + "loss": 0.2917, + "step": 13530 + }, + { + "epoch": 0.6018580255145131, + "grad_norm": 0.22378350794315338, + "learning_rate": 9.616306977961338e-05, + "loss": 0.2949, + "step": 13540 + }, + { + "epoch": 0.602302529226119, + "grad_norm": 0.20829790830612183, + "learning_rate": 9.615512561849326e-05, + "loss": 0.2946, + "step": 13550 + }, + { + "epoch": 0.602747032937725, + "grad_norm": 0.18553480505943298, + "learning_rate": 9.61471735707245e-05, + "loss": 0.2927, + "step": 13560 + }, + { + "epoch": 0.603191536649331, + "grad_norm": 0.19074584543704987, + "learning_rate": 9.613921363766592e-05, + "loss": 0.2941, + "step": 13570 + }, + { + "epoch": 0.603636040360937, + "grad_norm": 0.20161645114421844, + "learning_rate": 9.613124582067763e-05, + "loss": 0.2968, + "step": 13580 + }, + { + "epoch": 0.604080544072543, + "grad_norm": 0.22098539769649506, + "learning_rate": 9.612327012112112e-05, + "loss": 0.2936, + "step": 13590 + }, + { + "epoch": 0.604525047784149, + "grad_norm": 0.1977303922176361, + "learning_rate": 9.611528654035921e-05, + "loss": 0.2939, + "step": 13600 + }, + { + "epoch": 0.604969551495755, + "grad_norm": 0.22272013127803802, + "learning_rate": 9.610729507975611e-05, + "loss": 0.2952, + "step": 13610 + }, + { + "epoch": 0.605414055207361, + "grad_norm": 0.1849678009748459, + "learning_rate": 9.609929574067731e-05, + "loss": 0.2935, + "step": 13620 + }, + { + "epoch": 0.605858558918967, + "grad_norm": 0.24855802953243256, + "learning_rate": 9.609128852448967e-05, + "loss": 0.2939, + "step": 13630 + }, + { + "epoch": 0.606303062630573, + "grad_norm": 0.210510715842247, + "learning_rate": 9.608327343256143e-05, + "loss": 0.2937, + "step": 13640 + }, + { + "epoch": 0.606747566342179, + "grad_norm": 0.2504960894584656, + "learning_rate": 9.607525046626216e-05, + "loss": 0.2955, + "step": 13650 + }, + { + "epoch": 0.607192070053785, + "grad_norm": 0.23873715102672577, + "learning_rate": 9.606721962696272e-05, + "loss": 0.2913, + "step": 13660 + }, + { + "epoch": 0.6076365737653909, + "grad_norm": 0.23605088889598846, + "learning_rate": 9.60591809160354e-05, + "loss": 0.2889, + "step": 13670 + }, + { + "epoch": 0.6080810774769969, + "grad_norm": 0.2491219937801361, + "learning_rate": 9.605113433485378e-05, + "loss": 0.2942, + "step": 13680 + }, + { + "epoch": 0.6085255811886029, + "grad_norm": 0.23628704249858856, + "learning_rate": 9.604307988479279e-05, + "loss": 0.2929, + "step": 13690 + }, + { + "epoch": 0.6089700849002089, + "grad_norm": 0.2023860365152359, + "learning_rate": 9.603501756722876e-05, + "loss": 0.2919, + "step": 13700 + }, + { + "epoch": 0.6094145886118149, + "grad_norm": 0.2388572096824646, + "learning_rate": 9.602694738353927e-05, + "loss": 0.2945, + "step": 13710 + }, + { + "epoch": 0.609859092323421, + "grad_norm": 0.2436075210571289, + "learning_rate": 9.601886933510331e-05, + "loss": 0.2926, + "step": 13720 + }, + { + "epoch": 0.6103035960350269, + "grad_norm": 0.21807387471199036, + "learning_rate": 9.60107834233012e-05, + "loss": 0.2933, + "step": 13730 + }, + { + "epoch": 0.6107480997466329, + "grad_norm": 0.24790386855602264, + "learning_rate": 9.60026896495146e-05, + "loss": 0.2905, + "step": 13740 + }, + { + "epoch": 0.6111926034582389, + "grad_norm": 0.23644591867923737, + "learning_rate": 9.599458801512652e-05, + "loss": 0.293, + "step": 13750 + }, + { + "epoch": 0.6116371071698449, + "grad_norm": 0.20205363631248474, + "learning_rate": 9.598647852152129e-05, + "loss": 0.2883, + "step": 13760 + }, + { + "epoch": 0.6120816108814509, + "grad_norm": 0.2291758954524994, + "learning_rate": 9.597836117008462e-05, + "loss": 0.2912, + "step": 13770 + }, + { + "epoch": 0.6125261145930568, + "grad_norm": 0.21136589348316193, + "learning_rate": 9.597023596220356e-05, + "loss": 0.2933, + "step": 13780 + }, + { + "epoch": 0.6129706183046628, + "grad_norm": 0.27605098485946655, + "learning_rate": 9.596210289926643e-05, + "loss": 0.2913, + "step": 13790 + }, + { + "epoch": 0.6134151220162688, + "grad_norm": 0.18874259293079376, + "learning_rate": 9.5953961982663e-05, + "loss": 0.292, + "step": 13800 + }, + { + "epoch": 0.6138596257278748, + "grad_norm": 0.23314709961414337, + "learning_rate": 9.594581321378431e-05, + "loss": 0.2909, + "step": 13810 + }, + { + "epoch": 0.6143041294394809, + "grad_norm": 0.25570550560951233, + "learning_rate": 9.593765659402276e-05, + "loss": 0.2918, + "step": 13820 + }, + { + "epoch": 0.6147486331510869, + "grad_norm": 0.23088257014751434, + "learning_rate": 9.59294921247721e-05, + "loss": 0.2915, + "step": 13830 + }, + { + "epoch": 0.6151931368626928, + "grad_norm": 0.24342095851898193, + "learning_rate": 9.59213198074274e-05, + "loss": 0.2932, + "step": 13840 + }, + { + "epoch": 0.6156376405742988, + "grad_norm": 0.18993878364562988, + "learning_rate": 9.59131396433851e-05, + "loss": 0.2943, + "step": 13850 + }, + { + "epoch": 0.6160821442859048, + "grad_norm": 0.208570197224617, + "learning_rate": 9.590495163404297e-05, + "loss": 0.2946, + "step": 13860 + }, + { + "epoch": 0.6165266479975108, + "grad_norm": 0.23513616621494293, + "learning_rate": 9.589675578080009e-05, + "loss": 0.2908, + "step": 13870 + }, + { + "epoch": 0.6169711517091168, + "grad_norm": 0.19924335181713104, + "learning_rate": 9.588855208505694e-05, + "loss": 0.2913, + "step": 13880 + }, + { + "epoch": 0.6174156554207227, + "grad_norm": 0.20710091292858124, + "learning_rate": 9.588034054821529e-05, + "loss": 0.2905, + "step": 13890 + }, + { + "epoch": 0.6178601591323287, + "grad_norm": 0.23125386238098145, + "learning_rate": 9.587212117167826e-05, + "loss": 0.2933, + "step": 13900 + }, + { + "epoch": 0.6183046628439347, + "grad_norm": 0.2200896441936493, + "learning_rate": 9.586389395685033e-05, + "loss": 0.2932, + "step": 13910 + }, + { + "epoch": 0.6187491665555407, + "grad_norm": 0.24554872512817383, + "learning_rate": 9.585565890513733e-05, + "loss": 0.2916, + "step": 13920 + }, + { + "epoch": 0.6191936702671468, + "grad_norm": 0.23980040848255157, + "learning_rate": 9.584741601794636e-05, + "loss": 0.2887, + "step": 13930 + }, + { + "epoch": 0.6196381739787528, + "grad_norm": 0.2679573893547058, + "learning_rate": 9.58391652966859e-05, + "loss": 0.2935, + "step": 13940 + }, + { + "epoch": 0.6200826776903587, + "grad_norm": 0.22600266337394714, + "learning_rate": 9.583090674276583e-05, + "loss": 0.2946, + "step": 13950 + }, + { + "epoch": 0.6205271814019647, + "grad_norm": 0.2285483181476593, + "learning_rate": 9.582264035759726e-05, + "loss": 0.2935, + "step": 13960 + }, + { + "epoch": 0.6209716851135707, + "grad_norm": 0.22103048861026764, + "learning_rate": 9.58143661425927e-05, + "loss": 0.2919, + "step": 13970 + }, + { + "epoch": 0.6214161888251767, + "grad_norm": 0.2737506330013275, + "learning_rate": 9.580608409916601e-05, + "loss": 0.2946, + "step": 13980 + }, + { + "epoch": 0.6218606925367827, + "grad_norm": 0.21250726282596588, + "learning_rate": 9.579779422873233e-05, + "loss": 0.2891, + "step": 13990 + }, + { + "epoch": 0.6223051962483886, + "grad_norm": 0.3101194500923157, + "learning_rate": 9.578949653270819e-05, + "loss": 0.2964, + "step": 14000 + }, + { + "epoch": 0.6227496999599946, + "grad_norm": 0.21635420620441437, + "learning_rate": 9.578119101251144e-05, + "loss": 0.2905, + "step": 14010 + }, + { + "epoch": 0.6231942036716006, + "grad_norm": 0.2211102843284607, + "learning_rate": 9.577287766956127e-05, + "loss": 0.2927, + "step": 14020 + }, + { + "epoch": 0.6236387073832067, + "grad_norm": 0.1869858056306839, + "learning_rate": 9.57645565052782e-05, + "loss": 0.2902, + "step": 14030 + }, + { + "epoch": 0.6240832110948127, + "grad_norm": 0.2566467225551605, + "learning_rate": 9.575622752108407e-05, + "loss": 0.2917, + "step": 14040 + }, + { + "epoch": 0.6245277148064187, + "grad_norm": 0.21392905712127686, + "learning_rate": 9.57478907184021e-05, + "loss": 0.2921, + "step": 14050 + }, + { + "epoch": 0.6249722185180246, + "grad_norm": 0.23270486295223236, + "learning_rate": 9.573954609865681e-05, + "loss": 0.294, + "step": 14060 + }, + { + "epoch": 0.6254167222296306, + "grad_norm": 0.21338635683059692, + "learning_rate": 9.573119366327408e-05, + "loss": 0.2912, + "step": 14070 + }, + { + "epoch": 0.6258612259412366, + "grad_norm": 0.22177354991436005, + "learning_rate": 9.57228334136811e-05, + "loss": 0.2906, + "step": 14080 + }, + { + "epoch": 0.6263057296528426, + "grad_norm": 0.221465602517128, + "learning_rate": 9.571446535130641e-05, + "loss": 0.288, + "step": 14090 + }, + { + "epoch": 0.6267502333644486, + "grad_norm": 0.19387534260749817, + "learning_rate": 9.570608947757988e-05, + "loss": 0.2915, + "step": 14100 + }, + { + "epoch": 0.6271947370760546, + "grad_norm": 0.2042166143655777, + "learning_rate": 9.569770579393274e-05, + "loss": 0.2913, + "step": 14110 + }, + { + "epoch": 0.6276392407876605, + "grad_norm": 0.22024546563625336, + "learning_rate": 9.56893143017975e-05, + "loss": 0.2882, + "step": 14120 + }, + { + "epoch": 0.6280837444992665, + "grad_norm": 0.22668105363845825, + "learning_rate": 9.568091500260806e-05, + "loss": 0.2911, + "step": 14130 + }, + { + "epoch": 0.6285282482108726, + "grad_norm": 0.2528311312198639, + "learning_rate": 9.567250789779961e-05, + "loss": 0.2903, + "step": 14140 + }, + { + "epoch": 0.6289727519224786, + "grad_norm": 0.2108316719532013, + "learning_rate": 9.566409298880872e-05, + "loss": 0.2931, + "step": 14150 + }, + { + "epoch": 0.6294172556340846, + "grad_norm": 0.24099969863891602, + "learning_rate": 9.565567027707326e-05, + "loss": 0.2901, + "step": 14160 + }, + { + "epoch": 0.6298617593456906, + "grad_norm": 0.21072988212108612, + "learning_rate": 9.56472397640324e-05, + "loss": 0.2955, + "step": 14170 + }, + { + "epoch": 0.6303062630572965, + "grad_norm": 0.23889514803886414, + "learning_rate": 9.563880145112675e-05, + "loss": 0.2916, + "step": 14180 + }, + { + "epoch": 0.6307507667689025, + "grad_norm": 0.2909705936908722, + "learning_rate": 9.563035533979814e-05, + "loss": 0.2929, + "step": 14190 + }, + { + "epoch": 0.6311952704805085, + "grad_norm": 0.2501347064971924, + "learning_rate": 9.562190143148981e-05, + "loss": 0.2925, + "step": 14200 + }, + { + "epoch": 0.6316397741921145, + "grad_norm": 0.23779483139514923, + "learning_rate": 9.561343972764627e-05, + "loss": 0.2938, + "step": 14210 + }, + { + "epoch": 0.6320842779037205, + "grad_norm": 0.24885855615139008, + "learning_rate": 9.560497022971343e-05, + "loss": 0.2886, + "step": 14220 + }, + { + "epoch": 0.6325287816153264, + "grad_norm": 0.29619598388671875, + "learning_rate": 9.559649293913847e-05, + "loss": 0.2897, + "step": 14230 + }, + { + "epoch": 0.6329732853269325, + "grad_norm": 0.253176748752594, + "learning_rate": 9.558800785736993e-05, + "loss": 0.2931, + "step": 14240 + }, + { + "epoch": 0.6334177890385385, + "grad_norm": 0.25891977548599243, + "learning_rate": 9.557951498585767e-05, + "loss": 0.294, + "step": 14250 + }, + { + "epoch": 0.6338622927501445, + "grad_norm": 0.20830971002578735, + "learning_rate": 9.557101432605293e-05, + "loss": 0.2924, + "step": 14260 + }, + { + "epoch": 0.6343067964617505, + "grad_norm": 0.22559061646461487, + "learning_rate": 9.556250587940818e-05, + "loss": 0.2909, + "step": 14270 + }, + { + "epoch": 0.6347513001733565, + "grad_norm": 0.20921248197555542, + "learning_rate": 9.555398964737734e-05, + "loss": 0.2929, + "step": 14280 + }, + { + "epoch": 0.6351958038849624, + "grad_norm": 0.21230767667293549, + "learning_rate": 9.554546563141555e-05, + "loss": 0.2908, + "step": 14290 + }, + { + "epoch": 0.6356403075965684, + "grad_norm": 0.2028498500585556, + "learning_rate": 9.553693383297937e-05, + "loss": 0.2926, + "step": 14300 + }, + { + "epoch": 0.6360848113081744, + "grad_norm": 0.2349056750535965, + "learning_rate": 9.552839425352663e-05, + "loss": 0.2908, + "step": 14310 + }, + { + "epoch": 0.6365293150197804, + "grad_norm": 0.19671319425106049, + "learning_rate": 9.551984689451652e-05, + "loss": 0.2913, + "step": 14320 + }, + { + "epoch": 0.6369738187313864, + "grad_norm": 0.2429915964603424, + "learning_rate": 9.551129175740953e-05, + "loss": 0.2961, + "step": 14330 + }, + { + "epoch": 0.6374183224429923, + "grad_norm": 0.27373942732810974, + "learning_rate": 9.550272884366754e-05, + "loss": 0.2939, + "step": 14340 + }, + { + "epoch": 0.6378628261545984, + "grad_norm": 0.22703099250793457, + "learning_rate": 9.549415815475369e-05, + "loss": 0.2922, + "step": 14350 + }, + { + "epoch": 0.6383073298662044, + "grad_norm": 0.23448185622692108, + "learning_rate": 9.548557969213247e-05, + "loss": 0.2942, + "step": 14360 + }, + { + "epoch": 0.6387518335778104, + "grad_norm": 0.24584273993968964, + "learning_rate": 9.547699345726972e-05, + "loss": 0.2923, + "step": 14370 + }, + { + "epoch": 0.6391963372894164, + "grad_norm": 0.2418471723794937, + "learning_rate": 9.546839945163257e-05, + "loss": 0.2932, + "step": 14380 + }, + { + "epoch": 0.6396408410010224, + "grad_norm": 0.2441529929637909, + "learning_rate": 9.545979767668953e-05, + "loss": 0.2928, + "step": 14390 + }, + { + "epoch": 0.6400853447126283, + "grad_norm": 0.1952664703130722, + "learning_rate": 9.54511881339104e-05, + "loss": 0.291, + "step": 14400 + }, + { + "epoch": 0.6405298484242343, + "grad_norm": 0.2221074104309082, + "learning_rate": 9.54425708247663e-05, + "loss": 0.2944, + "step": 14410 + }, + { + "epoch": 0.6409743521358403, + "grad_norm": 0.21778002381324768, + "learning_rate": 9.543394575072972e-05, + "loss": 0.2891, + "step": 14420 + }, + { + "epoch": 0.6414188558474463, + "grad_norm": 0.21125002205371857, + "learning_rate": 9.542531291327441e-05, + "loss": 0.2891, + "step": 14430 + }, + { + "epoch": 0.6418633595590523, + "grad_norm": 0.27299436926841736, + "learning_rate": 9.541667231387552e-05, + "loss": 0.2927, + "step": 14440 + }, + { + "epoch": 0.6423078632706584, + "grad_norm": 0.2517746388912201, + "learning_rate": 9.540802395400949e-05, + "loss": 0.2938, + "step": 14450 + }, + { + "epoch": 0.6427523669822643, + "grad_norm": 0.2190462201833725, + "learning_rate": 9.539936783515406e-05, + "loss": 0.2904, + "step": 14460 + }, + { + "epoch": 0.6431968706938703, + "grad_norm": 0.2017941176891327, + "learning_rate": 9.539070395878835e-05, + "loss": 0.2912, + "step": 14470 + }, + { + "epoch": 0.6436413744054763, + "grad_norm": 0.22620974481105804, + "learning_rate": 9.538203232639277e-05, + "loss": 0.2911, + "step": 14480 + }, + { + "epoch": 0.6440858781170823, + "grad_norm": 0.22500421106815338, + "learning_rate": 9.537335293944907e-05, + "loss": 0.2917, + "step": 14490 + }, + { + "epoch": 0.6445303818286883, + "grad_norm": 0.25869783759117126, + "learning_rate": 9.536466579944032e-05, + "loss": 0.2893, + "step": 14500 + }, + { + "epoch": 0.6449748855402943, + "grad_norm": 0.242716446518898, + "learning_rate": 9.535597090785091e-05, + "loss": 0.2896, + "step": 14510 + }, + { + "epoch": 0.6454193892519002, + "grad_norm": 0.222727432847023, + "learning_rate": 9.534726826616656e-05, + "loss": 0.2884, + "step": 14520 + }, + { + "epoch": 0.6458638929635062, + "grad_norm": 0.21931979060173035, + "learning_rate": 9.53385578758743e-05, + "loss": 0.2924, + "step": 14530 + }, + { + "epoch": 0.6463083966751122, + "grad_norm": 0.20828424394130707, + "learning_rate": 9.532983973846252e-05, + "loss": 0.2931, + "step": 14540 + }, + { + "epoch": 0.6467529003867182, + "grad_norm": 0.21044687926769257, + "learning_rate": 9.53211138554209e-05, + "loss": 0.2926, + "step": 14550 + }, + { + "epoch": 0.6471974040983243, + "grad_norm": 0.17766281962394714, + "learning_rate": 9.531238022824047e-05, + "loss": 0.2911, + "step": 14560 + }, + { + "epoch": 0.6476419078099303, + "grad_norm": 0.19698946177959442, + "learning_rate": 9.530363885841355e-05, + "loss": 0.2925, + "step": 14570 + }, + { + "epoch": 0.6480864115215362, + "grad_norm": 0.20408159494400024, + "learning_rate": 9.52948897474338e-05, + "loss": 0.2913, + "step": 14580 + }, + { + "epoch": 0.6485309152331422, + "grad_norm": 0.23600099980831146, + "learning_rate": 9.528613289679622e-05, + "loss": 0.294, + "step": 14590 + }, + { + "epoch": 0.6489754189447482, + "grad_norm": 0.17637085914611816, + "learning_rate": 9.52773683079971e-05, + "loss": 0.2908, + "step": 14600 + }, + { + "epoch": 0.6494199226563542, + "grad_norm": 0.19947145879268646, + "learning_rate": 9.526859598253407e-05, + "loss": 0.2887, + "step": 14610 + }, + { + "epoch": 0.6498644263679602, + "grad_norm": 0.21139349043369293, + "learning_rate": 9.525981592190609e-05, + "loss": 0.2885, + "step": 14620 + }, + { + "epoch": 0.6503089300795661, + "grad_norm": 0.22897647321224213, + "learning_rate": 9.525102812761342e-05, + "loss": 0.2943, + "step": 14630 + }, + { + "epoch": 0.6507534337911721, + "grad_norm": 0.20887704193592072, + "learning_rate": 9.524223260115768e-05, + "loss": 0.2902, + "step": 14640 + }, + { + "epoch": 0.6511979375027781, + "grad_norm": 0.21517477929592133, + "learning_rate": 9.523342934404175e-05, + "loss": 0.289, + "step": 14650 + }, + { + "epoch": 0.6516424412143842, + "grad_norm": 0.20503242313861847, + "learning_rate": 9.522461835776989e-05, + "loss": 0.2922, + "step": 14660 + }, + { + "epoch": 0.6520869449259902, + "grad_norm": 0.24324099719524384, + "learning_rate": 9.521579964384764e-05, + "loss": 0.2944, + "step": 14670 + }, + { + "epoch": 0.6525314486375962, + "grad_norm": 0.22146524488925934, + "learning_rate": 9.52069732037819e-05, + "loss": 0.2918, + "step": 14680 + }, + { + "epoch": 0.6529759523492021, + "grad_norm": 0.21615010499954224, + "learning_rate": 9.519813903908083e-05, + "loss": 0.2884, + "step": 14690 + }, + { + "epoch": 0.6534204560608081, + "grad_norm": 0.21874581277370453, + "learning_rate": 9.5189297151254e-05, + "loss": 0.2906, + "step": 14700 + }, + { + "epoch": 0.6538649597724141, + "grad_norm": 0.1962144672870636, + "learning_rate": 9.518044754181218e-05, + "loss": 0.2959, + "step": 14710 + }, + { + "epoch": 0.6543094634840201, + "grad_norm": 0.18904908001422882, + "learning_rate": 9.51715902122676e-05, + "loss": 0.2924, + "step": 14720 + }, + { + "epoch": 0.6547539671956261, + "grad_norm": 0.2235316038131714, + "learning_rate": 9.516272516413368e-05, + "loss": 0.2952, + "step": 14730 + }, + { + "epoch": 0.655198470907232, + "grad_norm": 0.22245411574840546, + "learning_rate": 9.515385239892525e-05, + "loss": 0.2965, + "step": 14740 + }, + { + "epoch": 0.655642974618838, + "grad_norm": 0.21350640058517456, + "learning_rate": 9.514497191815839e-05, + "loss": 0.2917, + "step": 14750 + }, + { + "epoch": 0.656087478330444, + "grad_norm": 0.20212040841579437, + "learning_rate": 9.513608372335055e-05, + "loss": 0.292, + "step": 14760 + }, + { + "epoch": 0.6565319820420501, + "grad_norm": 0.22116468846797943, + "learning_rate": 9.512718781602045e-05, + "loss": 0.2949, + "step": 14770 + }, + { + "epoch": 0.6569764857536561, + "grad_norm": 0.1938534826040268, + "learning_rate": 9.511828419768823e-05, + "loss": 0.2927, + "step": 14780 + }, + { + "epoch": 0.6574209894652621, + "grad_norm": 0.24380844831466675, + "learning_rate": 9.510937286987521e-05, + "loss": 0.2909, + "step": 14790 + }, + { + "epoch": 0.657865493176868, + "grad_norm": 0.2212378829717636, + "learning_rate": 9.510045383410408e-05, + "loss": 0.2898, + "step": 14800 + }, + { + "epoch": 0.658309996888474, + "grad_norm": 0.21420595049858093, + "learning_rate": 9.509152709189892e-05, + "loss": 0.2897, + "step": 14810 + }, + { + "epoch": 0.65875450060008, + "grad_norm": 0.21828797459602356, + "learning_rate": 9.508259264478504e-05, + "loss": 0.2884, + "step": 14820 + }, + { + "epoch": 0.659199004311686, + "grad_norm": 0.2795988619327545, + "learning_rate": 9.507365049428909e-05, + "loss": 0.2904, + "step": 14830 + }, + { + "epoch": 0.659643508023292, + "grad_norm": 0.2507339417934418, + "learning_rate": 9.506470064193902e-05, + "loss": 0.2903, + "step": 14840 + }, + { + "epoch": 0.660088011734898, + "grad_norm": 0.17130638659000397, + "learning_rate": 9.505574308926414e-05, + "loss": 0.2935, + "step": 14850 + }, + { + "epoch": 0.6605325154465039, + "grad_norm": 0.23332062363624573, + "learning_rate": 9.504677783779505e-05, + "loss": 0.29, + "step": 14860 + }, + { + "epoch": 0.66097701915811, + "grad_norm": 0.24144849181175232, + "learning_rate": 9.503780488906365e-05, + "loss": 0.288, + "step": 14870 + }, + { + "epoch": 0.661421522869716, + "grad_norm": 0.2569524347782135, + "learning_rate": 9.502882424460319e-05, + "loss": 0.2913, + "step": 14880 + }, + { + "epoch": 0.661866026581322, + "grad_norm": 0.21876977384090424, + "learning_rate": 9.501983590594821e-05, + "loss": 0.2905, + "step": 14890 + }, + { + "epoch": 0.662310530292928, + "grad_norm": 0.24605530500411987, + "learning_rate": 9.501083987463455e-05, + "loss": 0.2897, + "step": 14900 + }, + { + "epoch": 0.662755034004534, + "grad_norm": 0.16842903196811676, + "learning_rate": 9.500183615219942e-05, + "loss": 0.2913, + "step": 14910 + }, + { + "epoch": 0.6631995377161399, + "grad_norm": 0.206191286444664, + "learning_rate": 9.49928247401813e-05, + "loss": 0.2903, + "step": 14920 + }, + { + "epoch": 0.6636440414277459, + "grad_norm": 0.20742075145244598, + "learning_rate": 9.498380564011997e-05, + "loss": 0.2885, + "step": 14930 + }, + { + "epoch": 0.6640885451393519, + "grad_norm": 0.2692226469516754, + "learning_rate": 9.497477885355656e-05, + "loss": 0.2888, + "step": 14940 + }, + { + "epoch": 0.6645330488509579, + "grad_norm": 0.23623409867286682, + "learning_rate": 9.496574438203353e-05, + "loss": 0.2912, + "step": 14950 + }, + { + "epoch": 0.6649775525625639, + "grad_norm": 0.22740353643894196, + "learning_rate": 9.495670222709459e-05, + "loss": 0.292, + "step": 14960 + }, + { + "epoch": 0.6654220562741698, + "grad_norm": 0.1862693428993225, + "learning_rate": 9.494765239028483e-05, + "loss": 0.2875, + "step": 14970 + }, + { + "epoch": 0.6658665599857759, + "grad_norm": 0.20652931928634644, + "learning_rate": 9.493859487315057e-05, + "loss": 0.2914, + "step": 14980 + }, + { + "epoch": 0.6663110636973819, + "grad_norm": 0.18576860427856445, + "learning_rate": 9.492952967723953e-05, + "loss": 0.2903, + "step": 14990 + }, + { + "epoch": 0.6667555674089879, + "grad_norm": 0.229792058467865, + "learning_rate": 9.492045680410068e-05, + "loss": 0.2897, + "step": 15000 + }, + { + "epoch": 0.6672000711205939, + "grad_norm": 0.2598484456539154, + "learning_rate": 9.491137625528436e-05, + "loss": 0.2896, + "step": 15010 + }, + { + "epoch": 0.6676445748321999, + "grad_norm": 0.22516033053398132, + "learning_rate": 9.490228803234215e-05, + "loss": 0.2885, + "step": 15020 + }, + { + "epoch": 0.6680890785438058, + "grad_norm": 0.21808360517024994, + "learning_rate": 9.489319213682701e-05, + "loss": 0.2914, + "step": 15030 + }, + { + "epoch": 0.6685335822554118, + "grad_norm": 0.21472325921058655, + "learning_rate": 9.488408857029316e-05, + "loss": 0.2894, + "step": 15040 + }, + { + "epoch": 0.6689780859670178, + "grad_norm": 0.19098135828971863, + "learning_rate": 9.487497733429616e-05, + "loss": 0.292, + "step": 15050 + }, + { + "epoch": 0.6694225896786238, + "grad_norm": 0.23374873399734497, + "learning_rate": 9.486585843039286e-05, + "loss": 0.289, + "step": 15060 + }, + { + "epoch": 0.6698670933902298, + "grad_norm": 0.2401442527770996, + "learning_rate": 9.485673186014143e-05, + "loss": 0.2909, + "step": 15070 + }, + { + "epoch": 0.6703115971018359, + "grad_norm": 0.20441067218780518, + "learning_rate": 9.484759762510137e-05, + "loss": 0.2924, + "step": 15080 + }, + { + "epoch": 0.6707561008134418, + "grad_norm": 0.2354183793067932, + "learning_rate": 9.483845572683346e-05, + "loss": 0.2901, + "step": 15090 + }, + { + "epoch": 0.6712006045250478, + "grad_norm": 0.2582967281341553, + "learning_rate": 9.48293061668998e-05, + "loss": 0.2875, + "step": 15100 + }, + { + "epoch": 0.6716451082366538, + "grad_norm": 0.22870159149169922, + "learning_rate": 9.48201489468638e-05, + "loss": 0.2916, + "step": 15110 + }, + { + "epoch": 0.6720896119482598, + "grad_norm": 0.27931562066078186, + "learning_rate": 9.481098406829016e-05, + "loss": 0.2961, + "step": 15120 + }, + { + "epoch": 0.6725341156598658, + "grad_norm": 0.24444738030433655, + "learning_rate": 9.480181153274495e-05, + "loss": 0.2887, + "step": 15130 + }, + { + "epoch": 0.6729786193714717, + "grad_norm": 0.18969137966632843, + "learning_rate": 9.479263134179548e-05, + "loss": 0.2924, + "step": 15140 + }, + { + "epoch": 0.6734231230830777, + "grad_norm": 0.24172449111938477, + "learning_rate": 9.478344349701039e-05, + "loss": 0.2972, + "step": 15150 + }, + { + "epoch": 0.6738676267946837, + "grad_norm": 0.2141011357307434, + "learning_rate": 9.477424799995964e-05, + "loss": 0.2867, + "step": 15160 + }, + { + "epoch": 0.6743121305062897, + "grad_norm": 0.24616685509681702, + "learning_rate": 9.476504485221448e-05, + "loss": 0.2957, + "step": 15170 + }, + { + "epoch": 0.6747566342178957, + "grad_norm": 0.20443964004516602, + "learning_rate": 9.475583405534748e-05, + "loss": 0.2915, + "step": 15180 + }, + { + "epoch": 0.6752011379295018, + "grad_norm": 0.25528889894485474, + "learning_rate": 9.474661561093251e-05, + "loss": 0.2924, + "step": 15190 + }, + { + "epoch": 0.6756456416411077, + "grad_norm": 0.22207650542259216, + "learning_rate": 9.473738952054478e-05, + "loss": 0.2877, + "step": 15200 + }, + { + "epoch": 0.6760901453527137, + "grad_norm": 0.19762466847896576, + "learning_rate": 9.472815578576073e-05, + "loss": 0.289, + "step": 15210 + }, + { + "epoch": 0.6765346490643197, + "grad_norm": 0.20064474642276764, + "learning_rate": 9.471891440815817e-05, + "loss": 0.2874, + "step": 15220 + }, + { + "epoch": 0.6769791527759257, + "grad_norm": 0.22799699008464813, + "learning_rate": 9.470966538931621e-05, + "loss": 0.2928, + "step": 15230 + }, + { + "epoch": 0.6774236564875317, + "grad_norm": 0.24235612154006958, + "learning_rate": 9.470040873081525e-05, + "loss": 0.2929, + "step": 15240 + }, + { + "epoch": 0.6778681601991376, + "grad_norm": 0.20847643911838531, + "learning_rate": 9.469114443423698e-05, + "loss": 0.2935, + "step": 15250 + }, + { + "epoch": 0.6783126639107436, + "grad_norm": 0.23287686705589294, + "learning_rate": 9.468187250116445e-05, + "loss": 0.292, + "step": 15260 + }, + { + "epoch": 0.6787571676223496, + "grad_norm": 0.269117146730423, + "learning_rate": 9.467259293318197e-05, + "loss": 0.2913, + "step": 15270 + }, + { + "epoch": 0.6792016713339556, + "grad_norm": 0.2021692395210266, + "learning_rate": 9.466330573187514e-05, + "loss": 0.2905, + "step": 15280 + }, + { + "epoch": 0.6796461750455617, + "grad_norm": 0.21194225549697876, + "learning_rate": 9.46540108988309e-05, + "loss": 0.2884, + "step": 15290 + }, + { + "epoch": 0.6800906787571677, + "grad_norm": 0.19145971536636353, + "learning_rate": 9.46447084356375e-05, + "loss": 0.2925, + "step": 15300 + }, + { + "epoch": 0.6805351824687736, + "grad_norm": 0.22055578231811523, + "learning_rate": 9.463539834388447e-05, + "loss": 0.2919, + "step": 15310 + }, + { + "epoch": 0.6809796861803796, + "grad_norm": 0.2477518618106842, + "learning_rate": 9.462608062516263e-05, + "loss": 0.2959, + "step": 15320 + }, + { + "epoch": 0.6814241898919856, + "grad_norm": 0.22680461406707764, + "learning_rate": 9.461675528106413e-05, + "loss": 0.2912, + "step": 15330 + }, + { + "epoch": 0.6818686936035916, + "grad_norm": 0.1872292160987854, + "learning_rate": 9.460742231318244e-05, + "loss": 0.2886, + "step": 15340 + }, + { + "epoch": 0.6823131973151976, + "grad_norm": 0.20792971551418304, + "learning_rate": 9.459808172311229e-05, + "loss": 0.2901, + "step": 15350 + }, + { + "epoch": 0.6827577010268036, + "grad_norm": 0.1928422898054123, + "learning_rate": 9.458873351244972e-05, + "loss": 0.2909, + "step": 15360 + }, + { + "epoch": 0.6832022047384095, + "grad_norm": 0.20499174296855927, + "learning_rate": 9.457937768279211e-05, + "loss": 0.289, + "step": 15370 + }, + { + "epoch": 0.6836467084500155, + "grad_norm": 0.24837933480739594, + "learning_rate": 9.45700142357381e-05, + "loss": 0.2908, + "step": 15380 + }, + { + "epoch": 0.6840912121616215, + "grad_norm": 0.23071816563606262, + "learning_rate": 9.456064317288765e-05, + "loss": 0.2936, + "step": 15390 + }, + { + "epoch": 0.6845357158732276, + "grad_norm": 0.21943485736846924, + "learning_rate": 9.455126449584201e-05, + "loss": 0.293, + "step": 15400 + }, + { + "epoch": 0.6849802195848336, + "grad_norm": 0.21623441576957703, + "learning_rate": 9.454187820620375e-05, + "loss": 0.2891, + "step": 15410 + }, + { + "epoch": 0.6854247232964396, + "grad_norm": 0.23946836590766907, + "learning_rate": 9.453248430557673e-05, + "loss": 0.2898, + "step": 15420 + }, + { + "epoch": 0.6858692270080455, + "grad_norm": 0.20182698965072632, + "learning_rate": 9.452308279556611e-05, + "loss": 0.2895, + "step": 15430 + }, + { + "epoch": 0.6863137307196515, + "grad_norm": 0.20034335553646088, + "learning_rate": 9.451367367777835e-05, + "loss": 0.2883, + "step": 15440 + }, + { + "epoch": 0.6867582344312575, + "grad_norm": 0.25833064317703247, + "learning_rate": 9.450425695382122e-05, + "loss": 0.2929, + "step": 15450 + }, + { + "epoch": 0.6872027381428635, + "grad_norm": 0.20678383111953735, + "learning_rate": 9.449483262530375e-05, + "loss": 0.2917, + "step": 15460 + }, + { + "epoch": 0.6876472418544695, + "grad_norm": 0.2608693838119507, + "learning_rate": 9.448540069383633e-05, + "loss": 0.2904, + "step": 15470 + }, + { + "epoch": 0.6880917455660754, + "grad_norm": 0.19091123342514038, + "learning_rate": 9.447596116103061e-05, + "loss": 0.2892, + "step": 15480 + }, + { + "epoch": 0.6885362492776814, + "grad_norm": 0.1996569186449051, + "learning_rate": 9.446651402849955e-05, + "loss": 0.2908, + "step": 15490 + }, + { + "epoch": 0.6889807529892875, + "grad_norm": 0.23807081580162048, + "learning_rate": 9.44570592978574e-05, + "loss": 0.2889, + "step": 15500 + }, + { + "epoch": 0.6894252567008935, + "grad_norm": 0.23884430527687073, + "learning_rate": 9.444759697071972e-05, + "loss": 0.2907, + "step": 15510 + }, + { + "epoch": 0.6898697604124995, + "grad_norm": 0.22889664769172668, + "learning_rate": 9.443812704870336e-05, + "loss": 0.2917, + "step": 15520 + }, + { + "epoch": 0.6903142641241055, + "grad_norm": 0.2474754899740219, + "learning_rate": 9.442864953342649e-05, + "loss": 0.2949, + "step": 15530 + }, + { + "epoch": 0.6907587678357114, + "grad_norm": 0.2382301688194275, + "learning_rate": 9.441916442650852e-05, + "loss": 0.2951, + "step": 15540 + }, + { + "epoch": 0.6912032715473174, + "grad_norm": 0.25355544686317444, + "learning_rate": 9.440967172957023e-05, + "loss": 0.2901, + "step": 15550 + }, + { + "epoch": 0.6916477752589234, + "grad_norm": 0.18935124576091766, + "learning_rate": 9.440017144423364e-05, + "loss": 0.2917, + "step": 15560 + }, + { + "epoch": 0.6920922789705294, + "grad_norm": 0.209207683801651, + "learning_rate": 9.439066357212209e-05, + "loss": 0.2911, + "step": 15570 + }, + { + "epoch": 0.6925367826821354, + "grad_norm": 0.21590560674667358, + "learning_rate": 9.438114811486022e-05, + "loss": 0.2889, + "step": 15580 + }, + { + "epoch": 0.6929812863937413, + "grad_norm": 0.22163251042366028, + "learning_rate": 9.4371625074074e-05, + "loss": 0.2904, + "step": 15590 + }, + { + "epoch": 0.6934257901053473, + "grad_norm": 0.2624526619911194, + "learning_rate": 9.436209445139059e-05, + "loss": 0.2908, + "step": 15600 + }, + { + "epoch": 0.6938702938169534, + "grad_norm": 0.20899975299835205, + "learning_rate": 9.435255624843855e-05, + "loss": 0.2905, + "step": 15610 + }, + { + "epoch": 0.6943147975285594, + "grad_norm": 0.2292497754096985, + "learning_rate": 9.43430104668477e-05, + "loss": 0.2945, + "step": 15620 + }, + { + "epoch": 0.6947593012401654, + "grad_norm": 0.2596757709980011, + "learning_rate": 9.433345710824914e-05, + "loss": 0.2942, + "step": 15630 + }, + { + "epoch": 0.6952038049517714, + "grad_norm": 0.19990935921669006, + "learning_rate": 9.432389617427529e-05, + "loss": 0.2884, + "step": 15640 + }, + { + "epoch": 0.6956483086633773, + "grad_norm": 0.21539120376110077, + "learning_rate": 9.431432766655984e-05, + "loss": 0.2932, + "step": 15650 + }, + { + "epoch": 0.6960928123749833, + "grad_norm": 0.29919689893722534, + "learning_rate": 9.430475158673778e-05, + "loss": 0.2881, + "step": 15660 + }, + { + "epoch": 0.6965373160865893, + "grad_norm": 0.26138249039649963, + "learning_rate": 9.429516793644542e-05, + "loss": 0.294, + "step": 15670 + }, + { + "epoch": 0.6969818197981953, + "grad_norm": 0.2409445196390152, + "learning_rate": 9.428557671732034e-05, + "loss": 0.292, + "step": 15680 + }, + { + "epoch": 0.6974263235098013, + "grad_norm": 0.22881083190441132, + "learning_rate": 9.42759779310014e-05, + "loss": 0.2921, + "step": 15690 + }, + { + "epoch": 0.6978708272214073, + "grad_norm": 0.22152121365070343, + "learning_rate": 9.426637157912879e-05, + "loss": 0.2872, + "step": 15700 + }, + { + "epoch": 0.6983153309330133, + "grad_norm": 0.21363615989685059, + "learning_rate": 9.425675766334397e-05, + "loss": 0.292, + "step": 15710 + }, + { + "epoch": 0.6987598346446193, + "grad_norm": 0.23690061271190643, + "learning_rate": 9.424713618528968e-05, + "loss": 0.2912, + "step": 15720 + }, + { + "epoch": 0.6992043383562253, + "grad_norm": 0.2041909396648407, + "learning_rate": 9.423750714661e-05, + "loss": 0.2886, + "step": 15730 + }, + { + "epoch": 0.6996488420678313, + "grad_norm": 0.2034180760383606, + "learning_rate": 9.422787054895022e-05, + "loss": 0.2897, + "step": 15740 + }, + { + "epoch": 0.7000933457794373, + "grad_norm": 0.23591306805610657, + "learning_rate": 9.4218226393957e-05, + "loss": 0.2898, + "step": 15750 + }, + { + "epoch": 0.7005378494910433, + "grad_norm": 0.2248162031173706, + "learning_rate": 9.420857468327828e-05, + "loss": 0.2945, + "step": 15760 + }, + { + "epoch": 0.7009823532026492, + "grad_norm": 0.2198588103055954, + "learning_rate": 9.419891541856323e-05, + "loss": 0.2915, + "step": 15770 + }, + { + "epoch": 0.7014268569142552, + "grad_norm": 0.2130742222070694, + "learning_rate": 9.41892486014624e-05, + "loss": 0.289, + "step": 15780 + }, + { + "epoch": 0.7018713606258612, + "grad_norm": 0.2140454351902008, + "learning_rate": 9.417957423362756e-05, + "loss": 0.291, + "step": 15790 + }, + { + "epoch": 0.7023158643374672, + "grad_norm": 0.2830973267555237, + "learning_rate": 9.416989231671178e-05, + "loss": 0.2899, + "step": 15800 + }, + { + "epoch": 0.7027603680490732, + "grad_norm": 0.356505811214447, + "learning_rate": 9.416020285236946e-05, + "loss": 0.2882, + "step": 15810 + }, + { + "epoch": 0.7032048717606793, + "grad_norm": 0.21308265626430511, + "learning_rate": 9.415050584225626e-05, + "loss": 0.2908, + "step": 15820 + }, + { + "epoch": 0.7036493754722852, + "grad_norm": 0.20541507005691528, + "learning_rate": 9.414080128802914e-05, + "loss": 0.289, + "step": 15830 + }, + { + "epoch": 0.7040938791838912, + "grad_norm": 0.19734206795692444, + "learning_rate": 9.413108919134632e-05, + "loss": 0.2895, + "step": 15840 + }, + { + "epoch": 0.7045383828954972, + "grad_norm": 0.2361324429512024, + "learning_rate": 9.412136955386734e-05, + "loss": 0.2911, + "step": 15850 + }, + { + "epoch": 0.7049828866071032, + "grad_norm": 0.19402864575386047, + "learning_rate": 9.411164237725303e-05, + "loss": 0.2915, + "step": 15860 + }, + { + "epoch": 0.7054273903187092, + "grad_norm": 0.1689562052488327, + "learning_rate": 9.41019076631655e-05, + "loss": 0.2886, + "step": 15870 + }, + { + "epoch": 0.7058718940303151, + "grad_norm": 0.2312348335981369, + "learning_rate": 9.409216541326815e-05, + "loss": 0.2897, + "step": 15880 + }, + { + "epoch": 0.7063163977419211, + "grad_norm": 0.23800358176231384, + "learning_rate": 9.408241562922564e-05, + "loss": 0.2885, + "step": 15890 + }, + { + "epoch": 0.7067609014535271, + "grad_norm": 0.22262319922447205, + "learning_rate": 9.407265831270395e-05, + "loss": 0.2919, + "step": 15900 + }, + { + "epoch": 0.7072054051651331, + "grad_norm": 0.18881255388259888, + "learning_rate": 9.406289346537035e-05, + "loss": 0.2904, + "step": 15910 + }, + { + "epoch": 0.7076499088767392, + "grad_norm": 0.24404709041118622, + "learning_rate": 9.405312108889339e-05, + "loss": 0.2908, + "step": 15920 + }, + { + "epoch": 0.7080944125883452, + "grad_norm": 0.19277353584766388, + "learning_rate": 9.404334118494288e-05, + "loss": 0.2934, + "step": 15930 + }, + { + "epoch": 0.7085389162999511, + "grad_norm": 0.20432919263839722, + "learning_rate": 9.403355375518995e-05, + "loss": 0.2921, + "step": 15940 + }, + { + "epoch": 0.7089834200115571, + "grad_norm": 0.19226998090744019, + "learning_rate": 9.4023758801307e-05, + "loss": 0.293, + "step": 15950 + }, + { + "epoch": 0.7094279237231631, + "grad_norm": 0.25011101365089417, + "learning_rate": 9.401395632496774e-05, + "loss": 0.2918, + "step": 15960 + }, + { + "epoch": 0.7098724274347691, + "grad_norm": 0.21745310723781586, + "learning_rate": 9.400414632784711e-05, + "loss": 0.292, + "step": 15970 + }, + { + "epoch": 0.7103169311463751, + "grad_norm": 0.24411346018314362, + "learning_rate": 9.39943288116214e-05, + "loss": 0.289, + "step": 15980 + }, + { + "epoch": 0.710761434857981, + "grad_norm": 0.2265699952840805, + "learning_rate": 9.398450377796815e-05, + "loss": 0.2881, + "step": 15990 + }, + { + "epoch": 0.711205938569587, + "grad_norm": 0.1894959956407547, + "learning_rate": 9.397467122856616e-05, + "loss": 0.2898, + "step": 16000 + }, + { + "epoch": 0.711650442281193, + "grad_norm": 0.1976759135723114, + "learning_rate": 9.396483116509558e-05, + "loss": 0.2902, + "step": 16010 + }, + { + "epoch": 0.712094945992799, + "grad_norm": 0.20652276277542114, + "learning_rate": 9.39549835892378e-05, + "loss": 0.2875, + "step": 16020 + }, + { + "epoch": 0.7125394497044051, + "grad_norm": 0.2876338064670563, + "learning_rate": 9.39451285026755e-05, + "loss": 0.2897, + "step": 16030 + }, + { + "epoch": 0.7129839534160111, + "grad_norm": 0.20354266464710236, + "learning_rate": 9.393526590709262e-05, + "loss": 0.2913, + "step": 16040 + }, + { + "epoch": 0.713428457127617, + "grad_norm": 0.1999918669462204, + "learning_rate": 9.392539580417444e-05, + "loss": 0.2874, + "step": 16050 + }, + { + "epoch": 0.713872960839223, + "grad_norm": 0.20784121751785278, + "learning_rate": 9.391551819560747e-05, + "loss": 0.2878, + "step": 16060 + }, + { + "epoch": 0.714317464550829, + "grad_norm": 0.24795717000961304, + "learning_rate": 9.390563308307955e-05, + "loss": 0.2909, + "step": 16070 + }, + { + "epoch": 0.714761968262435, + "grad_norm": 0.23468036949634552, + "learning_rate": 9.389574046827974e-05, + "loss": 0.2933, + "step": 16080 + }, + { + "epoch": 0.715206471974041, + "grad_norm": 0.20434793829917908, + "learning_rate": 9.388584035289845e-05, + "loss": 0.2905, + "step": 16090 + }, + { + "epoch": 0.715650975685647, + "grad_norm": 0.2185448259115219, + "learning_rate": 9.387593273862732e-05, + "loss": 0.2924, + "step": 16100 + }, + { + "epoch": 0.7160954793972529, + "grad_norm": 0.23272071778774261, + "learning_rate": 9.386601762715929e-05, + "loss": 0.2887, + "step": 16110 + }, + { + "epoch": 0.7165399831088589, + "grad_norm": 0.21122685074806213, + "learning_rate": 9.38560950201886e-05, + "loss": 0.291, + "step": 16120 + }, + { + "epoch": 0.716984486820465, + "grad_norm": 0.18756453692913055, + "learning_rate": 9.384616491941071e-05, + "loss": 0.2884, + "step": 16130 + }, + { + "epoch": 0.717428990532071, + "grad_norm": 0.1821896731853485, + "learning_rate": 9.383622732652245e-05, + "loss": 0.2894, + "step": 16140 + }, + { + "epoch": 0.717873494243677, + "grad_norm": 0.2465072125196457, + "learning_rate": 9.382628224322187e-05, + "loss": 0.2914, + "step": 16150 + }, + { + "epoch": 0.718317997955283, + "grad_norm": 0.23669716715812683, + "learning_rate": 9.381632967120829e-05, + "loss": 0.2939, + "step": 16160 + }, + { + "epoch": 0.7187625016668889, + "grad_norm": 0.23881597816944122, + "learning_rate": 9.380636961218235e-05, + "loss": 0.2869, + "step": 16170 + }, + { + "epoch": 0.7192070053784949, + "grad_norm": 0.2007044553756714, + "learning_rate": 9.379640206784597e-05, + "loss": 0.2888, + "step": 16180 + }, + { + "epoch": 0.7196515090901009, + "grad_norm": 0.21900629997253418, + "learning_rate": 9.378642703990229e-05, + "loss": 0.2885, + "step": 16190 + }, + { + "epoch": 0.7200960128017069, + "grad_norm": 0.21872341632843018, + "learning_rate": 9.37764445300558e-05, + "loss": 0.2914, + "step": 16200 + }, + { + "epoch": 0.7205405165133129, + "grad_norm": 0.23271869122982025, + "learning_rate": 9.376645454001222e-05, + "loss": 0.2909, + "step": 16210 + }, + { + "epoch": 0.7209850202249188, + "grad_norm": 0.2089512050151825, + "learning_rate": 9.375645707147858e-05, + "loss": 0.2907, + "step": 16220 + }, + { + "epoch": 0.7214295239365248, + "grad_norm": 0.2047145962715149, + "learning_rate": 9.374645212616316e-05, + "loss": 0.2914, + "step": 16230 + }, + { + "epoch": 0.7218740276481309, + "grad_norm": 0.2271762639284134, + "learning_rate": 9.373643970577555e-05, + "loss": 0.2892, + "step": 16240 + }, + { + "epoch": 0.7223185313597369, + "grad_norm": 0.22559966146945953, + "learning_rate": 9.372641981202659e-05, + "loss": 0.2903, + "step": 16250 + }, + { + "epoch": 0.7227630350713429, + "grad_norm": 0.19590812921524048, + "learning_rate": 9.37163924466284e-05, + "loss": 0.2912, + "step": 16260 + }, + { + "epoch": 0.7232075387829489, + "grad_norm": 0.2085324078798294, + "learning_rate": 9.370635761129438e-05, + "loss": 0.2895, + "step": 16270 + }, + { + "epoch": 0.7236520424945548, + "grad_norm": 0.22319433093070984, + "learning_rate": 9.36963153077392e-05, + "loss": 0.2907, + "step": 16280 + }, + { + "epoch": 0.7240965462061608, + "grad_norm": 0.22376002371311188, + "learning_rate": 9.368626553767888e-05, + "loss": 0.2911, + "step": 16290 + }, + { + "epoch": 0.7245410499177668, + "grad_norm": 0.24067999422550201, + "learning_rate": 9.367620830283057e-05, + "loss": 0.2897, + "step": 16300 + }, + { + "epoch": 0.7249855536293728, + "grad_norm": 0.22556430101394653, + "learning_rate": 9.366614360491281e-05, + "loss": 0.2919, + "step": 16310 + }, + { + "epoch": 0.7254300573409788, + "grad_norm": 0.20603424310684204, + "learning_rate": 9.365607144564539e-05, + "loss": 0.29, + "step": 16320 + }, + { + "epoch": 0.7258745610525847, + "grad_norm": 0.2185991257429123, + "learning_rate": 9.364599182674934e-05, + "loss": 0.2916, + "step": 16330 + }, + { + "epoch": 0.7263190647641908, + "grad_norm": 0.1974305957555771, + "learning_rate": 9.3635904749947e-05, + "loss": 0.2893, + "step": 16340 + }, + { + "epoch": 0.7267635684757968, + "grad_norm": 0.22803714871406555, + "learning_rate": 9.362581021696202e-05, + "loss": 0.2878, + "step": 16350 + }, + { + "epoch": 0.7272080721874028, + "grad_norm": 0.2523474097251892, + "learning_rate": 9.361570822951921e-05, + "loss": 0.2901, + "step": 16360 + }, + { + "epoch": 0.7276525758990088, + "grad_norm": 0.20490647852420807, + "learning_rate": 9.360559878934476e-05, + "loss": 0.2917, + "step": 16370 + }, + { + "epoch": 0.7280970796106148, + "grad_norm": 0.21194396913051605, + "learning_rate": 9.359548189816611e-05, + "loss": 0.2897, + "step": 16380 + }, + { + "epoch": 0.7285415833222207, + "grad_norm": 0.20971143245697021, + "learning_rate": 9.358535755771193e-05, + "loss": 0.2889, + "step": 16390 + }, + { + "epoch": 0.7289860870338267, + "grad_norm": 0.24629205465316772, + "learning_rate": 9.357522576971221e-05, + "loss": 0.2897, + "step": 16400 + }, + { + "epoch": 0.7294305907454327, + "grad_norm": 0.18430659174919128, + "learning_rate": 9.356508653589819e-05, + "loss": 0.293, + "step": 16410 + }, + { + "epoch": 0.7298750944570387, + "grad_norm": 0.20874996483325958, + "learning_rate": 9.355493985800237e-05, + "loss": 0.2896, + "step": 16420 + }, + { + "epoch": 0.7303195981686447, + "grad_norm": 0.19002752006053925, + "learning_rate": 9.354478573775857e-05, + "loss": 0.2878, + "step": 16430 + }, + { + "epoch": 0.7307641018802506, + "grad_norm": 0.23702043294906616, + "learning_rate": 9.353462417690186e-05, + "loss": 0.2902, + "step": 16440 + }, + { + "epoch": 0.7312086055918567, + "grad_norm": 0.2018144428730011, + "learning_rate": 9.352445517716853e-05, + "loss": 0.2923, + "step": 16450 + }, + { + "epoch": 0.7316531093034627, + "grad_norm": 0.22913408279418945, + "learning_rate": 9.351427874029621e-05, + "loss": 0.2924, + "step": 16460 + }, + { + "epoch": 0.7320976130150687, + "grad_norm": 0.22935131192207336, + "learning_rate": 9.350409486802379e-05, + "loss": 0.2901, + "step": 16470 + }, + { + "epoch": 0.7325421167266747, + "grad_norm": 0.2044687271118164, + "learning_rate": 9.349390356209138e-05, + "loss": 0.2895, + "step": 16480 + }, + { + "epoch": 0.7329866204382807, + "grad_norm": 0.22666142880916595, + "learning_rate": 9.348370482424042e-05, + "loss": 0.2964, + "step": 16490 + }, + { + "epoch": 0.7334311241498866, + "grad_norm": 0.20898889005184174, + "learning_rate": 9.347349865621357e-05, + "loss": 0.2938, + "step": 16500 + }, + { + "epoch": 0.7338756278614926, + "grad_norm": 0.20820383727550507, + "learning_rate": 9.346328505975481e-05, + "loss": 0.2921, + "step": 16510 + }, + { + "epoch": 0.7343201315730986, + "grad_norm": 0.23267610371112823, + "learning_rate": 9.345306403660936e-05, + "loss": 0.2904, + "step": 16520 + }, + { + "epoch": 0.7347646352847046, + "grad_norm": 0.2235429883003235, + "learning_rate": 9.344283558852371e-05, + "loss": 0.2896, + "step": 16530 + }, + { + "epoch": 0.7352091389963106, + "grad_norm": 0.20883344113826752, + "learning_rate": 9.343259971724563e-05, + "loss": 0.2899, + "step": 16540 + }, + { + "epoch": 0.7356536427079167, + "grad_norm": 0.22054952383041382, + "learning_rate": 9.342235642452413e-05, + "loss": 0.2905, + "step": 16550 + }, + { + "epoch": 0.7360981464195226, + "grad_norm": 0.23981346189975739, + "learning_rate": 9.341210571210954e-05, + "loss": 0.2906, + "step": 16560 + }, + { + "epoch": 0.7365426501311286, + "grad_norm": 0.2599547207355499, + "learning_rate": 9.340184758175338e-05, + "loss": 0.2887, + "step": 16570 + }, + { + "epoch": 0.7369871538427346, + "grad_norm": 0.23534707725048065, + "learning_rate": 9.339158203520854e-05, + "loss": 0.2925, + "step": 16580 + }, + { + "epoch": 0.7374316575543406, + "grad_norm": 0.22695417702198029, + "learning_rate": 9.338130907422908e-05, + "loss": 0.2922, + "step": 16590 + }, + { + "epoch": 0.7378761612659466, + "grad_norm": 0.2181772142648697, + "learning_rate": 9.337102870057037e-05, + "loss": 0.2886, + "step": 16600 + }, + { + "epoch": 0.7383206649775526, + "grad_norm": 0.20057891309261322, + "learning_rate": 9.336074091598907e-05, + "loss": 0.2906, + "step": 16610 + }, + { + "epoch": 0.7387651686891585, + "grad_norm": 0.20839272439479828, + "learning_rate": 9.335044572224306e-05, + "loss": 0.2867, + "step": 16620 + }, + { + "epoch": 0.7392096724007645, + "grad_norm": 0.2062559425830841, + "learning_rate": 9.334014312109151e-05, + "loss": 0.288, + "step": 16630 + }, + { + "epoch": 0.7396541761123705, + "grad_norm": 0.22265124320983887, + "learning_rate": 9.332983311429486e-05, + "loss": 0.2896, + "step": 16640 + }, + { + "epoch": 0.7400986798239765, + "grad_norm": 0.2958724796772003, + "learning_rate": 9.33195157036148e-05, + "loss": 0.2874, + "step": 16650 + }, + { + "epoch": 0.7405431835355826, + "grad_norm": 0.2510389983654022, + "learning_rate": 9.330919089081432e-05, + "loss": 0.2907, + "step": 16660 + }, + { + "epoch": 0.7409876872471886, + "grad_norm": 0.22096680104732513, + "learning_rate": 9.32988586776576e-05, + "loss": 0.2882, + "step": 16670 + }, + { + "epoch": 0.7414321909587945, + "grad_norm": 0.24993044137954712, + "learning_rate": 9.328851906591016e-05, + "loss": 0.2898, + "step": 16680 + }, + { + "epoch": 0.7418766946704005, + "grad_norm": 0.24192263185977936, + "learning_rate": 9.327817205733875e-05, + "loss": 0.2887, + "step": 16690 + }, + { + "epoch": 0.7423211983820065, + "grad_norm": 0.253587543964386, + "learning_rate": 9.326781765371142e-05, + "loss": 0.2886, + "step": 16700 + }, + { + "epoch": 0.7427657020936125, + "grad_norm": 0.19269509613513947, + "learning_rate": 9.325745585679741e-05, + "loss": 0.2881, + "step": 16710 + }, + { + "epoch": 0.7432102058052185, + "grad_norm": 0.21472059190273285, + "learning_rate": 9.32470866683673e-05, + "loss": 0.2904, + "step": 16720 + }, + { + "epoch": 0.7436547095168244, + "grad_norm": 0.2190518081188202, + "learning_rate": 9.323671009019288e-05, + "loss": 0.2889, + "step": 16730 + }, + { + "epoch": 0.7440992132284304, + "grad_norm": 0.18764488399028778, + "learning_rate": 9.322632612404725e-05, + "loss": 0.2868, + "step": 16740 + }, + { + "epoch": 0.7445437169400364, + "grad_norm": 0.1935891956090927, + "learning_rate": 9.321593477170471e-05, + "loss": 0.2909, + "step": 16750 + }, + { + "epoch": 0.7449882206516425, + "grad_norm": 0.21722060441970825, + "learning_rate": 9.320553603494088e-05, + "loss": 0.2882, + "step": 16760 + }, + { + "epoch": 0.7454327243632485, + "grad_norm": 0.2133447378873825, + "learning_rate": 9.319512991553261e-05, + "loss": 0.2924, + "step": 16770 + }, + { + "epoch": 0.7458772280748545, + "grad_norm": 0.21505603194236755, + "learning_rate": 9.318471641525803e-05, + "loss": 0.2892, + "step": 16780 + }, + { + "epoch": 0.7463217317864604, + "grad_norm": 0.22835853695869446, + "learning_rate": 9.317429553589652e-05, + "loss": 0.2912, + "step": 16790 + }, + { + "epoch": 0.7467662354980664, + "grad_norm": 0.19398781657218933, + "learning_rate": 9.316386727922873e-05, + "loss": 0.2891, + "step": 16800 + }, + { + "epoch": 0.7472107392096724, + "grad_norm": 0.21862220764160156, + "learning_rate": 9.315343164703656e-05, + "loss": 0.2895, + "step": 16810 + }, + { + "epoch": 0.7476552429212784, + "grad_norm": 0.2078087478876114, + "learning_rate": 9.314298864110316e-05, + "loss": 0.2959, + "step": 16820 + }, + { + "epoch": 0.7480997466328844, + "grad_norm": 0.24129560589790344, + "learning_rate": 9.313253826321295e-05, + "loss": 0.2913, + "step": 16830 + }, + { + "epoch": 0.7485442503444903, + "grad_norm": 0.22562184929847717, + "learning_rate": 9.312208051515165e-05, + "loss": 0.2897, + "step": 16840 + }, + { + "epoch": 0.7489887540560963, + "grad_norm": 0.20717740058898926, + "learning_rate": 9.311161539870618e-05, + "loss": 0.2918, + "step": 16850 + }, + { + "epoch": 0.7494332577677023, + "grad_norm": 0.2178065925836563, + "learning_rate": 9.310114291566474e-05, + "loss": 0.2929, + "step": 16860 + }, + { + "epoch": 0.7498777614793084, + "grad_norm": 0.21930982172489166, + "learning_rate": 9.309066306781679e-05, + "loss": 0.288, + "step": 16870 + }, + { + "epoch": 0.7503222651909144, + "grad_norm": 0.2164609283208847, + "learning_rate": 9.308017585695306e-05, + "loss": 0.2902, + "step": 16880 + }, + { + "epoch": 0.7507667689025204, + "grad_norm": 0.23059344291687012, + "learning_rate": 9.306968128486552e-05, + "loss": 0.2916, + "step": 16890 + }, + { + "epoch": 0.7512112726141263, + "grad_norm": 0.1963878720998764, + "learning_rate": 9.30591793533474e-05, + "loss": 0.2921, + "step": 16900 + }, + { + "epoch": 0.7516557763257323, + "grad_norm": 0.26408663392066956, + "learning_rate": 9.304867006419321e-05, + "loss": 0.2925, + "step": 16910 + }, + { + "epoch": 0.7521002800373383, + "grad_norm": 0.23890924453735352, + "learning_rate": 9.303815341919868e-05, + "loss": 0.2892, + "step": 16920 + }, + { + "epoch": 0.7525447837489443, + "grad_norm": 0.2202780544757843, + "learning_rate": 9.302762942016084e-05, + "loss": 0.2896, + "step": 16930 + }, + { + "epoch": 0.7529892874605503, + "grad_norm": 0.2059416025876999, + "learning_rate": 9.301709806887792e-05, + "loss": 0.2913, + "step": 16940 + }, + { + "epoch": 0.7534337911721563, + "grad_norm": 0.19753491878509521, + "learning_rate": 9.300655936714948e-05, + "loss": 0.2879, + "step": 16950 + }, + { + "epoch": 0.7538782948837622, + "grad_norm": 0.20094305276870728, + "learning_rate": 9.299601331677627e-05, + "loss": 0.291, + "step": 16960 + }, + { + "epoch": 0.7543227985953683, + "grad_norm": 0.22444501519203186, + "learning_rate": 9.298545991956033e-05, + "loss": 0.2906, + "step": 16970 + }, + { + "epoch": 0.7547673023069743, + "grad_norm": 0.25107792019844055, + "learning_rate": 9.297489917730493e-05, + "loss": 0.2901, + "step": 16980 + }, + { + "epoch": 0.7552118060185803, + "grad_norm": 0.2633213698863983, + "learning_rate": 9.296433109181464e-05, + "loss": 0.2905, + "step": 16990 + }, + { + "epoch": 0.7556563097301863, + "grad_norm": 0.263865202665329, + "learning_rate": 9.295375566489523e-05, + "loss": 0.292, + "step": 17000 + }, + { + "epoch": 0.7561008134417923, + "grad_norm": 0.20246116816997528, + "learning_rate": 9.294317289835379e-05, + "loss": 0.2878, + "step": 17010 + }, + { + "epoch": 0.7565453171533982, + "grad_norm": 0.22616371512413025, + "learning_rate": 9.293258279399859e-05, + "loss": 0.2912, + "step": 17020 + }, + { + "epoch": 0.7569898208650042, + "grad_norm": 0.22299261391162872, + "learning_rate": 9.292198535363919e-05, + "loss": 0.2881, + "step": 17030 + }, + { + "epoch": 0.7574343245766102, + "grad_norm": 0.21854163706302643, + "learning_rate": 9.291138057908641e-05, + "loss": 0.2883, + "step": 17040 + }, + { + "epoch": 0.7578788282882162, + "grad_norm": 0.26584187150001526, + "learning_rate": 9.290076847215234e-05, + "loss": 0.2853, + "step": 17050 + }, + { + "epoch": 0.7583233319998222, + "grad_norm": 0.2572014033794403, + "learning_rate": 9.289014903465025e-05, + "loss": 0.2913, + "step": 17060 + }, + { + "epoch": 0.7587678357114281, + "grad_norm": 0.22803877294063568, + "learning_rate": 9.287952226839475e-05, + "loss": 0.2918, + "step": 17070 + }, + { + "epoch": 0.7592123394230342, + "grad_norm": 0.19392791390419006, + "learning_rate": 9.286888817520164e-05, + "loss": 0.2919, + "step": 17080 + }, + { + "epoch": 0.7596568431346402, + "grad_norm": 0.21936994791030884, + "learning_rate": 9.285824675688803e-05, + "loss": 0.2891, + "step": 17090 + }, + { + "epoch": 0.7601013468462462, + "grad_norm": 0.20476983487606049, + "learning_rate": 9.28475980152722e-05, + "loss": 0.288, + "step": 17100 + }, + { + "epoch": 0.7605458505578522, + "grad_norm": 0.2500366270542145, + "learning_rate": 9.283694195217379e-05, + "loss": 0.2938, + "step": 17110 + }, + { + "epoch": 0.7609903542694582, + "grad_norm": 0.23167046904563904, + "learning_rate": 9.282627856941356e-05, + "loss": 0.2906, + "step": 17120 + }, + { + "epoch": 0.7614348579810641, + "grad_norm": 0.25171685218811035, + "learning_rate": 9.281560786881363e-05, + "loss": 0.2904, + "step": 17130 + }, + { + "epoch": 0.7618793616926701, + "grad_norm": 0.2657032310962677, + "learning_rate": 9.280492985219733e-05, + "loss": 0.2908, + "step": 17140 + }, + { + "epoch": 0.7623238654042761, + "grad_norm": 0.23177282512187958, + "learning_rate": 9.279424452138924e-05, + "loss": 0.2887, + "step": 17150 + }, + { + "epoch": 0.7627683691158821, + "grad_norm": 0.22732079029083252, + "learning_rate": 9.278355187821517e-05, + "loss": 0.2912, + "step": 17160 + }, + { + "epoch": 0.7632128728274881, + "grad_norm": 0.1977059245109558, + "learning_rate": 9.277285192450224e-05, + "loss": 0.2905, + "step": 17170 + }, + { + "epoch": 0.763657376539094, + "grad_norm": 0.1994643360376358, + "learning_rate": 9.276214466207875e-05, + "loss": 0.2907, + "step": 17180 + }, + { + "epoch": 0.7641018802507001, + "grad_norm": 0.20542633533477783, + "learning_rate": 9.275143009277427e-05, + "loss": 0.2905, + "step": 17190 + }, + { + "epoch": 0.7645463839623061, + "grad_norm": 0.23943546414375305, + "learning_rate": 9.274070821841964e-05, + "loss": 0.2884, + "step": 17200 + }, + { + "epoch": 0.7649908876739121, + "grad_norm": 0.18794040381908417, + "learning_rate": 9.272997904084696e-05, + "loss": 0.2889, + "step": 17210 + }, + { + "epoch": 0.7654353913855181, + "grad_norm": 0.21500150859355927, + "learning_rate": 9.271924256188951e-05, + "loss": 0.2882, + "step": 17220 + }, + { + "epoch": 0.7658798950971241, + "grad_norm": 0.2198152393102646, + "learning_rate": 9.270849878338189e-05, + "loss": 0.2908, + "step": 17230 + }, + { + "epoch": 0.76632439880873, + "grad_norm": 0.23472951352596283, + "learning_rate": 9.269774770715991e-05, + "loss": 0.2901, + "step": 17240 + }, + { + "epoch": 0.766768902520336, + "grad_norm": 0.19448411464691162, + "learning_rate": 9.268698933506061e-05, + "loss": 0.2906, + "step": 17250 + }, + { + "epoch": 0.767213406231942, + "grad_norm": 0.23522299528121948, + "learning_rate": 9.267622366892235e-05, + "loss": 0.2909, + "step": 17260 + }, + { + "epoch": 0.767657909943548, + "grad_norm": 0.21656745672225952, + "learning_rate": 9.266545071058465e-05, + "loss": 0.2877, + "step": 17270 + }, + { + "epoch": 0.768102413655154, + "grad_norm": 0.2504240572452545, + "learning_rate": 9.265467046188833e-05, + "loss": 0.2935, + "step": 17280 + }, + { + "epoch": 0.7685469173667601, + "grad_norm": 0.2673303186893463, + "learning_rate": 9.264388292467543e-05, + "loss": 0.2915, + "step": 17290 + }, + { + "epoch": 0.768991421078366, + "grad_norm": 0.21936066448688507, + "learning_rate": 9.263308810078926e-05, + "loss": 0.2909, + "step": 17300 + }, + { + "epoch": 0.769435924789972, + "grad_norm": 0.2306661456823349, + "learning_rate": 9.262228599207434e-05, + "loss": 0.2922, + "step": 17310 + }, + { + "epoch": 0.769880428501578, + "grad_norm": 0.2380002737045288, + "learning_rate": 9.261147660037647e-05, + "loss": 0.2912, + "step": 17320 + }, + { + "epoch": 0.770324932213184, + "grad_norm": 0.2283831089735031, + "learning_rate": 9.26006599275427e-05, + "loss": 0.2892, + "step": 17330 + }, + { + "epoch": 0.77076943592479, + "grad_norm": 0.17667889595031738, + "learning_rate": 9.258983597542124e-05, + "loss": 0.2925, + "step": 17340 + }, + { + "epoch": 0.771213939636396, + "grad_norm": 0.18991316854953766, + "learning_rate": 9.257900474586167e-05, + "loss": 0.2882, + "step": 17350 + }, + { + "epoch": 0.7716584433480019, + "grad_norm": 0.23380625247955322, + "learning_rate": 9.256816624071471e-05, + "loss": 0.2894, + "step": 17360 + }, + { + "epoch": 0.7721029470596079, + "grad_norm": 0.2461053431034088, + "learning_rate": 9.25573204618324e-05, + "loss": 0.2915, + "step": 17370 + }, + { + "epoch": 0.7725474507712139, + "grad_norm": 0.22893846035003662, + "learning_rate": 9.254646741106796e-05, + "loss": 0.2882, + "step": 17380 + }, + { + "epoch": 0.7729919544828199, + "grad_norm": 0.1973293274641037, + "learning_rate": 9.253560709027589e-05, + "loss": 0.2902, + "step": 17390 + }, + { + "epoch": 0.773436458194426, + "grad_norm": 0.18660560250282288, + "learning_rate": 9.252473950131192e-05, + "loss": 0.2865, + "step": 17400 + }, + { + "epoch": 0.773880961906032, + "grad_norm": 0.22649244964122772, + "learning_rate": 9.251386464603302e-05, + "loss": 0.2911, + "step": 17410 + }, + { + "epoch": 0.7743254656176379, + "grad_norm": 0.2613992989063263, + "learning_rate": 9.250298252629741e-05, + "loss": 0.2901, + "step": 17420 + }, + { + "epoch": 0.7747699693292439, + "grad_norm": 0.2163373827934265, + "learning_rate": 9.249209314396454e-05, + "loss": 0.289, + "step": 17430 + }, + { + "epoch": 0.7752144730408499, + "grad_norm": 0.22795487940311432, + "learning_rate": 9.248119650089513e-05, + "loss": 0.2885, + "step": 17440 + }, + { + "epoch": 0.7756589767524559, + "grad_norm": 0.23152180016040802, + "learning_rate": 9.247029259895108e-05, + "loss": 0.2909, + "step": 17450 + }, + { + "epoch": 0.7761034804640619, + "grad_norm": 0.23507897555828094, + "learning_rate": 9.24593814399956e-05, + "loss": 0.2859, + "step": 17460 + }, + { + "epoch": 0.7765479841756678, + "grad_norm": 0.21061542630195618, + "learning_rate": 9.244846302589309e-05, + "loss": 0.2879, + "step": 17470 + }, + { + "epoch": 0.7769924878872738, + "grad_norm": 0.20696163177490234, + "learning_rate": 9.243753735850923e-05, + "loss": 0.2898, + "step": 17480 + }, + { + "epoch": 0.7774369915988798, + "grad_norm": 0.20148029923439026, + "learning_rate": 9.24266044397109e-05, + "loss": 0.2891, + "step": 17490 + }, + { + "epoch": 0.7778814953104859, + "grad_norm": 0.18055729568004608, + "learning_rate": 9.241566427136624e-05, + "loss": 0.2894, + "step": 17500 + }, + { + "epoch": 0.7783259990220919, + "grad_norm": 0.1979028731584549, + "learning_rate": 9.240471685534463e-05, + "loss": 0.2847, + "step": 17510 + }, + { + "epoch": 0.7787705027336979, + "grad_norm": 0.20576724410057068, + "learning_rate": 9.239376219351667e-05, + "loss": 0.289, + "step": 17520 + }, + { + "epoch": 0.7792150064453038, + "grad_norm": 0.22438207268714905, + "learning_rate": 9.238280028775425e-05, + "loss": 0.2937, + "step": 17530 + }, + { + "epoch": 0.7796595101569098, + "grad_norm": 0.1909661740064621, + "learning_rate": 9.237183113993041e-05, + "loss": 0.2873, + "step": 17540 + }, + { + "epoch": 0.7801040138685158, + "grad_norm": 0.2425980567932129, + "learning_rate": 9.236085475191952e-05, + "loss": 0.291, + "step": 17550 + }, + { + "epoch": 0.7805485175801218, + "grad_norm": 0.2301710546016693, + "learning_rate": 9.234987112559709e-05, + "loss": 0.2911, + "step": 17560 + }, + { + "epoch": 0.7809930212917278, + "grad_norm": 0.19222646951675415, + "learning_rate": 9.233888026283999e-05, + "loss": 0.2915, + "step": 17570 + }, + { + "epoch": 0.7814375250033337, + "grad_norm": 0.19547642767429352, + "learning_rate": 9.232788216552619e-05, + "loss": 0.2915, + "step": 17580 + }, + { + "epoch": 0.7818820287149397, + "grad_norm": 0.19510358572006226, + "learning_rate": 9.231687683553502e-05, + "loss": 0.29, + "step": 17590 + }, + { + "epoch": 0.7823265324265457, + "grad_norm": 0.2347671240568161, + "learning_rate": 9.230586427474698e-05, + "loss": 0.2855, + "step": 17600 + }, + { + "epoch": 0.7827710361381518, + "grad_norm": 0.19546934962272644, + "learning_rate": 9.229484448504379e-05, + "loss": 0.2887, + "step": 17610 + }, + { + "epoch": 0.7832155398497578, + "grad_norm": 0.2455178201198578, + "learning_rate": 9.228381746830843e-05, + "loss": 0.2865, + "step": 17620 + }, + { + "epoch": 0.7836600435613638, + "grad_norm": 0.20810623466968536, + "learning_rate": 9.227278322642514e-05, + "loss": 0.2909, + "step": 17630 + }, + { + "epoch": 0.7841045472729697, + "grad_norm": 0.20320165157318115, + "learning_rate": 9.226174176127937e-05, + "loss": 0.2857, + "step": 17640 + }, + { + "epoch": 0.7845490509845757, + "grad_norm": 0.20392152667045593, + "learning_rate": 9.22506930747578e-05, + "loss": 0.2873, + "step": 17650 + }, + { + "epoch": 0.7849935546961817, + "grad_norm": 0.23823688924312592, + "learning_rate": 9.223963716874831e-05, + "loss": 0.2894, + "step": 17660 + }, + { + "epoch": 0.7854380584077877, + "grad_norm": 0.20455223321914673, + "learning_rate": 9.222857404514012e-05, + "loss": 0.2903, + "step": 17670 + }, + { + "epoch": 0.7858825621193937, + "grad_norm": 0.18850520253181458, + "learning_rate": 9.221750370582355e-05, + "loss": 0.2874, + "step": 17680 + }, + { + "epoch": 0.7863270658309996, + "grad_norm": 0.258590966463089, + "learning_rate": 9.220642615269028e-05, + "loss": 0.2873, + "step": 17690 + }, + { + "epoch": 0.7867715695426056, + "grad_norm": 0.22908785939216614, + "learning_rate": 9.219534138763311e-05, + "loss": 0.2898, + "step": 17700 + }, + { + "epoch": 0.7872160732542117, + "grad_norm": 0.22067002952098846, + "learning_rate": 9.218424941254613e-05, + "loss": 0.2883, + "step": 17710 + }, + { + "epoch": 0.7876605769658177, + "grad_norm": 0.2155945599079132, + "learning_rate": 9.217315022932468e-05, + "loss": 0.2924, + "step": 17720 + }, + { + "epoch": 0.7881050806774237, + "grad_norm": 0.23398137092590332, + "learning_rate": 9.216204383986528e-05, + "loss": 0.2894, + "step": 17730 + }, + { + "epoch": 0.7885495843890297, + "grad_norm": 0.1946781426668167, + "learning_rate": 9.215093024606574e-05, + "loss": 0.2888, + "step": 17740 + }, + { + "epoch": 0.7889940881006356, + "grad_norm": 0.21989277005195618, + "learning_rate": 9.213980944982506e-05, + "loss": 0.2871, + "step": 17750 + }, + { + "epoch": 0.7894385918122416, + "grad_norm": 0.19894817471504211, + "learning_rate": 9.212868145304346e-05, + "loss": 0.287, + "step": 17760 + }, + { + "epoch": 0.7898830955238476, + "grad_norm": 0.18465135991573334, + "learning_rate": 9.211754625762241e-05, + "loss": 0.2905, + "step": 17770 + }, + { + "epoch": 0.7903275992354536, + "grad_norm": 0.18721796572208405, + "learning_rate": 9.210640386546463e-05, + "loss": 0.2892, + "step": 17780 + }, + { + "epoch": 0.7907721029470596, + "grad_norm": 0.20291151106357574, + "learning_rate": 9.209525427847405e-05, + "loss": 0.2894, + "step": 17790 + }, + { + "epoch": 0.7912166066586656, + "grad_norm": 0.22519926726818085, + "learning_rate": 9.208409749855583e-05, + "loss": 0.2915, + "step": 17800 + }, + { + "epoch": 0.7916611103702715, + "grad_norm": 0.19438864290714264, + "learning_rate": 9.207293352761633e-05, + "loss": 0.2865, + "step": 17810 + }, + { + "epoch": 0.7921056140818776, + "grad_norm": 0.19716954231262207, + "learning_rate": 9.206176236756319e-05, + "loss": 0.2872, + "step": 17820 + }, + { + "epoch": 0.7925501177934836, + "grad_norm": 0.19870737195014954, + "learning_rate": 9.205058402030525e-05, + "loss": 0.2884, + "step": 17830 + }, + { + "epoch": 0.7929946215050896, + "grad_norm": 0.25780704617500305, + "learning_rate": 9.203939848775259e-05, + "loss": 0.2924, + "step": 17840 + }, + { + "epoch": 0.7934391252166956, + "grad_norm": 0.2054378241300583, + "learning_rate": 9.202820577181652e-05, + "loss": 0.295, + "step": 17850 + }, + { + "epoch": 0.7938836289283016, + "grad_norm": 0.18711857497692108, + "learning_rate": 9.201700587440953e-05, + "loss": 0.2917, + "step": 17860 + }, + { + "epoch": 0.7943281326399075, + "grad_norm": 0.1920965611934662, + "learning_rate": 9.200579879744544e-05, + "loss": 0.2891, + "step": 17870 + }, + { + "epoch": 0.7947726363515135, + "grad_norm": 0.21714958548545837, + "learning_rate": 9.199458454283918e-05, + "loss": 0.2889, + "step": 17880 + }, + { + "epoch": 0.7952171400631195, + "grad_norm": 0.20590005815029144, + "learning_rate": 9.198336311250697e-05, + "loss": 0.2859, + "step": 17890 + }, + { + "epoch": 0.7956616437747255, + "grad_norm": 0.19712309539318085, + "learning_rate": 9.197213450836626e-05, + "loss": 0.2896, + "step": 17900 + }, + { + "epoch": 0.7961061474863315, + "grad_norm": 0.20911626517772675, + "learning_rate": 9.19608987323357e-05, + "loss": 0.2882, + "step": 17910 + }, + { + "epoch": 0.7965506511979376, + "grad_norm": 0.20392149686813354, + "learning_rate": 9.194965578633517e-05, + "loss": 0.2877, + "step": 17920 + }, + { + "epoch": 0.7969951549095435, + "grad_norm": 0.2401251196861267, + "learning_rate": 9.193840567228582e-05, + "loss": 0.2914, + "step": 17930 + }, + { + "epoch": 0.7974396586211495, + "grad_norm": 0.25378668308258057, + "learning_rate": 9.192714839210994e-05, + "loss": 0.2899, + "step": 17940 + }, + { + "epoch": 0.7978841623327555, + "grad_norm": 0.20151598751544952, + "learning_rate": 9.19158839477311e-05, + "loss": 0.289, + "step": 17950 + }, + { + "epoch": 0.7983286660443615, + "grad_norm": 0.23238864541053772, + "learning_rate": 9.190461234107411e-05, + "loss": 0.29, + "step": 17960 + }, + { + "epoch": 0.7987731697559675, + "grad_norm": 0.19964738190174103, + "learning_rate": 9.189333357406496e-05, + "loss": 0.29, + "step": 17970 + }, + { + "epoch": 0.7992176734675734, + "grad_norm": 0.2463115006685257, + "learning_rate": 9.188204764863089e-05, + "loss": 0.2904, + "step": 17980 + }, + { + "epoch": 0.7996621771791794, + "grad_norm": 0.2036748230457306, + "learning_rate": 9.187075456670033e-05, + "loss": 0.2869, + "step": 17990 + }, + { + "epoch": 0.8001066808907854, + "grad_norm": 0.21991455554962158, + "learning_rate": 9.1859454330203e-05, + "loss": 0.2871, + "step": 18000 + }, + { + "epoch": 0.8005511846023914, + "grad_norm": 0.16370610892772675, + "learning_rate": 9.18481469410698e-05, + "loss": 0.2887, + "step": 18010 + }, + { + "epoch": 0.8009956883139974, + "grad_norm": 0.18809974193572998, + "learning_rate": 9.183683240123281e-05, + "loss": 0.2919, + "step": 18020 + }, + { + "epoch": 0.8014401920256035, + "grad_norm": 0.21554593741893768, + "learning_rate": 9.182551071262541e-05, + "loss": 0.2885, + "step": 18030 + }, + { + "epoch": 0.8018846957372094, + "grad_norm": 0.2079133689403534, + "learning_rate": 9.181418187718218e-05, + "loss": 0.2919, + "step": 18040 + }, + { + "epoch": 0.8023291994488154, + "grad_norm": 0.1999206840991974, + "learning_rate": 9.180284589683888e-05, + "loss": 0.2864, + "step": 18050 + }, + { + "epoch": 0.8027737031604214, + "grad_norm": 0.2058839052915573, + "learning_rate": 9.17915027735325e-05, + "loss": 0.2887, + "step": 18060 + }, + { + "epoch": 0.8032182068720274, + "grad_norm": 0.2480715811252594, + "learning_rate": 9.178015250920133e-05, + "loss": 0.291, + "step": 18070 + }, + { + "epoch": 0.8036627105836334, + "grad_norm": 0.20384395122528076, + "learning_rate": 9.176879510578477e-05, + "loss": 0.2871, + "step": 18080 + }, + { + "epoch": 0.8041072142952393, + "grad_norm": 0.22268345952033997, + "learning_rate": 9.17574305652235e-05, + "loss": 0.2874, + "step": 18090 + }, + { + "epoch": 0.8045517180068453, + "grad_norm": 0.1801220178604126, + "learning_rate": 9.174605888945942e-05, + "loss": 0.292, + "step": 18100 + }, + { + "epoch": 0.8049962217184513, + "grad_norm": 0.2048281729221344, + "learning_rate": 9.173468008043564e-05, + "loss": 0.29, + "step": 18110 + }, + { + "epoch": 0.8054407254300573, + "grad_norm": 0.16342446208000183, + "learning_rate": 9.172329414009648e-05, + "loss": 0.2877, + "step": 18120 + }, + { + "epoch": 0.8058852291416634, + "grad_norm": 0.17818965017795563, + "learning_rate": 9.171190107038747e-05, + "loss": 0.2894, + "step": 18130 + }, + { + "epoch": 0.8063297328532694, + "grad_norm": 0.2524281442165375, + "learning_rate": 9.170050087325541e-05, + "loss": 0.2893, + "step": 18140 + }, + { + "epoch": 0.8067742365648753, + "grad_norm": 0.2202974259853363, + "learning_rate": 9.168909355064824e-05, + "loss": 0.2885, + "step": 18150 + }, + { + "epoch": 0.8072187402764813, + "grad_norm": 0.25603848695755005, + "learning_rate": 9.167767910451519e-05, + "loss": 0.2877, + "step": 18160 + }, + { + "epoch": 0.8076632439880873, + "grad_norm": 0.27308204770088196, + "learning_rate": 9.166625753680669e-05, + "loss": 0.2865, + "step": 18170 + }, + { + "epoch": 0.8081077476996933, + "grad_norm": 0.2159678190946579, + "learning_rate": 9.165482884947431e-05, + "loss": 0.2952, + "step": 18180 + }, + { + "epoch": 0.8085522514112993, + "grad_norm": 0.2174590826034546, + "learning_rate": 9.164339304447098e-05, + "loss": 0.2921, + "step": 18190 + }, + { + "epoch": 0.8089967551229053, + "grad_norm": 0.23047032952308655, + "learning_rate": 9.163195012375072e-05, + "loss": 0.2901, + "step": 18200 + }, + { + "epoch": 0.8094412588345112, + "grad_norm": 0.22709664702415466, + "learning_rate": 9.16205000892688e-05, + "loss": 0.2901, + "step": 18210 + }, + { + "epoch": 0.8098857625461172, + "grad_norm": 0.25190794467926025, + "learning_rate": 9.160904294298175e-05, + "loss": 0.2887, + "step": 18220 + }, + { + "epoch": 0.8103302662577232, + "grad_norm": 0.25007376074790955, + "learning_rate": 9.159757868684727e-05, + "loss": 0.2922, + "step": 18230 + }, + { + "epoch": 0.8107747699693293, + "grad_norm": 0.22614233195781708, + "learning_rate": 9.15861073228243e-05, + "loss": 0.2904, + "step": 18240 + }, + { + "epoch": 0.8112192736809353, + "grad_norm": 0.2195560336112976, + "learning_rate": 9.157462885287296e-05, + "loss": 0.2897, + "step": 18250 + }, + { + "epoch": 0.8116637773925413, + "grad_norm": 0.23962444067001343, + "learning_rate": 9.156314327895461e-05, + "loss": 0.2893, + "step": 18260 + }, + { + "epoch": 0.8121082811041472, + "grad_norm": 0.23270273208618164, + "learning_rate": 9.155165060303185e-05, + "loss": 0.2892, + "step": 18270 + }, + { + "epoch": 0.8125527848157532, + "grad_norm": 0.21510063111782074, + "learning_rate": 9.154015082706841e-05, + "loss": 0.2883, + "step": 18280 + }, + { + "epoch": 0.8129972885273592, + "grad_norm": 0.20564672350883484, + "learning_rate": 9.152864395302936e-05, + "loss": 0.289, + "step": 18290 + }, + { + "epoch": 0.8134417922389652, + "grad_norm": 0.23929302394390106, + "learning_rate": 9.151712998288085e-05, + "loss": 0.289, + "step": 18300 + }, + { + "epoch": 0.8138862959505712, + "grad_norm": 0.21438227593898773, + "learning_rate": 9.150560891859031e-05, + "loss": 0.2891, + "step": 18310 + }, + { + "epoch": 0.8143307996621771, + "grad_norm": 0.2142563909292221, + "learning_rate": 9.14940807621264e-05, + "loss": 0.2886, + "step": 18320 + }, + { + "epoch": 0.8147753033737831, + "grad_norm": 0.21702875196933746, + "learning_rate": 9.148254551545894e-05, + "loss": 0.2905, + "step": 18330 + }, + { + "epoch": 0.8152198070853892, + "grad_norm": 0.23118703067302704, + "learning_rate": 9.147100318055901e-05, + "loss": 0.2891, + "step": 18340 + }, + { + "epoch": 0.8156643107969952, + "grad_norm": 0.2034335434436798, + "learning_rate": 9.145945375939888e-05, + "loss": 0.2899, + "step": 18350 + }, + { + "epoch": 0.8161088145086012, + "grad_norm": 0.24472199380397797, + "learning_rate": 9.144789725395203e-05, + "loss": 0.2924, + "step": 18360 + }, + { + "epoch": 0.8165533182202072, + "grad_norm": 0.2250252366065979, + "learning_rate": 9.14363336661931e-05, + "loss": 0.2884, + "step": 18370 + }, + { + "epoch": 0.8169978219318131, + "grad_norm": 0.23003078997135162, + "learning_rate": 9.142476299809806e-05, + "loss": 0.2886, + "step": 18380 + }, + { + "epoch": 0.8174423256434191, + "grad_norm": 0.2101273536682129, + "learning_rate": 9.1413185251644e-05, + "loss": 0.2851, + "step": 18390 + }, + { + "epoch": 0.8178868293550251, + "grad_norm": 0.2694001793861389, + "learning_rate": 9.140160042880923e-05, + "loss": 0.2879, + "step": 18400 + }, + { + "epoch": 0.8183313330666311, + "grad_norm": 0.24313752353191376, + "learning_rate": 9.139000853157327e-05, + "loss": 0.2908, + "step": 18410 + }, + { + "epoch": 0.8187758367782371, + "grad_norm": 0.24285052716732025, + "learning_rate": 9.137840956191688e-05, + "loss": 0.2877, + "step": 18420 + }, + { + "epoch": 0.819220340489843, + "grad_norm": 0.20998723804950714, + "learning_rate": 9.136680352182199e-05, + "loss": 0.2888, + "step": 18430 + }, + { + "epoch": 0.819664844201449, + "grad_norm": 0.20978602766990662, + "learning_rate": 9.135519041327177e-05, + "loss": 0.2873, + "step": 18440 + }, + { + "epoch": 0.8201093479130551, + "grad_norm": 0.1849726438522339, + "learning_rate": 9.134357023825058e-05, + "loss": 0.2863, + "step": 18450 + }, + { + "epoch": 0.8205538516246611, + "grad_norm": 0.2401159107685089, + "learning_rate": 9.133194299874398e-05, + "loss": 0.291, + "step": 18460 + }, + { + "epoch": 0.8209983553362671, + "grad_norm": 0.23615358769893646, + "learning_rate": 9.132030869673876e-05, + "loss": 0.2856, + "step": 18470 + }, + { + "epoch": 0.8214428590478731, + "grad_norm": 0.22787421941757202, + "learning_rate": 9.130866733422288e-05, + "loss": 0.2864, + "step": 18480 + }, + { + "epoch": 0.821887362759479, + "grad_norm": 0.19128349423408508, + "learning_rate": 9.129701891318556e-05, + "loss": 0.2857, + "step": 18490 + }, + { + "epoch": 0.822331866471085, + "grad_norm": 0.2211848348379135, + "learning_rate": 9.128536343561718e-05, + "loss": 0.2876, + "step": 18500 + }, + { + "epoch": 0.822776370182691, + "grad_norm": 0.21617449820041656, + "learning_rate": 9.127370090350934e-05, + "loss": 0.2887, + "step": 18510 + }, + { + "epoch": 0.823220873894297, + "grad_norm": 0.215943843126297, + "learning_rate": 9.126203131885487e-05, + "loss": 0.2878, + "step": 18520 + }, + { + "epoch": 0.823665377605903, + "grad_norm": 0.2308463156223297, + "learning_rate": 9.125035468364775e-05, + "loss": 0.2872, + "step": 18530 + }, + { + "epoch": 0.824109881317509, + "grad_norm": 0.22894418239593506, + "learning_rate": 9.123867099988322e-05, + "loss": 0.2899, + "step": 18540 + }, + { + "epoch": 0.824554385029115, + "grad_norm": 0.251848429441452, + "learning_rate": 9.122698026955769e-05, + "loss": 0.287, + "step": 18550 + }, + { + "epoch": 0.824998888740721, + "grad_norm": 0.19755275547504425, + "learning_rate": 9.12152824946688e-05, + "loss": 0.2895, + "step": 18560 + }, + { + "epoch": 0.825443392452327, + "grad_norm": 0.19325914978981018, + "learning_rate": 9.120357767721538e-05, + "loss": 0.2875, + "step": 18570 + }, + { + "epoch": 0.825887896163933, + "grad_norm": 0.20446135103702545, + "learning_rate": 9.119186581919745e-05, + "loss": 0.2878, + "step": 18580 + }, + { + "epoch": 0.826332399875539, + "grad_norm": 0.2706654369831085, + "learning_rate": 9.118014692261624e-05, + "loss": 0.2924, + "step": 18590 + }, + { + "epoch": 0.826776903587145, + "grad_norm": 0.2281988263130188, + "learning_rate": 9.116842098947422e-05, + "loss": 0.2883, + "step": 18600 + }, + { + "epoch": 0.8272214072987509, + "grad_norm": 0.21616512537002563, + "learning_rate": 9.115668802177499e-05, + "loss": 0.2864, + "step": 18610 + }, + { + "epoch": 0.8276659110103569, + "grad_norm": 0.18872126936912537, + "learning_rate": 9.114494802152342e-05, + "loss": 0.2881, + "step": 18620 + }, + { + "epoch": 0.8281104147219629, + "grad_norm": 0.17766550183296204, + "learning_rate": 9.113320099072555e-05, + "loss": 0.2869, + "step": 18630 + }, + { + "epoch": 0.8285549184335689, + "grad_norm": 0.2011968046426773, + "learning_rate": 9.112144693138864e-05, + "loss": 0.2862, + "step": 18640 + }, + { + "epoch": 0.8289994221451749, + "grad_norm": 0.2246347814798355, + "learning_rate": 9.110968584552111e-05, + "loss": 0.2885, + "step": 18650 + }, + { + "epoch": 0.829443925856781, + "grad_norm": 0.19551733136177063, + "learning_rate": 9.109791773513264e-05, + "loss": 0.2892, + "step": 18660 + }, + { + "epoch": 0.8298884295683869, + "grad_norm": 0.2530602514743805, + "learning_rate": 9.108614260223403e-05, + "loss": 0.2935, + "step": 18670 + }, + { + "epoch": 0.8303329332799929, + "grad_norm": 0.2172243446111679, + "learning_rate": 9.107436044883738e-05, + "loss": 0.2902, + "step": 18680 + }, + { + "epoch": 0.8307774369915989, + "grad_norm": 0.24085254967212677, + "learning_rate": 9.10625712769559e-05, + "loss": 0.2916, + "step": 18690 + }, + { + "epoch": 0.8312219407032049, + "grad_norm": 0.23989354074001312, + "learning_rate": 9.105077508860406e-05, + "loss": 0.2886, + "step": 18700 + }, + { + "epoch": 0.8316664444148109, + "grad_norm": 0.22510085999965668, + "learning_rate": 9.103897188579751e-05, + "loss": 0.2875, + "step": 18710 + }, + { + "epoch": 0.8321109481264168, + "grad_norm": 0.23277044296264648, + "learning_rate": 9.102716167055308e-05, + "loss": 0.2893, + "step": 18720 + }, + { + "epoch": 0.8325554518380228, + "grad_norm": 0.22949925065040588, + "learning_rate": 9.10153444448888e-05, + "loss": 0.2881, + "step": 18730 + }, + { + "epoch": 0.8329999555496288, + "grad_norm": 0.20159633457660675, + "learning_rate": 9.100352021082393e-05, + "loss": 0.2876, + "step": 18740 + }, + { + "epoch": 0.8334444592612348, + "grad_norm": 0.20955421030521393, + "learning_rate": 9.099168897037891e-05, + "loss": 0.2861, + "step": 18750 + }, + { + "epoch": 0.8338889629728409, + "grad_norm": 0.21373696625232697, + "learning_rate": 9.097985072557538e-05, + "loss": 0.2894, + "step": 18760 + }, + { + "epoch": 0.8343334666844469, + "grad_norm": 0.23319487273693085, + "learning_rate": 9.096800547843615e-05, + "loss": 0.286, + "step": 18770 + }, + { + "epoch": 0.8347779703960528, + "grad_norm": 0.20574775338172913, + "learning_rate": 9.095615323098526e-05, + "loss": 0.2905, + "step": 18780 + }, + { + "epoch": 0.8352224741076588, + "grad_norm": 0.20683489739894867, + "learning_rate": 9.094429398524795e-05, + "loss": 0.2867, + "step": 18790 + }, + { + "epoch": 0.8356669778192648, + "grad_norm": 0.23305939137935638, + "learning_rate": 9.093242774325061e-05, + "loss": 0.2877, + "step": 18800 + }, + { + "epoch": 0.8361114815308708, + "grad_norm": 0.1774464249610901, + "learning_rate": 9.092055450702088e-05, + "loss": 0.2836, + "step": 18810 + }, + { + "epoch": 0.8365559852424768, + "grad_norm": 0.25114738941192627, + "learning_rate": 9.090867427858756e-05, + "loss": 0.289, + "step": 18820 + }, + { + "epoch": 0.8370004889540827, + "grad_norm": 0.22567002475261688, + "learning_rate": 9.089678705998066e-05, + "loss": 0.2893, + "step": 18830 + }, + { + "epoch": 0.8374449926656887, + "grad_norm": 0.1942245066165924, + "learning_rate": 9.088489285323139e-05, + "loss": 0.288, + "step": 18840 + }, + { + "epoch": 0.8378894963772947, + "grad_norm": 0.21658207476139069, + "learning_rate": 9.087299166037212e-05, + "loss": 0.2884, + "step": 18850 + }, + { + "epoch": 0.8383340000889007, + "grad_norm": 0.1886741816997528, + "learning_rate": 9.086108348343647e-05, + "loss": 0.287, + "step": 18860 + }, + { + "epoch": 0.8387785038005068, + "grad_norm": 0.216348797082901, + "learning_rate": 9.08491683244592e-05, + "loss": 0.2883, + "step": 18870 + }, + { + "epoch": 0.8392230075121128, + "grad_norm": 0.21271423995494843, + "learning_rate": 9.08372461854763e-05, + "loss": 0.2886, + "step": 18880 + }, + { + "epoch": 0.8396675112237187, + "grad_norm": 0.24785968661308289, + "learning_rate": 9.082531706852492e-05, + "loss": 0.2872, + "step": 18890 + }, + { + "epoch": 0.8401120149353247, + "grad_norm": 0.20796790719032288, + "learning_rate": 9.081338097564342e-05, + "loss": 0.2882, + "step": 18900 + }, + { + "epoch": 0.8405565186469307, + "grad_norm": 0.20422984659671783, + "learning_rate": 9.080143790887137e-05, + "loss": 0.2847, + "step": 18910 + }, + { + "epoch": 0.8410010223585367, + "grad_norm": 0.1867852360010147, + "learning_rate": 9.07894878702495e-05, + "loss": 0.2892, + "step": 18920 + }, + { + "epoch": 0.8414455260701427, + "grad_norm": 0.15694192051887512, + "learning_rate": 9.077753086181974e-05, + "loss": 0.2885, + "step": 18930 + }, + { + "epoch": 0.8418900297817486, + "grad_norm": 0.20701149106025696, + "learning_rate": 9.076556688562524e-05, + "loss": 0.2857, + "step": 18940 + }, + { + "epoch": 0.8423345334933546, + "grad_norm": 0.23766520619392395, + "learning_rate": 9.075359594371029e-05, + "loss": 0.2888, + "step": 18950 + }, + { + "epoch": 0.8427790372049606, + "grad_norm": 0.25990429520606995, + "learning_rate": 9.07416180381204e-05, + "loss": 0.2902, + "step": 18960 + }, + { + "epoch": 0.8432235409165667, + "grad_norm": 0.24720941483974457, + "learning_rate": 9.072963317090228e-05, + "loss": 0.2865, + "step": 18970 + }, + { + "epoch": 0.8436680446281727, + "grad_norm": 0.24465782940387726, + "learning_rate": 9.071764134410382e-05, + "loss": 0.2858, + "step": 18980 + }, + { + "epoch": 0.8441125483397787, + "grad_norm": 0.21010565757751465, + "learning_rate": 9.070564255977407e-05, + "loss": 0.288, + "step": 18990 + }, + { + "epoch": 0.8445570520513846, + "grad_norm": 0.2113366276025772, + "learning_rate": 9.06936368199633e-05, + "loss": 0.2892, + "step": 19000 + }, + { + "epoch": 0.8450015557629906, + "grad_norm": 0.22082073986530304, + "learning_rate": 9.0681624126723e-05, + "loss": 0.2852, + "step": 19010 + }, + { + "epoch": 0.8454460594745966, + "grad_norm": 0.21731679141521454, + "learning_rate": 9.066960448210576e-05, + "loss": 0.2903, + "step": 19020 + }, + { + "epoch": 0.8458905631862026, + "grad_norm": 0.23836812376976013, + "learning_rate": 9.065757788816543e-05, + "loss": 0.2903, + "step": 19030 + }, + { + "epoch": 0.8463350668978086, + "grad_norm": 0.2074563354253769, + "learning_rate": 9.064554434695705e-05, + "loss": 0.2845, + "step": 19040 + }, + { + "epoch": 0.8467795706094146, + "grad_norm": 0.19798171520233154, + "learning_rate": 9.063350386053677e-05, + "loss": 0.2861, + "step": 19050 + }, + { + "epoch": 0.8472240743210205, + "grad_norm": 0.19916881620883942, + "learning_rate": 9.062145643096202e-05, + "loss": 0.2853, + "step": 19060 + }, + { + "epoch": 0.8476685780326265, + "grad_norm": 0.2086537629365921, + "learning_rate": 9.060940206029136e-05, + "loss": 0.2901, + "step": 19070 + }, + { + "epoch": 0.8481130817442326, + "grad_norm": 0.251191645860672, + "learning_rate": 9.059734075058457e-05, + "loss": 0.2874, + "step": 19080 + }, + { + "epoch": 0.8485575854558386, + "grad_norm": 0.19416822493076324, + "learning_rate": 9.058527250390257e-05, + "loss": 0.2899, + "step": 19090 + }, + { + "epoch": 0.8490020891674446, + "grad_norm": 0.19989728927612305, + "learning_rate": 9.057319732230752e-05, + "loss": 0.2861, + "step": 19100 + }, + { + "epoch": 0.8494465928790506, + "grad_norm": 0.21339130401611328, + "learning_rate": 9.056111520786273e-05, + "loss": 0.2852, + "step": 19110 + }, + { + "epoch": 0.8498910965906565, + "grad_norm": 0.21671739220619202, + "learning_rate": 9.054902616263268e-05, + "loss": 0.2885, + "step": 19120 + }, + { + "epoch": 0.8503356003022625, + "grad_norm": 0.2141282707452774, + "learning_rate": 9.05369301886831e-05, + "loss": 0.2883, + "step": 19130 + }, + { + "epoch": 0.8507801040138685, + "grad_norm": 0.25111904740333557, + "learning_rate": 9.052482728808083e-05, + "loss": 0.2882, + "step": 19140 + }, + { + "epoch": 0.8512246077254745, + "grad_norm": 0.255504310131073, + "learning_rate": 9.051271746289391e-05, + "loss": 0.2882, + "step": 19150 + }, + { + "epoch": 0.8516691114370805, + "grad_norm": 0.23405325412750244, + "learning_rate": 9.050060071519162e-05, + "loss": 0.2883, + "step": 19160 + }, + { + "epoch": 0.8521136151486864, + "grad_norm": 0.23493774235248566, + "learning_rate": 9.048847704704437e-05, + "loss": 0.2902, + "step": 19170 + }, + { + "epoch": 0.8525581188602925, + "grad_norm": 0.2488742172718048, + "learning_rate": 9.047634646052376e-05, + "loss": 0.2862, + "step": 19180 + }, + { + "epoch": 0.8530026225718985, + "grad_norm": 0.21303194761276245, + "learning_rate": 9.046420895770256e-05, + "loss": 0.2874, + "step": 19190 + }, + { + "epoch": 0.8534471262835045, + "grad_norm": 0.2293306440114975, + "learning_rate": 9.045206454065473e-05, + "loss": 0.2871, + "step": 19200 + }, + { + "epoch": 0.8538916299951105, + "grad_norm": 0.23175828158855438, + "learning_rate": 9.043991321145546e-05, + "loss": 0.2869, + "step": 19210 + }, + { + "epoch": 0.8543361337067165, + "grad_norm": 0.26365622878074646, + "learning_rate": 9.042775497218105e-05, + "loss": 0.2855, + "step": 19220 + }, + { + "epoch": 0.8547806374183224, + "grad_norm": 0.24871045351028442, + "learning_rate": 9.041558982490901e-05, + "loss": 0.2873, + "step": 19230 + }, + { + "epoch": 0.8552251411299284, + "grad_norm": 0.24246707558631897, + "learning_rate": 9.040341777171805e-05, + "loss": 0.2862, + "step": 19240 + }, + { + "epoch": 0.8556696448415344, + "grad_norm": 0.20960356295108795, + "learning_rate": 9.039123881468802e-05, + "loss": 0.2857, + "step": 19250 + }, + { + "epoch": 0.8561141485531404, + "grad_norm": 0.22992296516895294, + "learning_rate": 9.037905295589998e-05, + "loss": 0.2915, + "step": 19260 + }, + { + "epoch": 0.8565586522647464, + "grad_norm": 0.2571873664855957, + "learning_rate": 9.036686019743617e-05, + "loss": 0.2855, + "step": 19270 + }, + { + "epoch": 0.8570031559763523, + "grad_norm": 0.23060892522335052, + "learning_rate": 9.035466054137997e-05, + "loss": 0.2864, + "step": 19280 + }, + { + "epoch": 0.8574476596879584, + "grad_norm": 0.2366427183151245, + "learning_rate": 9.0342453989816e-05, + "loss": 0.2878, + "step": 19290 + }, + { + "epoch": 0.8578921633995644, + "grad_norm": 0.19430424273014069, + "learning_rate": 9.033024054483e-05, + "loss": 0.2875, + "step": 19300 + }, + { + "epoch": 0.8583366671111704, + "grad_norm": 0.22259487211704254, + "learning_rate": 9.031802020850894e-05, + "loss": 0.2877, + "step": 19310 + }, + { + "epoch": 0.8587811708227764, + "grad_norm": 0.21962390840053558, + "learning_rate": 9.030579298294092e-05, + "loss": 0.285, + "step": 19320 + }, + { + "epoch": 0.8592256745343824, + "grad_norm": 0.18831084668636322, + "learning_rate": 9.029355887021524e-05, + "loss": 0.2864, + "step": 19330 + }, + { + "epoch": 0.8596701782459883, + "grad_norm": 0.18497298657894135, + "learning_rate": 9.028131787242238e-05, + "loss": 0.2895, + "step": 19340 + }, + { + "epoch": 0.8601146819575943, + "grad_norm": 0.22880423069000244, + "learning_rate": 9.026906999165399e-05, + "loss": 0.2894, + "step": 19350 + }, + { + "epoch": 0.8605591856692003, + "grad_norm": 0.20703339576721191, + "learning_rate": 9.025681523000291e-05, + "loss": 0.2843, + "step": 19360 + }, + { + "epoch": 0.8610036893808063, + "grad_norm": 0.19785954058170319, + "learning_rate": 9.024455358956315e-05, + "loss": 0.2863, + "step": 19370 + }, + { + "epoch": 0.8614481930924123, + "grad_norm": 0.19115914404392242, + "learning_rate": 9.023228507242984e-05, + "loss": 0.2872, + "step": 19380 + }, + { + "epoch": 0.8618926968040184, + "grad_norm": 0.19641077518463135, + "learning_rate": 9.022000968069937e-05, + "loss": 0.2865, + "step": 19390 + }, + { + "epoch": 0.8623372005156243, + "grad_norm": 0.1780019849538803, + "learning_rate": 9.020772741646928e-05, + "loss": 0.2897, + "step": 19400 + }, + { + "epoch": 0.8627817042272303, + "grad_norm": 0.19793617725372314, + "learning_rate": 9.019543828183826e-05, + "loss": 0.2913, + "step": 19410 + }, + { + "epoch": 0.8632262079388363, + "grad_norm": 0.24847498536109924, + "learning_rate": 9.018314227890616e-05, + "loss": 0.2879, + "step": 19420 + }, + { + "epoch": 0.8636707116504423, + "grad_norm": 0.22530101239681244, + "learning_rate": 9.017083940977408e-05, + "loss": 0.2843, + "step": 19430 + }, + { + "epoch": 0.8641152153620483, + "grad_norm": 0.20458349585533142, + "learning_rate": 9.015852967654422e-05, + "loss": 0.2873, + "step": 19440 + }, + { + "epoch": 0.8645597190736543, + "grad_norm": 0.1975868046283722, + "learning_rate": 9.014621308131996e-05, + "loss": 0.2876, + "step": 19450 + }, + { + "epoch": 0.8650042227852602, + "grad_norm": 0.20350395143032074, + "learning_rate": 9.01338896262059e-05, + "loss": 0.2852, + "step": 19460 + }, + { + "epoch": 0.8654487264968662, + "grad_norm": 0.2227078527212143, + "learning_rate": 9.012155931330777e-05, + "loss": 0.2896, + "step": 19470 + }, + { + "epoch": 0.8658932302084722, + "grad_norm": 0.23490095138549805, + "learning_rate": 9.010922214473246e-05, + "loss": 0.2924, + "step": 19480 + }, + { + "epoch": 0.8663377339200782, + "grad_norm": 0.18287859857082367, + "learning_rate": 9.009687812258808e-05, + "loss": 0.2865, + "step": 19490 + }, + { + "epoch": 0.8667822376316843, + "grad_norm": 0.21488289535045624, + "learning_rate": 9.00845272489839e-05, + "loss": 0.2847, + "step": 19500 + }, + { + "epoch": 0.8672267413432903, + "grad_norm": 0.19956588745117188, + "learning_rate": 9.007216952603031e-05, + "loss": 0.2852, + "step": 19510 + }, + { + "epoch": 0.8676712450548962, + "grad_norm": 0.2144080102443695, + "learning_rate": 9.005980495583894e-05, + "loss": 0.288, + "step": 19520 + }, + { + "epoch": 0.8681157487665022, + "grad_norm": 0.23315469920635223, + "learning_rate": 9.004743354052252e-05, + "loss": 0.2922, + "step": 19530 + }, + { + "epoch": 0.8685602524781082, + "grad_norm": 0.22933147847652435, + "learning_rate": 9.003505528219503e-05, + "loss": 0.2875, + "step": 19540 + }, + { + "epoch": 0.8690047561897142, + "grad_norm": 0.17963457107543945, + "learning_rate": 9.002267018297154e-05, + "loss": 0.2875, + "step": 19550 + }, + { + "epoch": 0.8694492599013202, + "grad_norm": 0.2118120640516281, + "learning_rate": 9.001027824496834e-05, + "loss": 0.2863, + "step": 19560 + }, + { + "epoch": 0.8698937636129261, + "grad_norm": 0.2311461865901947, + "learning_rate": 8.999787947030287e-05, + "loss": 0.2884, + "step": 19570 + }, + { + "epoch": 0.8703382673245321, + "grad_norm": 0.1955631524324417, + "learning_rate": 8.998547386109376e-05, + "loss": 0.2911, + "step": 19580 + }, + { + "epoch": 0.8707827710361381, + "grad_norm": 0.21979399025440216, + "learning_rate": 8.997306141946073e-05, + "loss": 0.2861, + "step": 19590 + }, + { + "epoch": 0.8712272747477442, + "grad_norm": 0.19063378870487213, + "learning_rate": 8.996064214752481e-05, + "loss": 0.2872, + "step": 19600 + }, + { + "epoch": 0.8716717784593502, + "grad_norm": 0.23471404612064362, + "learning_rate": 8.994821604740806e-05, + "loss": 0.2855, + "step": 19610 + }, + { + "epoch": 0.8721162821709562, + "grad_norm": 0.24433661997318268, + "learning_rate": 8.993578312123377e-05, + "loss": 0.2863, + "step": 19620 + }, + { + "epoch": 0.8725607858825621, + "grad_norm": 0.2802627384662628, + "learning_rate": 8.992334337112639e-05, + "loss": 0.2888, + "step": 19630 + }, + { + "epoch": 0.8730052895941681, + "grad_norm": 0.20356890559196472, + "learning_rate": 8.991089679921154e-05, + "loss": 0.287, + "step": 19640 + }, + { + "epoch": 0.8734497933057741, + "grad_norm": 0.18862652778625488, + "learning_rate": 8.989844340761599e-05, + "loss": 0.2875, + "step": 19650 + }, + { + "epoch": 0.8738942970173801, + "grad_norm": 0.20406414568424225, + "learning_rate": 8.988598319846768e-05, + "loss": 0.2896, + "step": 19660 + }, + { + "epoch": 0.8743388007289861, + "grad_norm": 0.24295397102832794, + "learning_rate": 8.987351617389574e-05, + "loss": 0.2895, + "step": 19670 + }, + { + "epoch": 0.874783304440592, + "grad_norm": 0.22150704264640808, + "learning_rate": 8.98610423360304e-05, + "loss": 0.2878, + "step": 19680 + }, + { + "epoch": 0.875227808152198, + "grad_norm": 0.2215898633003235, + "learning_rate": 8.984856168700317e-05, + "loss": 0.289, + "step": 19690 + }, + { + "epoch": 0.875672311863804, + "grad_norm": 0.2159605175256729, + "learning_rate": 8.983607422894658e-05, + "loss": 0.2909, + "step": 19700 + }, + { + "epoch": 0.8761168155754101, + "grad_norm": 0.21004503965377808, + "learning_rate": 8.982357996399442e-05, + "loss": 0.2873, + "step": 19710 + }, + { + "epoch": 0.8765613192870161, + "grad_norm": 0.2196304053068161, + "learning_rate": 8.981107889428164e-05, + "loss": 0.286, + "step": 19720 + }, + { + "epoch": 0.8770058229986221, + "grad_norm": 0.19225017726421356, + "learning_rate": 8.979857102194428e-05, + "loss": 0.2887, + "step": 19730 + }, + { + "epoch": 0.877450326710228, + "grad_norm": 0.2576403021812439, + "learning_rate": 8.978605634911968e-05, + "loss": 0.2914, + "step": 19740 + }, + { + "epoch": 0.877894830421834, + "grad_norm": 0.20059438049793243, + "learning_rate": 8.977353487794616e-05, + "loss": 0.2943, + "step": 19750 + }, + { + "epoch": 0.87833933413344, + "grad_norm": 0.2598167955875397, + "learning_rate": 8.976100661056334e-05, + "loss": 0.2899, + "step": 19760 + }, + { + "epoch": 0.878783837845046, + "grad_norm": 0.2350751757621765, + "learning_rate": 8.974847154911197e-05, + "loss": 0.289, + "step": 19770 + }, + { + "epoch": 0.879228341556652, + "grad_norm": 0.2333267629146576, + "learning_rate": 8.973592969573393e-05, + "loss": 0.2911, + "step": 19780 + }, + { + "epoch": 0.879672845268258, + "grad_norm": 0.2327432632446289, + "learning_rate": 8.972338105257228e-05, + "loss": 0.2881, + "step": 19790 + }, + { + "epoch": 0.8801173489798639, + "grad_norm": 0.24448876082897186, + "learning_rate": 8.971082562177125e-05, + "loss": 0.2884, + "step": 19800 + }, + { + "epoch": 0.88056185269147, + "grad_norm": 0.2166738361120224, + "learning_rate": 8.96982634054762e-05, + "loss": 0.2897, + "step": 19810 + }, + { + "epoch": 0.881006356403076, + "grad_norm": 0.24695725739002228, + "learning_rate": 8.96856944058337e-05, + "loss": 0.2888, + "step": 19820 + }, + { + "epoch": 0.881450860114682, + "grad_norm": 0.20495590567588806, + "learning_rate": 8.967311862499144e-05, + "loss": 0.2906, + "step": 19830 + }, + { + "epoch": 0.881895363826288, + "grad_norm": 0.3439072370529175, + "learning_rate": 8.966053606509825e-05, + "loss": 0.2932, + "step": 19840 + }, + { + "epoch": 0.882339867537894, + "grad_norm": 0.25282931327819824, + "learning_rate": 8.964794672830417e-05, + "loss": 0.2927, + "step": 19850 + }, + { + "epoch": 0.8827843712494999, + "grad_norm": 0.5219501852989197, + "learning_rate": 8.963535061676038e-05, + "loss": 0.2959, + "step": 19860 + }, + { + "epoch": 0.8832288749611059, + "grad_norm": 0.25916600227355957, + "learning_rate": 8.962274773261918e-05, + "loss": 0.2925, + "step": 19870 + }, + { + "epoch": 0.8836733786727119, + "grad_norm": 0.2827596664428711, + "learning_rate": 8.961013807803409e-05, + "loss": 0.2905, + "step": 19880 + }, + { + "epoch": 0.8841178823843179, + "grad_norm": 0.18868356943130493, + "learning_rate": 8.959752165515973e-05, + "loss": 0.2896, + "step": 19890 + }, + { + "epoch": 0.8845623860959239, + "grad_norm": 0.2097701132297516, + "learning_rate": 8.958489846615193e-05, + "loss": 0.2901, + "step": 19900 + }, + { + "epoch": 0.8850068898075298, + "grad_norm": 0.19578172266483307, + "learning_rate": 8.957226851316762e-05, + "loss": 0.2888, + "step": 19910 + }, + { + "epoch": 0.8854513935191359, + "grad_norm": 0.19094161689281464, + "learning_rate": 8.955963179836493e-05, + "loss": 0.2858, + "step": 19920 + }, + { + "epoch": 0.8858958972307419, + "grad_norm": 0.21804337203502655, + "learning_rate": 8.954698832390312e-05, + "loss": 0.2884, + "step": 19930 + }, + { + "epoch": 0.8863404009423479, + "grad_norm": 0.20276577770709991, + "learning_rate": 8.953433809194263e-05, + "loss": 0.2893, + "step": 19940 + }, + { + "epoch": 0.8867849046539539, + "grad_norm": 0.2336679995059967, + "learning_rate": 8.9521681104645e-05, + "loss": 0.2902, + "step": 19950 + }, + { + "epoch": 0.8872294083655599, + "grad_norm": 0.21730323135852814, + "learning_rate": 8.9509017364173e-05, + "loss": 0.2843, + "step": 19960 + }, + { + "epoch": 0.8876739120771658, + "grad_norm": 0.2265961915254593, + "learning_rate": 8.949634687269052e-05, + "loss": 0.2867, + "step": 19970 + }, + { + "epoch": 0.8881184157887718, + "grad_norm": 0.19379612803459167, + "learning_rate": 8.948366963236259e-05, + "loss": 0.2877, + "step": 19980 + }, + { + "epoch": 0.8885629195003778, + "grad_norm": 0.18722079694271088, + "learning_rate": 8.947098564535538e-05, + "loss": 0.2913, + "step": 19990 + }, + { + "epoch": 0.8890074232119838, + "grad_norm": 0.22166478633880615, + "learning_rate": 8.945829491383627e-05, + "loss": 0.2868, + "step": 20000 + }, + { + "epoch": 0.8894519269235898, + "grad_norm": 0.3297671973705292, + "learning_rate": 8.944559743997374e-05, + "loss": 0.2896, + "step": 20010 + }, + { + "epoch": 0.8898964306351959, + "grad_norm": 0.21437719464302063, + "learning_rate": 8.943289322593746e-05, + "loss": 0.2864, + "step": 20020 + }, + { + "epoch": 0.8903409343468018, + "grad_norm": 0.2711682915687561, + "learning_rate": 8.942018227389821e-05, + "loss": 0.2893, + "step": 20030 + }, + { + "epoch": 0.8907854380584078, + "grad_norm": 0.22318241000175476, + "learning_rate": 8.940746458602795e-05, + "loss": 0.2877, + "step": 20040 + }, + { + "epoch": 0.8912299417700138, + "grad_norm": 0.19608239829540253, + "learning_rate": 8.939474016449979e-05, + "loss": 0.2863, + "step": 20050 + }, + { + "epoch": 0.8916744454816198, + "grad_norm": 0.19349351525306702, + "learning_rate": 8.938200901148799e-05, + "loss": 0.2872, + "step": 20060 + }, + { + "epoch": 0.8921189491932258, + "grad_norm": 0.21365799009799957, + "learning_rate": 8.936927112916795e-05, + "loss": 0.2865, + "step": 20070 + }, + { + "epoch": 0.8925634529048317, + "grad_norm": 0.21074442565441132, + "learning_rate": 8.935652651971622e-05, + "loss": 0.2868, + "step": 20080 + }, + { + "epoch": 0.8930079566164377, + "grad_norm": 0.22471585869789124, + "learning_rate": 8.934377518531052e-05, + "loss": 0.2838, + "step": 20090 + }, + { + "epoch": 0.8934524603280437, + "grad_norm": 0.1902826726436615, + "learning_rate": 8.933101712812967e-05, + "loss": 0.2884, + "step": 20100 + }, + { + "epoch": 0.8938969640396497, + "grad_norm": 0.2372112274169922, + "learning_rate": 8.931825235035374e-05, + "loss": 0.2889, + "step": 20110 + }, + { + "epoch": 0.8943414677512557, + "grad_norm": 0.18209533393383026, + "learning_rate": 8.930548085416382e-05, + "loss": 0.2873, + "step": 20120 + }, + { + "epoch": 0.8947859714628618, + "grad_norm": 0.24077290296554565, + "learning_rate": 8.92927026417422e-05, + "loss": 0.2853, + "step": 20130 + }, + { + "epoch": 0.8952304751744677, + "grad_norm": 0.22887621819972992, + "learning_rate": 8.92799177152724e-05, + "loss": 0.2859, + "step": 20140 + }, + { + "epoch": 0.8956749788860737, + "grad_norm": 0.20180678367614746, + "learning_rate": 8.926712607693895e-05, + "loss": 0.2899, + "step": 20150 + }, + { + "epoch": 0.8961194825976797, + "grad_norm": 0.20313097536563873, + "learning_rate": 8.925432772892762e-05, + "loss": 0.2897, + "step": 20160 + }, + { + "epoch": 0.8965639863092857, + "grad_norm": 0.18204542994499207, + "learning_rate": 8.924152267342529e-05, + "loss": 0.2887, + "step": 20170 + }, + { + "epoch": 0.8970084900208917, + "grad_norm": 0.19537118077278137, + "learning_rate": 8.922871091261998e-05, + "loss": 0.2906, + "step": 20180 + }, + { + "epoch": 0.8974529937324977, + "grad_norm": 0.17112915217876434, + "learning_rate": 8.92158924487009e-05, + "loss": 0.2869, + "step": 20190 + }, + { + "epoch": 0.8978974974441036, + "grad_norm": 0.20942117273807526, + "learning_rate": 8.920306728385834e-05, + "loss": 0.291, + "step": 20200 + }, + { + "epoch": 0.8983420011557096, + "grad_norm": 0.1961066573858261, + "learning_rate": 8.919023542028379e-05, + "loss": 0.2887, + "step": 20210 + }, + { + "epoch": 0.8987865048673156, + "grad_norm": 0.2078867405653, + "learning_rate": 8.917739686016988e-05, + "loss": 0.2901, + "step": 20220 + }, + { + "epoch": 0.8992310085789217, + "grad_norm": 0.2107987105846405, + "learning_rate": 8.916455160571033e-05, + "loss": 0.2867, + "step": 20230 + }, + { + "epoch": 0.8996755122905277, + "grad_norm": 0.19251300394535065, + "learning_rate": 8.915169965910008e-05, + "loss": 0.2843, + "step": 20240 + }, + { + "epoch": 0.9001200160021336, + "grad_norm": 0.19603776931762695, + "learning_rate": 8.913884102253514e-05, + "loss": 0.2847, + "step": 20250 + }, + { + "epoch": 0.9005645197137396, + "grad_norm": 0.206146240234375, + "learning_rate": 8.912597569821273e-05, + "loss": 0.2864, + "step": 20260 + }, + { + "epoch": 0.9010090234253456, + "grad_norm": 0.2036316990852356, + "learning_rate": 8.911310368833118e-05, + "loss": 0.29, + "step": 20270 + }, + { + "epoch": 0.9014535271369516, + "grad_norm": 0.22867105901241302, + "learning_rate": 8.910022499508994e-05, + "loss": 0.2869, + "step": 20280 + }, + { + "epoch": 0.9018980308485576, + "grad_norm": 0.23685576021671295, + "learning_rate": 8.908733962068965e-05, + "loss": 0.2851, + "step": 20290 + }, + { + "epoch": 0.9023425345601636, + "grad_norm": 0.2348591387271881, + "learning_rate": 8.907444756733207e-05, + "loss": 0.2869, + "step": 20300 + }, + { + "epoch": 0.9027870382717695, + "grad_norm": 0.22354812920093536, + "learning_rate": 8.906154883722006e-05, + "loss": 0.2867, + "step": 20310 + }, + { + "epoch": 0.9032315419833755, + "grad_norm": 0.23438198864459991, + "learning_rate": 8.904864343255773e-05, + "loss": 0.2841, + "step": 20320 + }, + { + "epoch": 0.9036760456949815, + "grad_norm": 0.22021642327308655, + "learning_rate": 8.90357313555502e-05, + "loss": 0.2878, + "step": 20330 + }, + { + "epoch": 0.9041205494065876, + "grad_norm": 0.20101749897003174, + "learning_rate": 8.90228126084038e-05, + "loss": 0.2891, + "step": 20340 + }, + { + "epoch": 0.9045650531181936, + "grad_norm": 0.22642160952091217, + "learning_rate": 8.900988719332601e-05, + "loss": 0.2846, + "step": 20350 + }, + { + "epoch": 0.9050095568297996, + "grad_norm": 0.191105455160141, + "learning_rate": 8.899695511252542e-05, + "loss": 0.2892, + "step": 20360 + }, + { + "epoch": 0.9054540605414055, + "grad_norm": 0.29583021998405457, + "learning_rate": 8.898401636821176e-05, + "loss": 0.2877, + "step": 20370 + }, + { + "epoch": 0.9058985642530115, + "grad_norm": 0.19929298758506775, + "learning_rate": 8.897107096259593e-05, + "loss": 0.2845, + "step": 20380 + }, + { + "epoch": 0.9063430679646175, + "grad_norm": 0.23036779463291168, + "learning_rate": 8.895811889788994e-05, + "loss": 0.2861, + "step": 20390 + }, + { + "epoch": 0.9067875716762235, + "grad_norm": 0.2615116536617279, + "learning_rate": 8.894516017630692e-05, + "loss": 0.2866, + "step": 20400 + }, + { + "epoch": 0.9072320753878295, + "grad_norm": 0.22640827298164368, + "learning_rate": 8.893219480006118e-05, + "loss": 0.2867, + "step": 20410 + }, + { + "epoch": 0.9076765790994354, + "grad_norm": 0.19286619126796722, + "learning_rate": 8.891922277136817e-05, + "loss": 0.2841, + "step": 20420 + }, + { + "epoch": 0.9081210828110414, + "grad_norm": 0.19050325453281403, + "learning_rate": 8.890624409244441e-05, + "loss": 0.287, + "step": 20430 + }, + { + "epoch": 0.9085655865226475, + "grad_norm": 0.19482602179050446, + "learning_rate": 8.889325876550763e-05, + "loss": 0.2851, + "step": 20440 + }, + { + "epoch": 0.9090100902342535, + "grad_norm": 0.19766418635845184, + "learning_rate": 8.888026679277666e-05, + "loss": 0.2855, + "step": 20450 + }, + { + "epoch": 0.9094545939458595, + "grad_norm": 0.21815912425518036, + "learning_rate": 8.886726817647147e-05, + "loss": 0.2877, + "step": 20460 + }, + { + "epoch": 0.9098990976574655, + "grad_norm": 0.2000798135995865, + "learning_rate": 8.885426291881319e-05, + "loss": 0.2853, + "step": 20470 + }, + { + "epoch": 0.9103436013690714, + "grad_norm": 0.2354675829410553, + "learning_rate": 8.884125102202401e-05, + "loss": 0.2896, + "step": 20480 + }, + { + "epoch": 0.9107881050806774, + "grad_norm": 0.19726552069187164, + "learning_rate": 8.882823248832736e-05, + "loss": 0.2861, + "step": 20490 + }, + { + "epoch": 0.9112326087922834, + "grad_norm": 0.17359787225723267, + "learning_rate": 8.881520731994772e-05, + "loss": 0.2855, + "step": 20500 + }, + { + "epoch": 0.9116771125038894, + "grad_norm": 0.21020789444446564, + "learning_rate": 8.880217551911077e-05, + "loss": 0.2879, + "step": 20510 + }, + { + "epoch": 0.9121216162154954, + "grad_norm": 0.19445955753326416, + "learning_rate": 8.878913708804323e-05, + "loss": 0.2866, + "step": 20520 + }, + { + "epoch": 0.9125661199271013, + "grad_norm": 0.18668349087238312, + "learning_rate": 8.877609202897308e-05, + "loss": 0.2837, + "step": 20530 + }, + { + "epoch": 0.9130106236387073, + "grad_norm": 0.2054426372051239, + "learning_rate": 8.876304034412933e-05, + "loss": 0.2888, + "step": 20540 + }, + { + "epoch": 0.9134551273503134, + "grad_norm": 0.19913633167743683, + "learning_rate": 8.874998203574214e-05, + "loss": 0.2876, + "step": 20550 + }, + { + "epoch": 0.9138996310619194, + "grad_norm": 0.28192493319511414, + "learning_rate": 8.873691710604284e-05, + "loss": 0.2841, + "step": 20560 + }, + { + "epoch": 0.9143441347735254, + "grad_norm": 0.21379776298999786, + "learning_rate": 8.872384555726387e-05, + "loss": 0.2857, + "step": 20570 + }, + { + "epoch": 0.9147886384851314, + "grad_norm": 0.22697536647319794, + "learning_rate": 8.871076739163878e-05, + "loss": 0.2862, + "step": 20580 + }, + { + "epoch": 0.9152331421967373, + "grad_norm": 0.20832444727420807, + "learning_rate": 8.86976826114023e-05, + "loss": 0.2886, + "step": 20590 + }, + { + "epoch": 0.9156776459083433, + "grad_norm": 0.1741347759962082, + "learning_rate": 8.868459121879023e-05, + "loss": 0.2888, + "step": 20600 + }, + { + "epoch": 0.9161221496199493, + "grad_norm": 0.21149474382400513, + "learning_rate": 8.867149321603956e-05, + "loss": 0.288, + "step": 20610 + }, + { + "epoch": 0.9165666533315553, + "grad_norm": 0.20199015736579895, + "learning_rate": 8.865838860538835e-05, + "loss": 0.2869, + "step": 20620 + }, + { + "epoch": 0.9170111570431613, + "grad_norm": 0.19953487813472748, + "learning_rate": 8.864527738907585e-05, + "loss": 0.2874, + "step": 20630 + }, + { + "epoch": 0.9174556607547673, + "grad_norm": 0.2081412971019745, + "learning_rate": 8.863215956934239e-05, + "loss": 0.2873, + "step": 20640 + }, + { + "epoch": 0.9179001644663733, + "grad_norm": 0.21652430295944214, + "learning_rate": 8.861903514842947e-05, + "loss": 0.2823, + "step": 20650 + }, + { + "epoch": 0.9183446681779793, + "grad_norm": 0.22786001861095428, + "learning_rate": 8.860590412857966e-05, + "loss": 0.2865, + "step": 20660 + }, + { + "epoch": 0.9187891718895853, + "grad_norm": 0.22002747654914856, + "learning_rate": 8.85927665120367e-05, + "loss": 0.2855, + "step": 20670 + }, + { + "epoch": 0.9192336756011913, + "grad_norm": 0.22682002186775208, + "learning_rate": 8.857962230104546e-05, + "loss": 0.2836, + "step": 20680 + }, + { + "epoch": 0.9196781793127973, + "grad_norm": 0.20121242105960846, + "learning_rate": 8.856647149785193e-05, + "loss": 0.2835, + "step": 20690 + }, + { + "epoch": 0.9201226830244033, + "grad_norm": 0.19886410236358643, + "learning_rate": 8.855331410470322e-05, + "loss": 0.2855, + "step": 20700 + }, + { + "epoch": 0.9205671867360092, + "grad_norm": 0.20569437742233276, + "learning_rate": 8.854015012384756e-05, + "loss": 0.2867, + "step": 20710 + }, + { + "epoch": 0.9210116904476152, + "grad_norm": 0.2405637949705124, + "learning_rate": 8.852697955753433e-05, + "loss": 0.2851, + "step": 20720 + }, + { + "epoch": 0.9214561941592212, + "grad_norm": 0.18456090986728668, + "learning_rate": 8.851380240801399e-05, + "loss": 0.2847, + "step": 20730 + }, + { + "epoch": 0.9219006978708272, + "grad_norm": 0.18014436960220337, + "learning_rate": 8.850061867753818e-05, + "loss": 0.2857, + "step": 20740 + }, + { + "epoch": 0.9223452015824332, + "grad_norm": 0.19033090770244598, + "learning_rate": 8.848742836835963e-05, + "loss": 0.2832, + "step": 20750 + }, + { + "epoch": 0.9227897052940393, + "grad_norm": 0.22539082169532776, + "learning_rate": 8.847423148273221e-05, + "loss": 0.288, + "step": 20760 + }, + { + "epoch": 0.9232342090056452, + "grad_norm": 0.17783905565738678, + "learning_rate": 8.846102802291092e-05, + "loss": 0.2848, + "step": 20770 + }, + { + "epoch": 0.9236787127172512, + "grad_norm": 0.18219666182994843, + "learning_rate": 8.844781799115183e-05, + "loss": 0.2885, + "step": 20780 + }, + { + "epoch": 0.9241232164288572, + "grad_norm": 0.22263391315937042, + "learning_rate": 8.84346013897122e-05, + "loss": 0.2883, + "step": 20790 + }, + { + "epoch": 0.9245677201404632, + "grad_norm": 0.2174525111913681, + "learning_rate": 8.842137822085038e-05, + "loss": 0.2879, + "step": 20800 + }, + { + "epoch": 0.9250122238520692, + "grad_norm": 0.19990549981594086, + "learning_rate": 8.840814848682585e-05, + "loss": 0.2868, + "step": 20810 + }, + { + "epoch": 0.9254567275636751, + "grad_norm": 0.2042006254196167, + "learning_rate": 8.83949121898992e-05, + "loss": 0.2852, + "step": 20820 + }, + { + "epoch": 0.9259012312752811, + "grad_norm": 0.21485215425491333, + "learning_rate": 8.838166933233217e-05, + "loss": 0.2878, + "step": 20830 + }, + { + "epoch": 0.9263457349868871, + "grad_norm": 0.2129141390323639, + "learning_rate": 8.83684199163876e-05, + "loss": 0.2853, + "step": 20840 + }, + { + "epoch": 0.9267902386984931, + "grad_norm": 0.2036207616329193, + "learning_rate": 8.835516394432943e-05, + "loss": 0.2866, + "step": 20850 + }, + { + "epoch": 0.9272347424100992, + "grad_norm": 0.2066977322101593, + "learning_rate": 8.834190141842276e-05, + "loss": 0.2831, + "step": 20860 + }, + { + "epoch": 0.9276792461217052, + "grad_norm": 0.19915182888507843, + "learning_rate": 8.83286323409338e-05, + "loss": 0.2868, + "step": 20870 + }, + { + "epoch": 0.9281237498333111, + "grad_norm": 0.2636634111404419, + "learning_rate": 8.831535671412986e-05, + "loss": 0.2868, + "step": 20880 + }, + { + "epoch": 0.9285682535449171, + "grad_norm": 0.21008621156215668, + "learning_rate": 8.830207454027938e-05, + "loss": 0.2842, + "step": 20890 + }, + { + "epoch": 0.9290127572565231, + "grad_norm": 0.20558589696884155, + "learning_rate": 8.828878582165192e-05, + "loss": 0.2895, + "step": 20900 + }, + { + "epoch": 0.9294572609681291, + "grad_norm": 0.2073362022638321, + "learning_rate": 8.827549056051818e-05, + "loss": 0.2844, + "step": 20910 + }, + { + "epoch": 0.9299017646797351, + "grad_norm": 0.20125731825828552, + "learning_rate": 8.826218875914993e-05, + "loss": 0.2875, + "step": 20920 + }, + { + "epoch": 0.930346268391341, + "grad_norm": 0.19267988204956055, + "learning_rate": 8.82488804198201e-05, + "loss": 0.2866, + "step": 20930 + }, + { + "epoch": 0.930790772102947, + "grad_norm": 0.19301755726337433, + "learning_rate": 8.82355655448027e-05, + "loss": 0.2877, + "step": 20940 + }, + { + "epoch": 0.931235275814553, + "grad_norm": 0.23200508952140808, + "learning_rate": 8.822224413637293e-05, + "loss": 0.2865, + "step": 20950 + }, + { + "epoch": 0.931679779526159, + "grad_norm": 0.19400951266288757, + "learning_rate": 8.820891619680697e-05, + "loss": 0.2885, + "step": 20960 + }, + { + "epoch": 0.9321242832377651, + "grad_norm": 0.20186099410057068, + "learning_rate": 8.819558172838227e-05, + "loss": 0.2865, + "step": 20970 + }, + { + "epoch": 0.9325687869493711, + "grad_norm": 0.24447843432426453, + "learning_rate": 8.818224073337731e-05, + "loss": 0.2888, + "step": 20980 + }, + { + "epoch": 0.933013290660977, + "grad_norm": 0.21228766441345215, + "learning_rate": 8.816889321407169e-05, + "loss": 0.2882, + "step": 20990 + }, + { + "epoch": 0.933457794372583, + "grad_norm": 0.197832852602005, + "learning_rate": 8.815553917274615e-05, + "loss": 0.2894, + "step": 21000 + }, + { + "epoch": 0.933902298084189, + "grad_norm": 0.2214290201663971, + "learning_rate": 8.81421786116825e-05, + "loss": 0.2864, + "step": 21010 + }, + { + "epoch": 0.934346801795795, + "grad_norm": 0.23728783428668976, + "learning_rate": 8.812881153316373e-05, + "loss": 0.2845, + "step": 21020 + }, + { + "epoch": 0.934791305507401, + "grad_norm": 0.1808113157749176, + "learning_rate": 8.81154379394739e-05, + "loss": 0.2862, + "step": 21030 + }, + { + "epoch": 0.935235809219007, + "grad_norm": 0.2187618911266327, + "learning_rate": 8.810205783289818e-05, + "loss": 0.2887, + "step": 21040 + }, + { + "epoch": 0.9356803129306129, + "grad_norm": 0.20043271780014038, + "learning_rate": 8.808867121572286e-05, + "loss": 0.2854, + "step": 21050 + }, + { + "epoch": 0.9361248166422189, + "grad_norm": 0.2308405339717865, + "learning_rate": 8.807527809023537e-05, + "loss": 0.2876, + "step": 21060 + }, + { + "epoch": 0.936569320353825, + "grad_norm": 0.20436887443065643, + "learning_rate": 8.80618784587242e-05, + "loss": 0.2822, + "step": 21070 + }, + { + "epoch": 0.937013824065431, + "grad_norm": 0.22363734245300293, + "learning_rate": 8.804847232347902e-05, + "loss": 0.2876, + "step": 21080 + }, + { + "epoch": 0.937458327777037, + "grad_norm": 0.21201792359352112, + "learning_rate": 8.803505968679054e-05, + "loss": 0.2824, + "step": 21090 + }, + { + "epoch": 0.937902831488643, + "grad_norm": 0.21553900837898254, + "learning_rate": 8.802164055095061e-05, + "loss": 0.283, + "step": 21100 + }, + { + "epoch": 0.9383473352002489, + "grad_norm": 0.18418435752391815, + "learning_rate": 8.80082149182522e-05, + "loss": 0.286, + "step": 21110 + }, + { + "epoch": 0.9387918389118549, + "grad_norm": 0.21942591667175293, + "learning_rate": 8.79947827909894e-05, + "loss": 0.2888, + "step": 21120 + }, + { + "epoch": 0.9392363426234609, + "grad_norm": 0.18376722931861877, + "learning_rate": 8.798134417145738e-05, + "loss": 0.2845, + "step": 21130 + }, + { + "epoch": 0.9396808463350669, + "grad_norm": 0.21087674796581268, + "learning_rate": 8.796789906195243e-05, + "loss": 0.285, + "step": 21140 + }, + { + "epoch": 0.9401253500466729, + "grad_norm": 0.22102558612823486, + "learning_rate": 8.795444746477195e-05, + "loss": 0.2832, + "step": 21150 + }, + { + "epoch": 0.9405698537582788, + "grad_norm": 0.1888544261455536, + "learning_rate": 8.794098938221446e-05, + "loss": 0.285, + "step": 21160 + }, + { + "epoch": 0.9410143574698848, + "grad_norm": 0.22328124940395355, + "learning_rate": 8.792752481657957e-05, + "loss": 0.2863, + "step": 21170 + }, + { + "epoch": 0.9414588611814909, + "grad_norm": 0.20102040469646454, + "learning_rate": 8.791405377016802e-05, + "loss": 0.2858, + "step": 21180 + }, + { + "epoch": 0.9419033648930969, + "grad_norm": 0.20136421918869019, + "learning_rate": 8.790057624528163e-05, + "loss": 0.2865, + "step": 21190 + }, + { + "epoch": 0.9423478686047029, + "grad_norm": 0.2270604819059372, + "learning_rate": 8.788709224422333e-05, + "loss": 0.2853, + "step": 21200 + }, + { + "epoch": 0.9427923723163089, + "grad_norm": 0.2150307148694992, + "learning_rate": 8.787360176929717e-05, + "loss": 0.2859, + "step": 21210 + }, + { + "epoch": 0.9432368760279148, + "grad_norm": 0.16790537536144257, + "learning_rate": 8.786010482280834e-05, + "loss": 0.2845, + "step": 21220 + }, + { + "epoch": 0.9436813797395208, + "grad_norm": 0.2059943825006485, + "learning_rate": 8.784660140706306e-05, + "loss": 0.2847, + "step": 21230 + }, + { + "epoch": 0.9441258834511268, + "grad_norm": 0.2159920185804367, + "learning_rate": 8.783309152436872e-05, + "loss": 0.2898, + "step": 21240 + }, + { + "epoch": 0.9445703871627328, + "grad_norm": 0.22507859766483307, + "learning_rate": 8.781957517703375e-05, + "loss": 0.2861, + "step": 21250 + }, + { + "epoch": 0.9450148908743388, + "grad_norm": 0.19967587292194366, + "learning_rate": 8.780605236736776e-05, + "loss": 0.2868, + "step": 21260 + }, + { + "epoch": 0.9454593945859447, + "grad_norm": 0.25251051783561707, + "learning_rate": 8.779252309768142e-05, + "loss": 0.2871, + "step": 21270 + }, + { + "epoch": 0.9459038982975508, + "grad_norm": 0.2014295756816864, + "learning_rate": 8.777898737028652e-05, + "loss": 0.2877, + "step": 21280 + }, + { + "epoch": 0.9463484020091568, + "grad_norm": 0.22905200719833374, + "learning_rate": 8.776544518749591e-05, + "loss": 0.2852, + "step": 21290 + }, + { + "epoch": 0.9467929057207628, + "grad_norm": 0.23555803298950195, + "learning_rate": 8.775189655162364e-05, + "loss": 0.2878, + "step": 21300 + }, + { + "epoch": 0.9472374094323688, + "grad_norm": 0.21071316301822662, + "learning_rate": 8.773834146498474e-05, + "loss": 0.2823, + "step": 21310 + }, + { + "epoch": 0.9476819131439748, + "grad_norm": 0.2267817258834839, + "learning_rate": 8.772477992989545e-05, + "loss": 0.2897, + "step": 21320 + }, + { + "epoch": 0.9481264168555807, + "grad_norm": 0.20504282414913177, + "learning_rate": 8.771121194867304e-05, + "loss": 0.2841, + "step": 21330 + }, + { + "epoch": 0.9485709205671867, + "grad_norm": 0.1931641548871994, + "learning_rate": 8.769763752363589e-05, + "loss": 0.2827, + "step": 21340 + }, + { + "epoch": 0.9490154242787927, + "grad_norm": 0.20650248229503632, + "learning_rate": 8.768405665710352e-05, + "loss": 0.2829, + "step": 21350 + }, + { + "epoch": 0.9494599279903987, + "grad_norm": 0.1851925253868103, + "learning_rate": 8.767046935139655e-05, + "loss": 0.2873, + "step": 21360 + }, + { + "epoch": 0.9499044317020047, + "grad_norm": 0.19969575107097626, + "learning_rate": 8.765687560883666e-05, + "loss": 0.2878, + "step": 21370 + }, + { + "epoch": 0.9503489354136107, + "grad_norm": 0.19786034524440765, + "learning_rate": 8.764327543174664e-05, + "loss": 0.2839, + "step": 21380 + }, + { + "epoch": 0.9507934391252167, + "grad_norm": 0.20554155111312866, + "learning_rate": 8.762966882245038e-05, + "loss": 0.2852, + "step": 21390 + }, + { + "epoch": 0.9512379428368227, + "grad_norm": 0.19918090105056763, + "learning_rate": 8.761605578327291e-05, + "loss": 0.2857, + "step": 21400 + }, + { + "epoch": 0.9516824465484287, + "grad_norm": 0.24046184122562408, + "learning_rate": 8.76024363165403e-05, + "loss": 0.2864, + "step": 21410 + }, + { + "epoch": 0.9521269502600347, + "grad_norm": 0.23031142354011536, + "learning_rate": 8.758881042457976e-05, + "loss": 0.2857, + "step": 21420 + }, + { + "epoch": 0.9525714539716407, + "grad_norm": 0.24358920753002167, + "learning_rate": 8.757517810971957e-05, + "loss": 0.285, + "step": 21430 + }, + { + "epoch": 0.9530159576832467, + "grad_norm": 0.2229328751564026, + "learning_rate": 8.756153937428913e-05, + "loss": 0.2852, + "step": 21440 + }, + { + "epoch": 0.9534604613948526, + "grad_norm": 0.19420605897903442, + "learning_rate": 8.754789422061889e-05, + "loss": 0.2886, + "step": 21450 + }, + { + "epoch": 0.9539049651064586, + "grad_norm": 0.20622779428958893, + "learning_rate": 8.753424265104052e-05, + "loss": 0.2878, + "step": 21460 + }, + { + "epoch": 0.9543494688180646, + "grad_norm": 0.2746242880821228, + "learning_rate": 8.752058466788659e-05, + "loss": 0.2891, + "step": 21470 + }, + { + "epoch": 0.9547939725296706, + "grad_norm": 0.248408704996109, + "learning_rate": 8.750692027349097e-05, + "loss": 0.2833, + "step": 21480 + }, + { + "epoch": 0.9552384762412767, + "grad_norm": 0.20504093170166016, + "learning_rate": 8.749324947018847e-05, + "loss": 0.2877, + "step": 21490 + }, + { + "epoch": 0.9556829799528826, + "grad_norm": 0.23412281274795532, + "learning_rate": 8.747957226031507e-05, + "loss": 0.2856, + "step": 21500 + }, + { + "epoch": 0.9561274836644886, + "grad_norm": 0.22349660098552704, + "learning_rate": 8.746588864620787e-05, + "loss": 0.2864, + "step": 21510 + }, + { + "epoch": 0.9565719873760946, + "grad_norm": 0.19877462089061737, + "learning_rate": 8.745219863020498e-05, + "loss": 0.2917, + "step": 21520 + }, + { + "epoch": 0.9570164910877006, + "grad_norm": 0.19365739822387695, + "learning_rate": 8.743850221464564e-05, + "loss": 0.2844, + "step": 21530 + }, + { + "epoch": 0.9574609947993066, + "grad_norm": 0.21093305945396423, + "learning_rate": 8.742479940187026e-05, + "loss": 0.2858, + "step": 21540 + }, + { + "epoch": 0.9579054985109126, + "grad_norm": 0.23383134603500366, + "learning_rate": 8.74110901942202e-05, + "loss": 0.2863, + "step": 21550 + }, + { + "epoch": 0.9583500022225185, + "grad_norm": 0.2400875985622406, + "learning_rate": 8.739737459403803e-05, + "loss": 0.2875, + "step": 21560 + }, + { + "epoch": 0.9587945059341245, + "grad_norm": 0.14184463024139404, + "learning_rate": 8.738365260366737e-05, + "loss": 0.2845, + "step": 21570 + }, + { + "epoch": 0.9592390096457305, + "grad_norm": 0.21453776955604553, + "learning_rate": 8.736992422545292e-05, + "loss": 0.2858, + "step": 21580 + }, + { + "epoch": 0.9596835133573365, + "grad_norm": 0.22823677957057953, + "learning_rate": 8.73561894617405e-05, + "loss": 0.2876, + "step": 21590 + }, + { + "epoch": 0.9601280170689426, + "grad_norm": 0.18831486999988556, + "learning_rate": 8.734244831487697e-05, + "loss": 0.2817, + "step": 21600 + }, + { + "epoch": 0.9605725207805486, + "grad_norm": 0.18803910911083221, + "learning_rate": 8.732870078721035e-05, + "loss": 0.2859, + "step": 21610 + }, + { + "epoch": 0.9610170244921545, + "grad_norm": 0.21205885708332062, + "learning_rate": 8.731494688108972e-05, + "loss": 0.2847, + "step": 21620 + }, + { + "epoch": 0.9614615282037605, + "grad_norm": 0.19544701278209686, + "learning_rate": 8.730118659886523e-05, + "loss": 0.2889, + "step": 21630 + }, + { + "epoch": 0.9619060319153665, + "grad_norm": 0.1947368085384369, + "learning_rate": 8.728741994288814e-05, + "loss": 0.2867, + "step": 21640 + }, + { + "epoch": 0.9623505356269725, + "grad_norm": 0.18392261862754822, + "learning_rate": 8.727364691551079e-05, + "loss": 0.2857, + "step": 21650 + }, + { + "epoch": 0.9627950393385785, + "grad_norm": 0.16835522651672363, + "learning_rate": 8.725986751908661e-05, + "loss": 0.284, + "step": 21660 + }, + { + "epoch": 0.9632395430501844, + "grad_norm": 0.21965856850147247, + "learning_rate": 8.724608175597016e-05, + "loss": 0.2882, + "step": 21670 + }, + { + "epoch": 0.9636840467617904, + "grad_norm": 0.2219519168138504, + "learning_rate": 8.723228962851699e-05, + "loss": 0.2836, + "step": 21680 + }, + { + "epoch": 0.9641285504733964, + "grad_norm": 0.20163682103157043, + "learning_rate": 8.721849113908385e-05, + "loss": 0.2872, + "step": 21690 + }, + { + "epoch": 0.9645730541850025, + "grad_norm": 0.21122638881206512, + "learning_rate": 8.720468629002848e-05, + "loss": 0.2854, + "step": 21700 + }, + { + "epoch": 0.9650175578966085, + "grad_norm": 0.226548969745636, + "learning_rate": 8.719087508370978e-05, + "loss": 0.285, + "step": 21710 + }, + { + "epoch": 0.9654620616082145, + "grad_norm": 0.19882342219352722, + "learning_rate": 8.717705752248772e-05, + "loss": 0.2855, + "step": 21720 + }, + { + "epoch": 0.9659065653198204, + "grad_norm": 0.196372389793396, + "learning_rate": 8.71632336087233e-05, + "loss": 0.2841, + "step": 21730 + }, + { + "epoch": 0.9663510690314264, + "grad_norm": 0.1907067894935608, + "learning_rate": 8.71494033447787e-05, + "loss": 0.2849, + "step": 21740 + }, + { + "epoch": 0.9667955727430324, + "grad_norm": 0.206154003739357, + "learning_rate": 8.713556673301708e-05, + "loss": 0.2812, + "step": 21750 + }, + { + "epoch": 0.9672400764546384, + "grad_norm": 0.21554410457611084, + "learning_rate": 8.712172377580278e-05, + "loss": 0.2895, + "step": 21760 + }, + { + "epoch": 0.9676845801662444, + "grad_norm": 0.20774368941783905, + "learning_rate": 8.710787447550114e-05, + "loss": 0.2848, + "step": 21770 + }, + { + "epoch": 0.9681290838778503, + "grad_norm": 0.17254792153835297, + "learning_rate": 8.70940188344787e-05, + "loss": 0.2838, + "step": 21780 + }, + { + "epoch": 0.9685735875894563, + "grad_norm": 0.2150905877351761, + "learning_rate": 8.708015685510293e-05, + "loss": 0.2832, + "step": 21790 + }, + { + "epoch": 0.9690180913010623, + "grad_norm": 0.23857443034648895, + "learning_rate": 8.706628853974252e-05, + "loss": 0.2834, + "step": 21800 + }, + { + "epoch": 0.9694625950126684, + "grad_norm": 0.23214176297187805, + "learning_rate": 8.705241389076715e-05, + "loss": 0.2877, + "step": 21810 + }, + { + "epoch": 0.9699070987242744, + "grad_norm": 0.2012288123369217, + "learning_rate": 8.703853291054764e-05, + "loss": 0.2845, + "step": 21820 + }, + { + "epoch": 0.9703516024358804, + "grad_norm": 0.23750294744968414, + "learning_rate": 8.702464560145587e-05, + "loss": 0.2887, + "step": 21830 + }, + { + "epoch": 0.9707961061474863, + "grad_norm": 0.2570488452911377, + "learning_rate": 8.701075196586476e-05, + "loss": 0.2834, + "step": 21840 + }, + { + "epoch": 0.9712406098590923, + "grad_norm": 0.2311050444841385, + "learning_rate": 8.699685200614842e-05, + "loss": 0.2875, + "step": 21850 + }, + { + "epoch": 0.9716851135706983, + "grad_norm": 0.22391481697559357, + "learning_rate": 8.698294572468193e-05, + "loss": 0.2842, + "step": 21860 + }, + { + "epoch": 0.9721296172823043, + "grad_norm": 0.2376796305179596, + "learning_rate": 8.696903312384148e-05, + "loss": 0.2865, + "step": 21870 + }, + { + "epoch": 0.9725741209939103, + "grad_norm": 0.22527989745140076, + "learning_rate": 8.695511420600439e-05, + "loss": 0.2842, + "step": 21880 + }, + { + "epoch": 0.9730186247055163, + "grad_norm": 0.24586571753025055, + "learning_rate": 8.694118897354901e-05, + "loss": 0.2881, + "step": 21890 + }, + { + "epoch": 0.9734631284171222, + "grad_norm": 0.19603630900382996, + "learning_rate": 8.692725742885478e-05, + "loss": 0.2848, + "step": 21900 + }, + { + "epoch": 0.9739076321287283, + "grad_norm": 0.1776791512966156, + "learning_rate": 8.691331957430221e-05, + "loss": 0.2845, + "step": 21910 + }, + { + "epoch": 0.9743521358403343, + "grad_norm": 0.2423281967639923, + "learning_rate": 8.68993754122729e-05, + "loss": 0.2845, + "step": 21920 + }, + { + "epoch": 0.9747966395519403, + "grad_norm": 0.22268106043338776, + "learning_rate": 8.688542494514955e-05, + "loss": 0.2878, + "step": 21930 + }, + { + "epoch": 0.9752411432635463, + "grad_norm": 0.23511677980422974, + "learning_rate": 8.68714681753159e-05, + "loss": 0.2883, + "step": 21940 + }, + { + "epoch": 0.9756856469751523, + "grad_norm": 0.18719729781150818, + "learning_rate": 8.685750510515676e-05, + "loss": 0.2855, + "step": 21950 + }, + { + "epoch": 0.9761301506867582, + "grad_norm": 0.22939549386501312, + "learning_rate": 8.684353573705805e-05, + "loss": 0.2851, + "step": 21960 + }, + { + "epoch": 0.9765746543983642, + "grad_norm": 0.20325087010860443, + "learning_rate": 8.682956007340677e-05, + "loss": 0.2806, + "step": 21970 + }, + { + "epoch": 0.9770191581099702, + "grad_norm": 0.26367679238319397, + "learning_rate": 8.681557811659095e-05, + "loss": 0.2855, + "step": 21980 + }, + { + "epoch": 0.9774636618215762, + "grad_norm": 0.21626457571983337, + "learning_rate": 8.680158986899974e-05, + "loss": 0.2837, + "step": 21990 + }, + { + "epoch": 0.9779081655331822, + "grad_norm": 0.22004662454128265, + "learning_rate": 8.678759533302335e-05, + "loss": 0.2867, + "step": 22000 + }, + { + "epoch": 0.9783526692447881, + "grad_norm": 0.19084636867046356, + "learning_rate": 8.677359451105308e-05, + "loss": 0.2855, + "step": 22010 + }, + { + "epoch": 0.9787971729563942, + "grad_norm": 0.17401841282844543, + "learning_rate": 8.675958740548123e-05, + "loss": 0.285, + "step": 22020 + }, + { + "epoch": 0.9792416766680002, + "grad_norm": 0.2079363912343979, + "learning_rate": 8.674557401870129e-05, + "loss": 0.2887, + "step": 22030 + }, + { + "epoch": 0.9796861803796062, + "grad_norm": 0.2397294044494629, + "learning_rate": 8.673155435310775e-05, + "loss": 0.2826, + "step": 22040 + }, + { + "epoch": 0.9801306840912122, + "grad_norm": 0.2285705953836441, + "learning_rate": 8.671752841109617e-05, + "loss": 0.2848, + "step": 22050 + }, + { + "epoch": 0.9805751878028182, + "grad_norm": 0.2513130009174347, + "learning_rate": 8.670349619506321e-05, + "loss": 0.2847, + "step": 22060 + }, + { + "epoch": 0.9810196915144241, + "grad_norm": 0.26905110478401184, + "learning_rate": 8.66894577074066e-05, + "loss": 0.2833, + "step": 22070 + }, + { + "epoch": 0.9814641952260301, + "grad_norm": 0.20180313289165497, + "learning_rate": 8.667541295052513e-05, + "loss": 0.287, + "step": 22080 + }, + { + "epoch": 0.9819086989376361, + "grad_norm": 0.23970197141170502, + "learning_rate": 8.666136192681865e-05, + "loss": 0.281, + "step": 22090 + }, + { + "epoch": 0.9823532026492421, + "grad_norm": 0.1844562292098999, + "learning_rate": 8.664730463868811e-05, + "loss": 0.2836, + "step": 22100 + }, + { + "epoch": 0.9827977063608481, + "grad_norm": 0.23156458139419556, + "learning_rate": 8.663324108853552e-05, + "loss": 0.287, + "step": 22110 + }, + { + "epoch": 0.9832422100724542, + "grad_norm": 0.24855251610279083, + "learning_rate": 8.661917127876395e-05, + "loss": 0.2825, + "step": 22120 + }, + { + "epoch": 0.9836867137840601, + "grad_norm": 0.19501592218875885, + "learning_rate": 8.660509521177754e-05, + "loss": 0.2857, + "step": 22130 + }, + { + "epoch": 0.9841312174956661, + "grad_norm": 0.18647098541259766, + "learning_rate": 8.65910128899815e-05, + "loss": 0.2863, + "step": 22140 + }, + { + "epoch": 0.9845757212072721, + "grad_norm": 0.22549119591712952, + "learning_rate": 8.657692431578214e-05, + "loss": 0.286, + "step": 22150 + }, + { + "epoch": 0.9850202249188781, + "grad_norm": 0.19751624763011932, + "learning_rate": 8.656282949158679e-05, + "loss": 0.2849, + "step": 22160 + }, + { + "epoch": 0.9854647286304841, + "grad_norm": 0.20044152438640594, + "learning_rate": 8.654872841980388e-05, + "loss": 0.2874, + "step": 22170 + }, + { + "epoch": 0.98590923234209, + "grad_norm": 0.22029973566532135, + "learning_rate": 8.653462110284289e-05, + "loss": 0.2847, + "step": 22180 + }, + { + "epoch": 0.986353736053696, + "grad_norm": 0.21844220161437988, + "learning_rate": 8.652050754311437e-05, + "loss": 0.285, + "step": 22190 + }, + { + "epoch": 0.986798239765302, + "grad_norm": 0.17848628759384155, + "learning_rate": 8.650638774302995e-05, + "loss": 0.2837, + "step": 22200 + }, + { + "epoch": 0.987242743476908, + "grad_norm": 0.1939978003501892, + "learning_rate": 8.649226170500233e-05, + "loss": 0.2847, + "step": 22210 + }, + { + "epoch": 0.987687247188514, + "grad_norm": 0.19727306067943573, + "learning_rate": 8.647812943144524e-05, + "loss": 0.2834, + "step": 22220 + }, + { + "epoch": 0.9881317509001201, + "grad_norm": 0.2248048633337021, + "learning_rate": 8.646399092477351e-05, + "loss": 0.2858, + "step": 22230 + }, + { + "epoch": 0.988576254611726, + "grad_norm": 0.2486162632703781, + "learning_rate": 8.644984618740301e-05, + "loss": 0.2877, + "step": 22240 + }, + { + "epoch": 0.989020758323332, + "grad_norm": 0.2357853353023529, + "learning_rate": 8.643569522175073e-05, + "loss": 0.2853, + "step": 22250 + }, + { + "epoch": 0.989465262034938, + "grad_norm": 0.20718765258789062, + "learning_rate": 8.642153803023463e-05, + "loss": 0.2871, + "step": 22260 + }, + { + "epoch": 0.989909765746544, + "grad_norm": 0.245790034532547, + "learning_rate": 8.640737461527383e-05, + "loss": 0.2845, + "step": 22270 + }, + { + "epoch": 0.99035426945815, + "grad_norm": 0.23740999400615692, + "learning_rate": 8.639320497928845e-05, + "loss": 0.283, + "step": 22280 + }, + { + "epoch": 0.990798773169756, + "grad_norm": 0.22170059382915497, + "learning_rate": 8.637902912469969e-05, + "loss": 0.2841, + "step": 22290 + }, + { + "epoch": 0.9912432768813619, + "grad_norm": 0.251005083322525, + "learning_rate": 8.636484705392982e-05, + "loss": 0.2857, + "step": 22300 + }, + { + "epoch": 0.9916877805929679, + "grad_norm": 0.23998285830020905, + "learning_rate": 8.635065876940216e-05, + "loss": 0.2834, + "step": 22310 + }, + { + "epoch": 0.9921322843045739, + "grad_norm": 0.19140039384365082, + "learning_rate": 8.633646427354112e-05, + "loss": 0.291, + "step": 22320 + }, + { + "epoch": 0.99257678801618, + "grad_norm": 0.2125614434480667, + "learning_rate": 8.632226356877213e-05, + "loss": 0.2848, + "step": 22330 + }, + { + "epoch": 0.993021291727786, + "grad_norm": 0.19860072433948517, + "learning_rate": 8.630805665752173e-05, + "loss": 0.2881, + "step": 22340 + }, + { + "epoch": 0.993465795439392, + "grad_norm": 0.17292962968349457, + "learning_rate": 8.629384354221748e-05, + "loss": 0.2859, + "step": 22350 + }, + { + "epoch": 0.9939102991509979, + "grad_norm": 0.2270716428756714, + "learning_rate": 8.627962422528797e-05, + "loss": 0.2841, + "step": 22360 + }, + { + "epoch": 0.9943548028626039, + "grad_norm": 0.2340828776359558, + "learning_rate": 8.626539870916296e-05, + "loss": 0.2883, + "step": 22370 + }, + { + "epoch": 0.9947993065742099, + "grad_norm": 0.20040130615234375, + "learning_rate": 8.625116699627317e-05, + "loss": 0.2851, + "step": 22380 + }, + { + "epoch": 0.9952438102858159, + "grad_norm": 0.21415820717811584, + "learning_rate": 8.623692908905041e-05, + "loss": 0.285, + "step": 22390 + }, + { + "epoch": 0.9956883139974219, + "grad_norm": 0.19951145350933075, + "learning_rate": 8.622268498992755e-05, + "loss": 0.2882, + "step": 22400 + }, + { + "epoch": 0.9961328177090278, + "grad_norm": 0.1977194994688034, + "learning_rate": 8.620843470133851e-05, + "loss": 0.2857, + "step": 22410 + }, + { + "epoch": 0.9965773214206338, + "grad_norm": 0.1968337446451187, + "learning_rate": 8.619417822571829e-05, + "loss": 0.2873, + "step": 22420 + }, + { + "epoch": 0.9970218251322398, + "grad_norm": 0.21536047756671906, + "learning_rate": 8.617991556550292e-05, + "loss": 0.2882, + "step": 22430 + }, + { + "epoch": 0.9974663288438459, + "grad_norm": 0.21004442870616913, + "learning_rate": 8.616564672312952e-05, + "loss": 0.2893, + "step": 22440 + }, + { + "epoch": 0.9979108325554519, + "grad_norm": 0.21463918685913086, + "learning_rate": 8.61513717010362e-05, + "loss": 0.2874, + "step": 22450 + }, + { + "epoch": 0.9983553362670579, + "grad_norm": 0.20234911143779755, + "learning_rate": 8.613709050166221e-05, + "loss": 0.2874, + "step": 22460 + }, + { + "epoch": 0.9987998399786638, + "grad_norm": 0.20822583138942719, + "learning_rate": 8.61228031274478e-05, + "loss": 0.2858, + "step": 22470 + }, + { + "epoch": 0.9992443436902698, + "grad_norm": 0.2291851043701172, + "learning_rate": 8.610850958083431e-05, + "loss": 0.2857, + "step": 22480 + }, + { + "epoch": 0.9996888474018758, + "grad_norm": 0.22140301764011383, + "learning_rate": 8.609420986426409e-05, + "loss": 0.2871, + "step": 22490 + }, + { + "epoch": 1.0001333511134818, + "grad_norm": 0.21998822689056396, + "learning_rate": 8.60799039801806e-05, + "loss": 0.2847, + "step": 22500 + }, + { + "epoch": 1.0005778548250879, + "grad_norm": 0.2057441771030426, + "learning_rate": 8.606559193102828e-05, + "loss": 0.2842, + "step": 22510 + }, + { + "epoch": 1.0010223585366937, + "grad_norm": 0.24380086362361908, + "learning_rate": 8.605127371925273e-05, + "loss": 0.2839, + "step": 22520 + }, + { + "epoch": 1.0014668622482998, + "grad_norm": 0.2164497673511505, + "learning_rate": 8.603694934730047e-05, + "loss": 0.2852, + "step": 22530 + }, + { + "epoch": 1.0019113659599057, + "grad_norm": 0.25831329822540283, + "learning_rate": 8.602261881761919e-05, + "loss": 0.2819, + "step": 22540 + }, + { + "epoch": 1.0023558696715118, + "grad_norm": 0.23950448632240295, + "learning_rate": 8.600828213265759e-05, + "loss": 0.2825, + "step": 22550 + }, + { + "epoch": 1.0028003733831177, + "grad_norm": 0.2554028332233429, + "learning_rate": 8.599393929486539e-05, + "loss": 0.2838, + "step": 22560 + }, + { + "epoch": 1.0032448770947238, + "grad_norm": 0.203484907746315, + "learning_rate": 8.59795903066934e-05, + "loss": 0.2843, + "step": 22570 + }, + { + "epoch": 1.0036893808063296, + "grad_norm": 0.20317958295345306, + "learning_rate": 8.596523517059347e-05, + "loss": 0.2836, + "step": 22580 + }, + { + "epoch": 1.0041338845179357, + "grad_norm": 0.24887917935848236, + "learning_rate": 8.59508738890185e-05, + "loss": 0.2839, + "step": 22590 + }, + { + "epoch": 1.0045783882295418, + "grad_norm": 0.35537368059158325, + "learning_rate": 8.593650646442246e-05, + "loss": 0.2854, + "step": 22600 + }, + { + "epoch": 1.0050228919411477, + "grad_norm": 0.23055174946784973, + "learning_rate": 8.59221328992603e-05, + "loss": 0.2851, + "step": 22610 + }, + { + "epoch": 1.0054673956527538, + "grad_norm": 0.22832007706165314, + "learning_rate": 8.590775319598813e-05, + "loss": 0.2866, + "step": 22620 + }, + { + "epoch": 1.0059118993643597, + "grad_norm": 0.25105980038642883, + "learning_rate": 8.589336735706301e-05, + "loss": 0.2835, + "step": 22630 + }, + { + "epoch": 1.0063564030759657, + "grad_norm": 0.25318643450737, + "learning_rate": 8.587897538494307e-05, + "loss": 0.2877, + "step": 22640 + }, + { + "epoch": 1.0068009067875716, + "grad_norm": 0.23327985405921936, + "learning_rate": 8.586457728208756e-05, + "loss": 0.2875, + "step": 22650 + }, + { + "epoch": 1.0072454104991777, + "grad_norm": 0.1909414529800415, + "learning_rate": 8.585017305095667e-05, + "loss": 0.2851, + "step": 22660 + }, + { + "epoch": 1.0076899142107836, + "grad_norm": 0.15066620707511902, + "learning_rate": 8.583576269401173e-05, + "loss": 0.2841, + "step": 22670 + }, + { + "epoch": 1.0081344179223897, + "grad_norm": 0.22967170178890228, + "learning_rate": 8.582134621371504e-05, + "loss": 0.2879, + "step": 22680 + }, + { + "epoch": 1.0085789216339955, + "grad_norm": 0.22010201215744019, + "learning_rate": 8.580692361253e-05, + "loss": 0.2835, + "step": 22690 + }, + { + "epoch": 1.0090234253456016, + "grad_norm": 0.25801241397857666, + "learning_rate": 8.579249489292104e-05, + "loss": 0.2855, + "step": 22700 + }, + { + "epoch": 1.0094679290572077, + "grad_norm": 0.21460333466529846, + "learning_rate": 8.577806005735363e-05, + "loss": 0.289, + "step": 22710 + }, + { + "epoch": 1.0099124327688136, + "grad_norm": 0.1808026134967804, + "learning_rate": 8.576361910829429e-05, + "loss": 0.2864, + "step": 22720 + }, + { + "epoch": 1.0103569364804197, + "grad_norm": 0.20836983621120453, + "learning_rate": 8.574917204821057e-05, + "loss": 0.2861, + "step": 22730 + }, + { + "epoch": 1.0108014401920256, + "grad_norm": 0.19106151163578033, + "learning_rate": 8.57347188795711e-05, + "loss": 0.288, + "step": 22740 + }, + { + "epoch": 1.0112459439036316, + "grad_norm": 0.2121751457452774, + "learning_rate": 8.572025960484551e-05, + "loss": 0.2866, + "step": 22750 + }, + { + "epoch": 1.0116904476152375, + "grad_norm": 0.2210530787706375, + "learning_rate": 8.57057942265045e-05, + "loss": 0.2861, + "step": 22760 + }, + { + "epoch": 1.0121349513268436, + "grad_norm": 0.22036881744861603, + "learning_rate": 8.569132274701984e-05, + "loss": 0.2867, + "step": 22770 + }, + { + "epoch": 1.0125794550384495, + "grad_norm": 0.2240409404039383, + "learning_rate": 8.567684516886427e-05, + "loss": 0.2844, + "step": 22780 + }, + { + "epoch": 1.0130239587500556, + "grad_norm": 0.19729487597942352, + "learning_rate": 8.56623614945116e-05, + "loss": 0.2831, + "step": 22790 + }, + { + "epoch": 1.0134684624616614, + "grad_norm": 0.22171862423419952, + "learning_rate": 8.564787172643675e-05, + "loss": 0.2885, + "step": 22800 + }, + { + "epoch": 1.0139129661732675, + "grad_norm": 0.19485469162464142, + "learning_rate": 8.563337586711559e-05, + "loss": 0.2847, + "step": 22810 + }, + { + "epoch": 1.0143574698848736, + "grad_norm": 0.21540407836437225, + "learning_rate": 8.561887391902506e-05, + "loss": 0.2809, + "step": 22820 + }, + { + "epoch": 1.0148019735964795, + "grad_norm": 0.21858492493629456, + "learning_rate": 8.560436588464316e-05, + "loss": 0.2856, + "step": 22830 + }, + { + "epoch": 1.0152464773080856, + "grad_norm": 0.2021738588809967, + "learning_rate": 8.55898517664489e-05, + "loss": 0.2868, + "step": 22840 + }, + { + "epoch": 1.0156909810196915, + "grad_norm": 0.20373794436454773, + "learning_rate": 8.557533156692236e-05, + "loss": 0.2835, + "step": 22850 + }, + { + "epoch": 1.0161354847312976, + "grad_norm": 0.17993588745594025, + "learning_rate": 8.556080528854467e-05, + "loss": 0.283, + "step": 22860 + }, + { + "epoch": 1.0165799884429034, + "grad_norm": 0.18273459374904633, + "learning_rate": 8.554627293379791e-05, + "loss": 0.2843, + "step": 22870 + }, + { + "epoch": 1.0170244921545095, + "grad_norm": 0.19930361211299896, + "learning_rate": 8.553173450516531e-05, + "loss": 0.2862, + "step": 22880 + }, + { + "epoch": 1.0174689958661154, + "grad_norm": 0.18726754188537598, + "learning_rate": 8.551719000513108e-05, + "loss": 0.2873, + "step": 22890 + }, + { + "epoch": 1.0179134995777215, + "grad_norm": 0.2464345544576645, + "learning_rate": 8.550263943618049e-05, + "loss": 0.2832, + "step": 22900 + }, + { + "epoch": 1.0183580032893276, + "grad_norm": 0.22199828922748566, + "learning_rate": 8.54880828007998e-05, + "loss": 0.2858, + "step": 22910 + }, + { + "epoch": 1.0188025070009334, + "grad_norm": 0.21015194058418274, + "learning_rate": 8.547352010147637e-05, + "loss": 0.2851, + "step": 22920 + }, + { + "epoch": 1.0192470107125395, + "grad_norm": 0.22019918262958527, + "learning_rate": 8.545895134069855e-05, + "loss": 0.2836, + "step": 22930 + }, + { + "epoch": 1.0196915144241454, + "grad_norm": 0.2338276505470276, + "learning_rate": 8.544437652095576e-05, + "loss": 0.284, + "step": 22940 + }, + { + "epoch": 1.0201360181357515, + "grad_norm": 0.237715944647789, + "learning_rate": 8.542979564473843e-05, + "loss": 0.2867, + "step": 22950 + }, + { + "epoch": 1.0205805218473574, + "grad_norm": 0.198927104473114, + "learning_rate": 8.541520871453802e-05, + "loss": 0.2879, + "step": 22960 + }, + { + "epoch": 1.0210250255589635, + "grad_norm": 0.18723343312740326, + "learning_rate": 8.540061573284705e-05, + "loss": 0.2845, + "step": 22970 + }, + { + "epoch": 1.0214695292705693, + "grad_norm": 0.1830514818429947, + "learning_rate": 8.538601670215906e-05, + "loss": 0.2837, + "step": 22980 + }, + { + "epoch": 1.0219140329821754, + "grad_norm": 0.21825067698955536, + "learning_rate": 8.537141162496864e-05, + "loss": 0.2834, + "step": 22990 + }, + { + "epoch": 1.0223585366937813, + "grad_norm": 0.23563195765018463, + "learning_rate": 8.535680050377137e-05, + "loss": 0.2858, + "step": 23000 + }, + { + "epoch": 1.0228030404053874, + "grad_norm": 0.20014141499996185, + "learning_rate": 8.534218334106391e-05, + "loss": 0.2848, + "step": 23010 + }, + { + "epoch": 1.0232475441169935, + "grad_norm": 0.1776667982339859, + "learning_rate": 8.532756013934393e-05, + "loss": 0.2851, + "step": 23020 + }, + { + "epoch": 1.0236920478285993, + "grad_norm": 0.20681698620319366, + "learning_rate": 8.531293090111012e-05, + "loss": 0.2848, + "step": 23030 + }, + { + "epoch": 1.0241365515402054, + "grad_norm": 0.21014995872974396, + "learning_rate": 8.529829562886225e-05, + "loss": 0.2818, + "step": 23040 + }, + { + "epoch": 1.0245810552518113, + "grad_norm": 0.20087607204914093, + "learning_rate": 8.528365432510105e-05, + "loss": 0.2854, + "step": 23050 + }, + { + "epoch": 1.0250255589634174, + "grad_norm": 0.17493745684623718, + "learning_rate": 8.526900699232833e-05, + "loss": 0.2848, + "step": 23060 + }, + { + "epoch": 1.0254700626750233, + "grad_norm": 0.2005140483379364, + "learning_rate": 8.525435363304695e-05, + "loss": 0.2871, + "step": 23070 + }, + { + "epoch": 1.0259145663866294, + "grad_norm": 0.19307619333267212, + "learning_rate": 8.523969424976072e-05, + "loss": 0.2818, + "step": 23080 + }, + { + "epoch": 1.0263590700982352, + "grad_norm": 0.24825409054756165, + "learning_rate": 8.522502884497457e-05, + "loss": 0.2808, + "step": 23090 + }, + { + "epoch": 1.0268035738098413, + "grad_norm": 0.2150994837284088, + "learning_rate": 8.521035742119437e-05, + "loss": 0.2849, + "step": 23100 + }, + { + "epoch": 1.0272480775214472, + "grad_norm": 0.21042293310165405, + "learning_rate": 8.519567998092712e-05, + "loss": 0.2862, + "step": 23110 + }, + { + "epoch": 1.0276925812330533, + "grad_norm": 0.22569799423217773, + "learning_rate": 8.518099652668075e-05, + "loss": 0.2839, + "step": 23120 + }, + { + "epoch": 1.0281370849446594, + "grad_norm": 0.19646969437599182, + "learning_rate": 8.516630706096429e-05, + "loss": 0.2826, + "step": 23130 + }, + { + "epoch": 1.0285815886562653, + "grad_norm": 0.18817059695720673, + "learning_rate": 8.515161158628773e-05, + "loss": 0.2852, + "step": 23140 + }, + { + "epoch": 1.0290260923678713, + "grad_norm": 0.19851480424404144, + "learning_rate": 8.513691010516216e-05, + "loss": 0.2842, + "step": 23150 + }, + { + "epoch": 1.0294705960794772, + "grad_norm": 0.25522705912590027, + "learning_rate": 8.512220262009966e-05, + "loss": 0.2849, + "step": 23160 + }, + { + "epoch": 1.0299150997910833, + "grad_norm": 0.21209797263145447, + "learning_rate": 8.510748913361332e-05, + "loss": 0.2847, + "step": 23170 + }, + { + "epoch": 1.0303596035026892, + "grad_norm": 0.1845623254776001, + "learning_rate": 8.509276964821726e-05, + "loss": 0.2833, + "step": 23180 + }, + { + "epoch": 1.0308041072142953, + "grad_norm": 0.2480151653289795, + "learning_rate": 8.507804416642669e-05, + "loss": 0.2818, + "step": 23190 + }, + { + "epoch": 1.0312486109259011, + "grad_norm": 0.21553418040275574, + "learning_rate": 8.506331269075774e-05, + "loss": 0.284, + "step": 23200 + }, + { + "epoch": 1.0316931146375072, + "grad_norm": 0.22711710631847382, + "learning_rate": 8.504857522372765e-05, + "loss": 0.2876, + "step": 23210 + }, + { + "epoch": 1.0321376183491133, + "grad_norm": 0.17455615103244781, + "learning_rate": 8.503383176785461e-05, + "loss": 0.2856, + "step": 23220 + }, + { + "epoch": 1.0325821220607192, + "grad_norm": 0.1891695111989975, + "learning_rate": 8.501908232565792e-05, + "loss": 0.2865, + "step": 23230 + }, + { + "epoch": 1.0330266257723253, + "grad_norm": 0.2186918705701828, + "learning_rate": 8.50043268996578e-05, + "loss": 0.2867, + "step": 23240 + }, + { + "epoch": 1.0334711294839312, + "grad_norm": 0.2389574944972992, + "learning_rate": 8.498956549237562e-05, + "loss": 0.2843, + "step": 23250 + }, + { + "epoch": 1.0339156331955373, + "grad_norm": 0.20914609730243683, + "learning_rate": 8.497479810633366e-05, + "loss": 0.2814, + "step": 23260 + }, + { + "epoch": 1.0343601369071431, + "grad_norm": 0.205342099070549, + "learning_rate": 8.496002474405525e-05, + "loss": 0.2842, + "step": 23270 + }, + { + "epoch": 1.0348046406187492, + "grad_norm": 0.2072753757238388, + "learning_rate": 8.494524540806478e-05, + "loss": 0.2813, + "step": 23280 + }, + { + "epoch": 1.035249144330355, + "grad_norm": 0.19118621945381165, + "learning_rate": 8.493046010088761e-05, + "loss": 0.2854, + "step": 23290 + }, + { + "epoch": 1.0356936480419612, + "grad_norm": 0.21872453391551971, + "learning_rate": 8.491566882505018e-05, + "loss": 0.2863, + "step": 23300 + }, + { + "epoch": 1.036138151753567, + "grad_norm": 0.19946610927581787, + "learning_rate": 8.490087158307988e-05, + "loss": 0.2844, + "step": 23310 + }, + { + "epoch": 1.0365826554651731, + "grad_norm": 0.21322979032993317, + "learning_rate": 8.488606837750518e-05, + "loss": 0.2861, + "step": 23320 + }, + { + "epoch": 1.037027159176779, + "grad_norm": 0.23448218405246735, + "learning_rate": 8.487125921085552e-05, + "loss": 0.2834, + "step": 23330 + }, + { + "epoch": 1.037471662888385, + "grad_norm": 0.19710741937160492, + "learning_rate": 8.485644408566141e-05, + "loss": 0.2828, + "step": 23340 + }, + { + "epoch": 1.0379161665999912, + "grad_norm": 0.2267640084028244, + "learning_rate": 8.484162300445431e-05, + "loss": 0.2808, + "step": 23350 + }, + { + "epoch": 1.038360670311597, + "grad_norm": 0.22436381876468658, + "learning_rate": 8.482679596976676e-05, + "loss": 0.2844, + "step": 23360 + }, + { + "epoch": 1.0388051740232032, + "grad_norm": 0.20095759630203247, + "learning_rate": 8.48119629841323e-05, + "loss": 0.2815, + "step": 23370 + }, + { + "epoch": 1.039249677734809, + "grad_norm": 0.2260611355304718, + "learning_rate": 8.479712405008547e-05, + "loss": 0.2828, + "step": 23380 + }, + { + "epoch": 1.0396941814464151, + "grad_norm": 0.21784450113773346, + "learning_rate": 8.478227917016184e-05, + "loss": 0.286, + "step": 23390 + }, + { + "epoch": 1.040138685158021, + "grad_norm": 0.20258423686027527, + "learning_rate": 8.476742834689801e-05, + "loss": 0.2853, + "step": 23400 + }, + { + "epoch": 1.040583188869627, + "grad_norm": 0.19038112461566925, + "learning_rate": 8.475257158283157e-05, + "loss": 0.2827, + "step": 23410 + }, + { + "epoch": 1.041027692581233, + "grad_norm": 0.1697458028793335, + "learning_rate": 8.473770888050112e-05, + "loss": 0.2857, + "step": 23420 + }, + { + "epoch": 1.041472196292839, + "grad_norm": 0.20912186801433563, + "learning_rate": 8.47228402424463e-05, + "loss": 0.2848, + "step": 23430 + }, + { + "epoch": 1.0419167000044451, + "grad_norm": 0.21870794892311096, + "learning_rate": 8.470796567120775e-05, + "loss": 0.2847, + "step": 23440 + }, + { + "epoch": 1.042361203716051, + "grad_norm": 0.22538335621356964, + "learning_rate": 8.469308516932714e-05, + "loss": 0.2862, + "step": 23450 + }, + { + "epoch": 1.042805707427657, + "grad_norm": 0.20683416724205017, + "learning_rate": 8.467819873934714e-05, + "loss": 0.2832, + "step": 23460 + }, + { + "epoch": 1.043250211139263, + "grad_norm": 0.17886023223400116, + "learning_rate": 8.466330638381143e-05, + "loss": 0.2831, + "step": 23470 + }, + { + "epoch": 1.043694714850869, + "grad_norm": 0.22260195016860962, + "learning_rate": 8.464840810526469e-05, + "loss": 0.2833, + "step": 23480 + }, + { + "epoch": 1.044139218562475, + "grad_norm": 0.21660761535167694, + "learning_rate": 8.463350390625264e-05, + "loss": 0.2848, + "step": 23490 + }, + { + "epoch": 1.044583722274081, + "grad_norm": 0.1920333355665207, + "learning_rate": 8.4618593789322e-05, + "loss": 0.2832, + "step": 23500 + }, + { + "epoch": 1.045028225985687, + "grad_norm": 0.21309372782707214, + "learning_rate": 8.46036777570205e-05, + "loss": 0.2812, + "step": 23510 + }, + { + "epoch": 1.045472729697293, + "grad_norm": 0.21250450611114502, + "learning_rate": 8.458875581189688e-05, + "loss": 0.285, + "step": 23520 + }, + { + "epoch": 1.0459172334088989, + "grad_norm": 0.2227354347705841, + "learning_rate": 8.457382795650092e-05, + "loss": 0.2809, + "step": 23530 + }, + { + "epoch": 1.046361737120505, + "grad_norm": 0.2335045486688614, + "learning_rate": 8.455889419338335e-05, + "loss": 0.2845, + "step": 23540 + }, + { + "epoch": 1.046806240832111, + "grad_norm": 0.2362784594297409, + "learning_rate": 8.454395452509593e-05, + "loss": 0.2853, + "step": 23550 + }, + { + "epoch": 1.047250744543717, + "grad_norm": 0.2011265903711319, + "learning_rate": 8.452900895419146e-05, + "loss": 0.2851, + "step": 23560 + }, + { + "epoch": 1.047695248255323, + "grad_norm": 0.2087012678384781, + "learning_rate": 8.451405748322376e-05, + "loss": 0.2848, + "step": 23570 + }, + { + "epoch": 1.0481397519669289, + "grad_norm": 0.24185892939567566, + "learning_rate": 8.449910011474759e-05, + "loss": 0.2851, + "step": 23580 + }, + { + "epoch": 1.048584255678535, + "grad_norm": 0.24146398901939392, + "learning_rate": 8.448413685131876e-05, + "loss": 0.2863, + "step": 23590 + }, + { + "epoch": 1.0490287593901408, + "grad_norm": 0.23612000048160553, + "learning_rate": 8.446916769549407e-05, + "loss": 0.2826, + "step": 23600 + }, + { + "epoch": 1.049473263101747, + "grad_norm": 0.22960597276687622, + "learning_rate": 8.445419264983136e-05, + "loss": 0.2816, + "step": 23610 + }, + { + "epoch": 1.0499177668133528, + "grad_norm": 0.2449428290128708, + "learning_rate": 8.443921171688947e-05, + "loss": 0.2835, + "step": 23620 + }, + { + "epoch": 1.050362270524959, + "grad_norm": 0.17391395568847656, + "learning_rate": 8.442422489922819e-05, + "loss": 0.2857, + "step": 23630 + }, + { + "epoch": 1.0508067742365648, + "grad_norm": 0.20399849116802216, + "learning_rate": 8.440923219940838e-05, + "loss": 0.2875, + "step": 23640 + }, + { + "epoch": 1.0512512779481709, + "grad_norm": 0.1859460324048996, + "learning_rate": 8.439423361999189e-05, + "loss": 0.2821, + "step": 23650 + }, + { + "epoch": 1.051695781659777, + "grad_norm": 0.20821607112884521, + "learning_rate": 8.437922916354155e-05, + "loss": 0.282, + "step": 23660 + }, + { + "epoch": 1.0521402853713828, + "grad_norm": 0.20332646369934082, + "learning_rate": 8.436421883262123e-05, + "loss": 0.2853, + "step": 23670 + }, + { + "epoch": 1.052584789082989, + "grad_norm": 0.21223735809326172, + "learning_rate": 8.434920262979577e-05, + "loss": 0.2864, + "step": 23680 + }, + { + "epoch": 1.0530292927945948, + "grad_norm": 0.20064623653888702, + "learning_rate": 8.433418055763104e-05, + "loss": 0.2841, + "step": 23690 + }, + { + "epoch": 1.0534737965062009, + "grad_norm": 0.21178758144378662, + "learning_rate": 8.431915261869389e-05, + "loss": 0.2835, + "step": 23700 + }, + { + "epoch": 1.0539183002178067, + "grad_norm": 0.2036750763654709, + "learning_rate": 8.43041188155522e-05, + "loss": 0.2833, + "step": 23710 + }, + { + "epoch": 1.0543628039294128, + "grad_norm": 0.18127568066120148, + "learning_rate": 8.428907915077481e-05, + "loss": 0.2825, + "step": 23720 + }, + { + "epoch": 1.0548073076410187, + "grad_norm": 0.19205397367477417, + "learning_rate": 8.42740336269316e-05, + "loss": 0.2861, + "step": 23730 + }, + { + "epoch": 1.0552518113526248, + "grad_norm": 0.20154695212841034, + "learning_rate": 8.425898224659345e-05, + "loss": 0.2847, + "step": 23740 + }, + { + "epoch": 1.055696315064231, + "grad_norm": 0.19208258390426636, + "learning_rate": 8.42439250123322e-05, + "loss": 0.2861, + "step": 23750 + }, + { + "epoch": 1.0561408187758368, + "grad_norm": 0.16230769455432892, + "learning_rate": 8.422886192672076e-05, + "loss": 0.2835, + "step": 23760 + }, + { + "epoch": 1.0565853224874429, + "grad_norm": 0.18834057450294495, + "learning_rate": 8.421379299233297e-05, + "loss": 0.2848, + "step": 23770 + }, + { + "epoch": 1.0570298261990487, + "grad_norm": 0.18481503427028656, + "learning_rate": 8.419871821174371e-05, + "loss": 0.2853, + "step": 23780 + }, + { + "epoch": 1.0574743299106548, + "grad_norm": 0.22303898632526398, + "learning_rate": 8.418363758752884e-05, + "loss": 0.2848, + "step": 23790 + }, + { + "epoch": 1.0579188336222607, + "grad_norm": 0.21488060057163239, + "learning_rate": 8.416855112226523e-05, + "loss": 0.2803, + "step": 23800 + }, + { + "epoch": 1.0583633373338668, + "grad_norm": 0.19223076105117798, + "learning_rate": 8.415345881853075e-05, + "loss": 0.2839, + "step": 23810 + }, + { + "epoch": 1.0588078410454727, + "grad_norm": 0.2338123768568039, + "learning_rate": 8.413836067890426e-05, + "loss": 0.2814, + "step": 23820 + }, + { + "epoch": 1.0592523447570787, + "grad_norm": 0.21219009160995483, + "learning_rate": 8.41232567059656e-05, + "loss": 0.2854, + "step": 23830 + }, + { + "epoch": 1.0596968484686846, + "grad_norm": 0.20600619912147522, + "learning_rate": 8.410814690229565e-05, + "loss": 0.2863, + "step": 23840 + }, + { + "epoch": 1.0601413521802907, + "grad_norm": 0.22889642417430878, + "learning_rate": 8.409303127047626e-05, + "loss": 0.2862, + "step": 23850 + }, + { + "epoch": 1.0605858558918968, + "grad_norm": 0.2452101707458496, + "learning_rate": 8.407790981309028e-05, + "loss": 0.2839, + "step": 23860 + }, + { + "epoch": 1.0610303596035027, + "grad_norm": 0.22179967164993286, + "learning_rate": 8.406278253272153e-05, + "loss": 0.2874, + "step": 23870 + }, + { + "epoch": 1.0614748633151088, + "grad_norm": 0.19727589190006256, + "learning_rate": 8.404764943195487e-05, + "loss": 0.2874, + "step": 23880 + }, + { + "epoch": 1.0619193670267146, + "grad_norm": 0.18939557671546936, + "learning_rate": 8.403251051337613e-05, + "loss": 0.286, + "step": 23890 + }, + { + "epoch": 1.0623638707383207, + "grad_norm": 0.19192266464233398, + "learning_rate": 8.401736577957214e-05, + "loss": 0.2828, + "step": 23900 + }, + { + "epoch": 1.0628083744499266, + "grad_norm": 0.20901615917682648, + "learning_rate": 8.40022152331307e-05, + "loss": 0.2834, + "step": 23910 + }, + { + "epoch": 1.0632528781615327, + "grad_norm": 0.2246548980474472, + "learning_rate": 8.398705887664064e-05, + "loss": 0.2848, + "step": 23920 + }, + { + "epoch": 1.0636973818731386, + "grad_norm": 0.23318064212799072, + "learning_rate": 8.397189671269177e-05, + "loss": 0.2846, + "step": 23930 + }, + { + "epoch": 1.0641418855847447, + "grad_norm": 0.19296708703041077, + "learning_rate": 8.395672874387488e-05, + "loss": 0.2828, + "step": 23940 + }, + { + "epoch": 1.0645863892963505, + "grad_norm": 0.1832272857427597, + "learning_rate": 8.394155497278177e-05, + "loss": 0.2837, + "step": 23950 + }, + { + "epoch": 1.0650308930079566, + "grad_norm": 0.16908299922943115, + "learning_rate": 8.392637540200523e-05, + "loss": 0.2811, + "step": 23960 + }, + { + "epoch": 1.0654753967195627, + "grad_norm": 0.19289059937000275, + "learning_rate": 8.391119003413902e-05, + "loss": 0.2806, + "step": 23970 + }, + { + "epoch": 1.0659199004311686, + "grad_norm": 0.21767519414424896, + "learning_rate": 8.38959988717779e-05, + "loss": 0.2813, + "step": 23980 + }, + { + "epoch": 1.0663644041427747, + "grad_norm": 0.22379960119724274, + "learning_rate": 8.388080191751764e-05, + "loss": 0.2851, + "step": 23990 + }, + { + "epoch": 1.0668089078543805, + "grad_norm": 0.19896985590457916, + "learning_rate": 8.386559917395496e-05, + "loss": 0.2839, + "step": 24000 + }, + { + "epoch": 1.0672534115659866, + "grad_norm": 0.18830733001232147, + "learning_rate": 8.385039064368761e-05, + "loss": 0.2866, + "step": 24010 + }, + { + "epoch": 1.0676979152775925, + "grad_norm": 0.258247047662735, + "learning_rate": 8.383517632931431e-05, + "loss": 0.2861, + "step": 24020 + }, + { + "epoch": 1.0681424189891986, + "grad_norm": 0.19199760258197784, + "learning_rate": 8.381995623343477e-05, + "loss": 0.2783, + "step": 24030 + }, + { + "epoch": 1.0685869227008045, + "grad_norm": 0.19611351191997528, + "learning_rate": 8.380473035864968e-05, + "loss": 0.2843, + "step": 24040 + }, + { + "epoch": 1.0690314264124106, + "grad_norm": 0.20180009305477142, + "learning_rate": 8.378949870756076e-05, + "loss": 0.2818, + "step": 24050 + }, + { + "epoch": 1.0694759301240166, + "grad_norm": 0.19926118850708008, + "learning_rate": 8.377426128277063e-05, + "loss": 0.2866, + "step": 24060 + }, + { + "epoch": 1.0699204338356225, + "grad_norm": 0.20527419447898865, + "learning_rate": 8.375901808688298e-05, + "loss": 0.2812, + "step": 24070 + }, + { + "epoch": 1.0703649375472286, + "grad_norm": 0.23668238520622253, + "learning_rate": 8.374376912250246e-05, + "loss": 0.284, + "step": 24080 + }, + { + "epoch": 1.0708094412588345, + "grad_norm": 0.23733489215373993, + "learning_rate": 8.372851439223468e-05, + "loss": 0.2851, + "step": 24090 + }, + { + "epoch": 1.0712539449704406, + "grad_norm": 0.21526110172271729, + "learning_rate": 8.371325389868627e-05, + "loss": 0.2822, + "step": 24100 + }, + { + "epoch": 1.0716984486820464, + "grad_norm": 0.21002097427845, + "learning_rate": 8.369798764446482e-05, + "loss": 0.2857, + "step": 24110 + }, + { + "epoch": 1.0721429523936525, + "grad_norm": 0.19558078050613403, + "learning_rate": 8.368271563217893e-05, + "loss": 0.2831, + "step": 24120 + }, + { + "epoch": 1.0725874561052584, + "grad_norm": 0.2044730931520462, + "learning_rate": 8.366743786443817e-05, + "loss": 0.2837, + "step": 24130 + }, + { + "epoch": 1.0730319598168645, + "grad_norm": 0.18502755463123322, + "learning_rate": 8.365215434385309e-05, + "loss": 0.2809, + "step": 24140 + }, + { + "epoch": 1.0734764635284704, + "grad_norm": 0.17627067863941193, + "learning_rate": 8.36368650730352e-05, + "loss": 0.2843, + "step": 24150 + }, + { + "epoch": 1.0739209672400765, + "grad_norm": 0.21553488075733185, + "learning_rate": 8.362157005459705e-05, + "loss": 0.2854, + "step": 24160 + }, + { + "epoch": 1.0743654709516823, + "grad_norm": 0.19362612068653107, + "learning_rate": 8.360626929115213e-05, + "loss": 0.2838, + "step": 24170 + }, + { + "epoch": 1.0748099746632884, + "grad_norm": 0.18875588476657867, + "learning_rate": 8.359096278531492e-05, + "loss": 0.2861, + "step": 24180 + }, + { + "epoch": 1.0752544783748945, + "grad_norm": 0.2504798173904419, + "learning_rate": 8.357565053970088e-05, + "loss": 0.2863, + "step": 24190 + }, + { + "epoch": 1.0756989820865004, + "grad_norm": 0.2680591344833374, + "learning_rate": 8.356033255692647e-05, + "loss": 0.2841, + "step": 24200 + }, + { + "epoch": 1.0761434857981065, + "grad_norm": 0.2116355150938034, + "learning_rate": 8.354500883960911e-05, + "loss": 0.2843, + "step": 24210 + }, + { + "epoch": 1.0765879895097124, + "grad_norm": 0.2139105349779129, + "learning_rate": 8.352967939036717e-05, + "loss": 0.2841, + "step": 24220 + }, + { + "epoch": 1.0770324932213184, + "grad_norm": 0.19607719779014587, + "learning_rate": 8.35143442118201e-05, + "loss": 0.2849, + "step": 24230 + }, + { + "epoch": 1.0774769969329243, + "grad_norm": 0.23573242127895355, + "learning_rate": 8.349900330658819e-05, + "loss": 0.2848, + "step": 24240 + }, + { + "epoch": 1.0779215006445304, + "grad_norm": 0.2596496641635895, + "learning_rate": 8.348365667729284e-05, + "loss": 0.2846, + "step": 24250 + }, + { + "epoch": 1.0783660043561363, + "grad_norm": 0.20976798236370087, + "learning_rate": 8.346830432655633e-05, + "loss": 0.2855, + "step": 24260 + }, + { + "epoch": 1.0788105080677424, + "grad_norm": 0.1937892585992813, + "learning_rate": 8.345294625700195e-05, + "loss": 0.2809, + "step": 24270 + }, + { + "epoch": 1.0792550117793485, + "grad_norm": 0.18218715488910675, + "learning_rate": 8.343758247125402e-05, + "loss": 0.284, + "step": 24280 + }, + { + "epoch": 1.0796995154909543, + "grad_norm": 0.17896668612957, + "learning_rate": 8.342221297193776e-05, + "loss": 0.2813, + "step": 24290 + }, + { + "epoch": 1.0801440192025604, + "grad_norm": 0.19388830661773682, + "learning_rate": 8.34068377616794e-05, + "loss": 0.2849, + "step": 24300 + }, + { + "epoch": 1.0805885229141663, + "grad_norm": 0.1972825825214386, + "learning_rate": 8.339145684310615e-05, + "loss": 0.2836, + "step": 24310 + }, + { + "epoch": 1.0810330266257724, + "grad_norm": 0.19029483199119568, + "learning_rate": 8.337607021884618e-05, + "loss": 0.2828, + "step": 24320 + }, + { + "epoch": 1.0814775303373783, + "grad_norm": 0.18221141397953033, + "learning_rate": 8.336067789152867e-05, + "loss": 0.2837, + "step": 24330 + }, + { + "epoch": 1.0819220340489843, + "grad_norm": 0.20555013418197632, + "learning_rate": 8.334527986378369e-05, + "loss": 0.2832, + "step": 24340 + }, + { + "epoch": 1.0823665377605902, + "grad_norm": 0.15944549441337585, + "learning_rate": 8.332987613824239e-05, + "loss": 0.2845, + "step": 24350 + }, + { + "epoch": 1.0828110414721963, + "grad_norm": 0.18650606274604797, + "learning_rate": 8.331446671753685e-05, + "loss": 0.2863, + "step": 24360 + }, + { + "epoch": 1.0832555451838022, + "grad_norm": 0.19852150976657867, + "learning_rate": 8.329905160430007e-05, + "loss": 0.2812, + "step": 24370 + }, + { + "epoch": 1.0837000488954083, + "grad_norm": 0.21852430701255798, + "learning_rate": 8.328363080116611e-05, + "loss": 0.2833, + "step": 24380 + }, + { + "epoch": 1.0841445526070144, + "grad_norm": 0.1933426856994629, + "learning_rate": 8.326820431076997e-05, + "loss": 0.2819, + "step": 24390 + }, + { + "epoch": 1.0845890563186202, + "grad_norm": 0.20742639899253845, + "learning_rate": 8.325277213574759e-05, + "loss": 0.282, + "step": 24400 + }, + { + "epoch": 1.0850335600302263, + "grad_norm": 0.23980304598808289, + "learning_rate": 8.32373342787359e-05, + "loss": 0.2819, + "step": 24410 + }, + { + "epoch": 1.0854780637418322, + "grad_norm": 0.20916469395160675, + "learning_rate": 8.322189074237285e-05, + "loss": 0.2884, + "step": 24420 + }, + { + "epoch": 1.0859225674534383, + "grad_norm": 0.2161339372396469, + "learning_rate": 8.32064415292973e-05, + "loss": 0.2823, + "step": 24430 + }, + { + "epoch": 1.0863670711650442, + "grad_norm": 0.22798526287078857, + "learning_rate": 8.319098664214907e-05, + "loss": 0.2836, + "step": 24440 + }, + { + "epoch": 1.0868115748766503, + "grad_norm": 0.2090734988451004, + "learning_rate": 8.3175526083569e-05, + "loss": 0.2858, + "step": 24450 + }, + { + "epoch": 1.0872560785882561, + "grad_norm": 0.1963769495487213, + "learning_rate": 8.316005985619889e-05, + "loss": 0.28, + "step": 24460 + }, + { + "epoch": 1.0877005822998622, + "grad_norm": 0.21125851571559906, + "learning_rate": 8.314458796268147e-05, + "loss": 0.2853, + "step": 24470 + }, + { + "epoch": 1.088145086011468, + "grad_norm": 0.17858216166496277, + "learning_rate": 8.312911040566047e-05, + "loss": 0.2783, + "step": 24480 + }, + { + "epoch": 1.0885895897230742, + "grad_norm": 0.21174344420433044, + "learning_rate": 8.31136271877806e-05, + "loss": 0.284, + "step": 24490 + }, + { + "epoch": 1.0890340934346803, + "grad_norm": 0.2266554981470108, + "learning_rate": 8.309813831168748e-05, + "loss": 0.28, + "step": 24500 + }, + { + "epoch": 1.0894785971462861, + "grad_norm": 0.21811464428901672, + "learning_rate": 8.308264378002777e-05, + "loss": 0.2823, + "step": 24510 + }, + { + "epoch": 1.0899231008578922, + "grad_norm": 0.21845336258411407, + "learning_rate": 8.306714359544906e-05, + "loss": 0.2837, + "step": 24520 + }, + { + "epoch": 1.090367604569498, + "grad_norm": 0.19013676047325134, + "learning_rate": 8.30516377605999e-05, + "loss": 0.2818, + "step": 24530 + }, + { + "epoch": 1.0908121082811042, + "grad_norm": 0.21102501451969147, + "learning_rate": 8.30361262781298e-05, + "loss": 0.2832, + "step": 24540 + }, + { + "epoch": 1.09125661199271, + "grad_norm": 0.20286628603935242, + "learning_rate": 8.302060915068924e-05, + "loss": 0.2834, + "step": 24550 + }, + { + "epoch": 1.0917011157043162, + "grad_norm": 0.17997612059116364, + "learning_rate": 8.300508638092972e-05, + "loss": 0.282, + "step": 24560 + }, + { + "epoch": 1.092145619415922, + "grad_norm": 0.18823355436325073, + "learning_rate": 8.298955797150361e-05, + "loss": 0.2816, + "step": 24570 + }, + { + "epoch": 1.0925901231275281, + "grad_norm": 0.18787027895450592, + "learning_rate": 8.297402392506433e-05, + "loss": 0.2785, + "step": 24580 + }, + { + "epoch": 1.0930346268391342, + "grad_norm": 0.19169014692306519, + "learning_rate": 8.295848424426617e-05, + "loss": 0.283, + "step": 24590 + }, + { + "epoch": 1.09347913055074, + "grad_norm": 0.2285809963941574, + "learning_rate": 8.29429389317645e-05, + "loss": 0.2866, + "step": 24600 + }, + { + "epoch": 1.0939236342623462, + "grad_norm": 0.22454820573329926, + "learning_rate": 8.292738799021556e-05, + "loss": 0.2824, + "step": 24610 + }, + { + "epoch": 1.094368137973952, + "grad_norm": 0.17988485097885132, + "learning_rate": 8.291183142227656e-05, + "loss": 0.2816, + "step": 24620 + }, + { + "epoch": 1.0948126416855581, + "grad_norm": 0.20695005357265472, + "learning_rate": 8.289626923060572e-05, + "loss": 0.2839, + "step": 24630 + }, + { + "epoch": 1.095257145397164, + "grad_norm": 0.20229516923427582, + "learning_rate": 8.288070141786218e-05, + "loss": 0.282, + "step": 24640 + }, + { + "epoch": 1.09570164910877, + "grad_norm": 0.2059016078710556, + "learning_rate": 8.286512798670605e-05, + "loss": 0.2818, + "step": 24650 + }, + { + "epoch": 1.096146152820376, + "grad_norm": 0.22433626651763916, + "learning_rate": 8.284954893979842e-05, + "loss": 0.2853, + "step": 24660 + }, + { + "epoch": 1.096590656531982, + "grad_norm": 0.23010998964309692, + "learning_rate": 8.283396427980131e-05, + "loss": 0.2838, + "step": 24670 + }, + { + "epoch": 1.097035160243588, + "grad_norm": 0.20661884546279907, + "learning_rate": 8.281837400937771e-05, + "loss": 0.2844, + "step": 24680 + }, + { + "epoch": 1.097479663955194, + "grad_norm": 0.2561887204647064, + "learning_rate": 8.28027781311916e-05, + "loss": 0.283, + "step": 24690 + }, + { + "epoch": 1.0979241676668001, + "grad_norm": 0.2225271612405777, + "learning_rate": 8.278717664790785e-05, + "loss": 0.2819, + "step": 24700 + }, + { + "epoch": 1.098368671378406, + "grad_norm": 0.21308426558971405, + "learning_rate": 8.277156956219234e-05, + "loss": 0.2834, + "step": 24710 + }, + { + "epoch": 1.098813175090012, + "grad_norm": 0.1977514922618866, + "learning_rate": 8.275595687671189e-05, + "loss": 0.2857, + "step": 24720 + }, + { + "epoch": 1.099257678801618, + "grad_norm": 0.18373551964759827, + "learning_rate": 8.27403385941343e-05, + "loss": 0.2848, + "step": 24730 + }, + { + "epoch": 1.099702182513224, + "grad_norm": 0.16943411529064178, + "learning_rate": 8.272471471712828e-05, + "loss": 0.285, + "step": 24740 + }, + { + "epoch": 1.10014668622483, + "grad_norm": 0.22931788861751556, + "learning_rate": 8.270908524836355e-05, + "loss": 0.2862, + "step": 24750 + }, + { + "epoch": 1.100591189936436, + "grad_norm": 0.20404426753520966, + "learning_rate": 8.269345019051074e-05, + "loss": 0.2805, + "step": 24760 + }, + { + "epoch": 1.1010356936480419, + "grad_norm": 0.2111790031194687, + "learning_rate": 8.267780954624147e-05, + "loss": 0.2843, + "step": 24770 + }, + { + "epoch": 1.101480197359648, + "grad_norm": 0.20066682994365692, + "learning_rate": 8.266216331822827e-05, + "loss": 0.2835, + "step": 24780 + }, + { + "epoch": 1.1019247010712538, + "grad_norm": 0.2363877296447754, + "learning_rate": 8.264651150914469e-05, + "loss": 0.2805, + "step": 24790 + }, + { + "epoch": 1.10236920478286, + "grad_norm": 0.19468368589878082, + "learning_rate": 8.263085412166517e-05, + "loss": 0.2825, + "step": 24800 + }, + { + "epoch": 1.102813708494466, + "grad_norm": 0.1990746259689331, + "learning_rate": 8.261519115846514e-05, + "loss": 0.2837, + "step": 24810 + }, + { + "epoch": 1.103258212206072, + "grad_norm": 0.17517073452472687, + "learning_rate": 8.259952262222096e-05, + "loss": 0.2786, + "step": 24820 + }, + { + "epoch": 1.103702715917678, + "grad_norm": 0.21489574015140533, + "learning_rate": 8.258384851560997e-05, + "loss": 0.2844, + "step": 24830 + }, + { + "epoch": 1.1041472196292839, + "grad_norm": 0.17287179827690125, + "learning_rate": 8.256816884131044e-05, + "loss": 0.2786, + "step": 24840 + }, + { + "epoch": 1.10459172334089, + "grad_norm": 0.2323639839887619, + "learning_rate": 8.255248360200159e-05, + "loss": 0.2835, + "step": 24850 + }, + { + "epoch": 1.1050362270524958, + "grad_norm": 0.20994648337364197, + "learning_rate": 8.253679280036359e-05, + "loss": 0.2842, + "step": 24860 + }, + { + "epoch": 1.105480730764102, + "grad_norm": 0.1761491894721985, + "learning_rate": 8.252109643907762e-05, + "loss": 0.2822, + "step": 24870 + }, + { + "epoch": 1.1059252344757078, + "grad_norm": 0.1933155655860901, + "learning_rate": 8.250539452082569e-05, + "loss": 0.28, + "step": 24880 + }, + { + "epoch": 1.1063697381873139, + "grad_norm": 0.21769554913043976, + "learning_rate": 8.248968704829087e-05, + "loss": 0.2828, + "step": 24890 + }, + { + "epoch": 1.10681424189892, + "grad_norm": 0.231772318482399, + "learning_rate": 8.247397402415714e-05, + "loss": 0.2808, + "step": 24900 + }, + { + "epoch": 1.1072587456105258, + "grad_norm": 0.2166195958852768, + "learning_rate": 8.24582554511094e-05, + "loss": 0.2825, + "step": 24910 + }, + { + "epoch": 1.107703249322132, + "grad_norm": 0.2253277748823166, + "learning_rate": 8.244253133183355e-05, + "loss": 0.2816, + "step": 24920 + }, + { + "epoch": 1.1081477530337378, + "grad_norm": 0.19944025576114655, + "learning_rate": 8.24268016690164e-05, + "loss": 0.2792, + "step": 24930 + }, + { + "epoch": 1.108592256745344, + "grad_norm": 0.19300557672977448, + "learning_rate": 8.241106646534571e-05, + "loss": 0.2826, + "step": 24940 + }, + { + "epoch": 1.1090367604569498, + "grad_norm": 0.18468016386032104, + "learning_rate": 8.23953257235102e-05, + "loss": 0.2831, + "step": 24950 + }, + { + "epoch": 1.1094812641685559, + "grad_norm": 0.1889522820711136, + "learning_rate": 8.237957944619956e-05, + "loss": 0.2783, + "step": 24960 + }, + { + "epoch": 1.1099257678801617, + "grad_norm": 0.1894257664680481, + "learning_rate": 8.236382763610437e-05, + "loss": 0.2871, + "step": 24970 + }, + { + "epoch": 1.1103702715917678, + "grad_norm": 0.22318923473358154, + "learning_rate": 8.234807029591619e-05, + "loss": 0.2856, + "step": 24980 + }, + { + "epoch": 1.1108147753033737, + "grad_norm": 0.22112834453582764, + "learning_rate": 8.233230742832752e-05, + "loss": 0.2846, + "step": 24990 + }, + { + "epoch": 1.1112592790149798, + "grad_norm": 0.2309887558221817, + "learning_rate": 8.231653903603178e-05, + "loss": 0.2818, + "step": 25000 + }, + { + "epoch": 1.1117037827265857, + "grad_norm": 0.19830390810966492, + "learning_rate": 8.23007651217234e-05, + "loss": 0.2827, + "step": 25010 + }, + { + "epoch": 1.1121482864381917, + "grad_norm": 0.19621945917606354, + "learning_rate": 8.228498568809769e-05, + "loss": 0.2852, + "step": 25020 + }, + { + "epoch": 1.1125927901497978, + "grad_norm": 0.19520235061645508, + "learning_rate": 8.22692007378509e-05, + "loss": 0.2803, + "step": 25030 + }, + { + "epoch": 1.1130372938614037, + "grad_norm": 0.16892217099666595, + "learning_rate": 8.225341027368028e-05, + "loss": 0.2809, + "step": 25040 + }, + { + "epoch": 1.1134817975730098, + "grad_norm": 0.19282492995262146, + "learning_rate": 8.223761429828399e-05, + "loss": 0.2804, + "step": 25050 + }, + { + "epoch": 1.1139263012846157, + "grad_norm": 0.18119357526302338, + "learning_rate": 8.22218128143611e-05, + "loss": 0.2812, + "step": 25060 + }, + { + "epoch": 1.1143708049962218, + "grad_norm": 0.19613473117351532, + "learning_rate": 8.220600582461166e-05, + "loss": 0.2808, + "step": 25070 + }, + { + "epoch": 1.1148153087078276, + "grad_norm": 0.20447438955307007, + "learning_rate": 8.219019333173668e-05, + "loss": 0.2785, + "step": 25080 + }, + { + "epoch": 1.1152598124194337, + "grad_norm": 0.21009792387485504, + "learning_rate": 8.217437533843805e-05, + "loss": 0.2818, + "step": 25090 + }, + { + "epoch": 1.1157043161310396, + "grad_norm": 0.17675824463367462, + "learning_rate": 8.215855184741867e-05, + "loss": 0.2828, + "step": 25100 + }, + { + "epoch": 1.1161488198426457, + "grad_norm": 0.19638517498970032, + "learning_rate": 8.21427228613823e-05, + "loss": 0.2852, + "step": 25110 + }, + { + "epoch": 1.1165933235542518, + "grad_norm": 0.20374342799186707, + "learning_rate": 8.21268883830337e-05, + "loss": 0.2843, + "step": 25120 + }, + { + "epoch": 1.1170378272658577, + "grad_norm": 0.1641135811805725, + "learning_rate": 8.211104841507855e-05, + "loss": 0.2829, + "step": 25130 + }, + { + "epoch": 1.1174823309774637, + "grad_norm": 0.18419265747070312, + "learning_rate": 8.209520296022346e-05, + "loss": 0.2853, + "step": 25140 + }, + { + "epoch": 1.1179268346890696, + "grad_norm": 0.19337198138237, + "learning_rate": 8.207935202117599e-05, + "loss": 0.2831, + "step": 25150 + }, + { + "epoch": 1.1183713384006757, + "grad_norm": 0.2381230741739273, + "learning_rate": 8.206349560064463e-05, + "loss": 0.2838, + "step": 25160 + }, + { + "epoch": 1.1188158421122816, + "grad_norm": 0.22565412521362305, + "learning_rate": 8.204763370133881e-05, + "loss": 0.2857, + "step": 25170 + }, + { + "epoch": 1.1192603458238877, + "grad_norm": 0.1856418251991272, + "learning_rate": 8.203176632596892e-05, + "loss": 0.2844, + "step": 25180 + }, + { + "epoch": 1.1197048495354935, + "grad_norm": 0.16625335812568665, + "learning_rate": 8.20158934772462e-05, + "loss": 0.2816, + "step": 25190 + }, + { + "epoch": 1.1201493532470996, + "grad_norm": 0.2048785388469696, + "learning_rate": 8.200001515788294e-05, + "loss": 0.2828, + "step": 25200 + }, + { + "epoch": 1.1205938569587055, + "grad_norm": 0.246022030711174, + "learning_rate": 8.198413137059228e-05, + "loss": 0.2827, + "step": 25210 + }, + { + "epoch": 1.1210383606703116, + "grad_norm": 0.18542391061782837, + "learning_rate": 8.196824211808835e-05, + "loss": 0.2824, + "step": 25220 + }, + { + "epoch": 1.1214828643819177, + "grad_norm": 0.19374775886535645, + "learning_rate": 8.195234740308617e-05, + "loss": 0.2831, + "step": 25230 + }, + { + "epoch": 1.1219273680935236, + "grad_norm": 0.1780315637588501, + "learning_rate": 8.193644722830171e-05, + "loss": 0.2793, + "step": 25240 + }, + { + "epoch": 1.1223718718051297, + "grad_norm": 0.22817596793174744, + "learning_rate": 8.19205415964519e-05, + "loss": 0.2829, + "step": 25250 + }, + { + "epoch": 1.1228163755167355, + "grad_norm": 0.20571273565292358, + "learning_rate": 8.190463051025456e-05, + "loss": 0.2795, + "step": 25260 + }, + { + "epoch": 1.1232608792283416, + "grad_norm": 0.23155444860458374, + "learning_rate": 8.188871397242843e-05, + "loss": 0.28, + "step": 25270 + }, + { + "epoch": 1.1237053829399475, + "grad_norm": 0.21042227745056152, + "learning_rate": 8.187279198569326e-05, + "loss": 0.2817, + "step": 25280 + }, + { + "epoch": 1.1241498866515536, + "grad_norm": 0.2023276835680008, + "learning_rate": 8.185686455276966e-05, + "loss": 0.2812, + "step": 25290 + }, + { + "epoch": 1.1245943903631594, + "grad_norm": 0.189702570438385, + "learning_rate": 8.184093167637921e-05, + "loss": 0.2816, + "step": 25300 + }, + { + "epoch": 1.1250388940747655, + "grad_norm": 0.19266727566719055, + "learning_rate": 8.182499335924437e-05, + "loss": 0.282, + "step": 25310 + }, + { + "epoch": 1.1254833977863714, + "grad_norm": 0.1962689608335495, + "learning_rate": 8.18090496040886e-05, + "loss": 0.2814, + "step": 25320 + }, + { + "epoch": 1.1259279014979775, + "grad_norm": 0.20956018567085266, + "learning_rate": 8.179310041363621e-05, + "loss": 0.2802, + "step": 25330 + }, + { + "epoch": 1.1263724052095836, + "grad_norm": 0.1745910942554474, + "learning_rate": 8.17771457906125e-05, + "loss": 0.2793, + "step": 25340 + }, + { + "epoch": 1.1268169089211895, + "grad_norm": 0.20587173104286194, + "learning_rate": 8.176118573774371e-05, + "loss": 0.2844, + "step": 25350 + }, + { + "epoch": 1.1272614126327956, + "grad_norm": 0.23401615023612976, + "learning_rate": 8.174522025775692e-05, + "loss": 0.2827, + "step": 25360 + }, + { + "epoch": 1.1277059163444014, + "grad_norm": 0.18395937979221344, + "learning_rate": 8.172924935338022e-05, + "loss": 0.2811, + "step": 25370 + }, + { + "epoch": 1.1281504200560075, + "grad_norm": 0.18022876977920532, + "learning_rate": 8.171327302734262e-05, + "loss": 0.2799, + "step": 25380 + }, + { + "epoch": 1.1285949237676134, + "grad_norm": 0.1790742725133896, + "learning_rate": 8.169729128237401e-05, + "loss": 0.2799, + "step": 25390 + }, + { + "epoch": 1.1290394274792195, + "grad_norm": 0.16311106085777283, + "learning_rate": 8.168130412120525e-05, + "loss": 0.2806, + "step": 25400 + }, + { + "epoch": 1.1294839311908254, + "grad_norm": 0.18005986511707306, + "learning_rate": 8.16653115465681e-05, + "loss": 0.2797, + "step": 25410 + }, + { + "epoch": 1.1299284349024314, + "grad_norm": 0.21073776483535767, + "learning_rate": 8.164931356119526e-05, + "loss": 0.2852, + "step": 25420 + }, + { + "epoch": 1.1303729386140375, + "grad_norm": 0.22408555448055267, + "learning_rate": 8.163331016782032e-05, + "loss": 0.2825, + "step": 25430 + }, + { + "epoch": 1.1308174423256434, + "grad_norm": 0.22374628484249115, + "learning_rate": 8.161730136917785e-05, + "loss": 0.2824, + "step": 25440 + }, + { + "epoch": 1.1312619460372495, + "grad_norm": 0.2249225229024887, + "learning_rate": 8.160128716800333e-05, + "loss": 0.2841, + "step": 25450 + }, + { + "epoch": 1.1317064497488554, + "grad_norm": 0.1843441128730774, + "learning_rate": 8.158526756703313e-05, + "loss": 0.2794, + "step": 25460 + }, + { + "epoch": 1.1321509534604615, + "grad_norm": 0.1883556842803955, + "learning_rate": 8.156924256900455e-05, + "loss": 0.2811, + "step": 25470 + }, + { + "epoch": 1.1325954571720673, + "grad_norm": 0.1866857409477234, + "learning_rate": 8.155321217665584e-05, + "loss": 0.2817, + "step": 25480 + }, + { + "epoch": 1.1330399608836734, + "grad_norm": 0.21169041097164154, + "learning_rate": 8.153717639272614e-05, + "loss": 0.28, + "step": 25490 + }, + { + "epoch": 1.1334844645952793, + "grad_norm": 0.23594743013381958, + "learning_rate": 8.152113521995555e-05, + "loss": 0.2823, + "step": 25500 + }, + { + "epoch": 1.1339289683068854, + "grad_norm": 0.17840991914272308, + "learning_rate": 8.150508866108505e-05, + "loss": 0.2856, + "step": 25510 + }, + { + "epoch": 1.1343734720184913, + "grad_norm": 0.2100195586681366, + "learning_rate": 8.148903671885657e-05, + "loss": 0.283, + "step": 25520 + }, + { + "epoch": 1.1348179757300974, + "grad_norm": 0.22346362471580505, + "learning_rate": 8.147297939601292e-05, + "loss": 0.2838, + "step": 25530 + }, + { + "epoch": 1.1352624794417032, + "grad_norm": 0.20041203498840332, + "learning_rate": 8.145691669529792e-05, + "loss": 0.2799, + "step": 25540 + }, + { + "epoch": 1.1357069831533093, + "grad_norm": 0.20005245506763458, + "learning_rate": 8.144084861945618e-05, + "loss": 0.2819, + "step": 25550 + }, + { + "epoch": 1.1361514868649154, + "grad_norm": 0.201791912317276, + "learning_rate": 8.142477517123333e-05, + "loss": 0.2819, + "step": 25560 + }, + { + "epoch": 1.1365959905765213, + "grad_norm": 0.22028563916683197, + "learning_rate": 8.140869635337586e-05, + "loss": 0.2815, + "step": 25570 + }, + { + "epoch": 1.1370404942881274, + "grad_norm": 0.20855645835399628, + "learning_rate": 8.139261216863123e-05, + "loss": 0.2832, + "step": 25580 + }, + { + "epoch": 1.1374849979997332, + "grad_norm": 0.21787168085575104, + "learning_rate": 8.137652261974776e-05, + "loss": 0.2864, + "step": 25590 + }, + { + "epoch": 1.1379295017113393, + "grad_norm": 0.23609289526939392, + "learning_rate": 8.136042770947472e-05, + "loss": 0.2864, + "step": 25600 + }, + { + "epoch": 1.1383740054229452, + "grad_norm": 0.18452012538909912, + "learning_rate": 8.134432744056228e-05, + "loss": 0.2822, + "step": 25610 + }, + { + "epoch": 1.1388185091345513, + "grad_norm": 0.20929968357086182, + "learning_rate": 8.132822181576158e-05, + "loss": 0.2802, + "step": 25620 + }, + { + "epoch": 1.1392630128461572, + "grad_norm": 0.18587768077850342, + "learning_rate": 8.131211083782459e-05, + "loss": 0.2824, + "step": 25630 + }, + { + "epoch": 1.1397075165577633, + "grad_norm": 0.17799288034439087, + "learning_rate": 8.129599450950424e-05, + "loss": 0.2848, + "step": 25640 + }, + { + "epoch": 1.1401520202693693, + "grad_norm": 0.18402864038944244, + "learning_rate": 8.127987283355438e-05, + "loss": 0.2805, + "step": 25650 + }, + { + "epoch": 1.1405965239809752, + "grad_norm": 0.1935960054397583, + "learning_rate": 8.126374581272976e-05, + "loss": 0.2815, + "step": 25660 + }, + { + "epoch": 1.1410410276925813, + "grad_norm": 0.21142536401748657, + "learning_rate": 8.124761344978605e-05, + "loss": 0.2811, + "step": 25670 + }, + { + "epoch": 1.1414855314041872, + "grad_norm": 0.18530172109603882, + "learning_rate": 8.12314757474798e-05, + "loss": 0.2822, + "step": 25680 + }, + { + "epoch": 1.1419300351157933, + "grad_norm": 0.21373195946216583, + "learning_rate": 8.121533270856856e-05, + "loss": 0.2813, + "step": 25690 + }, + { + "epoch": 1.1423745388273991, + "grad_norm": 0.21657656133174896, + "learning_rate": 8.119918433581069e-05, + "loss": 0.2825, + "step": 25700 + }, + { + "epoch": 1.1428190425390052, + "grad_norm": 0.21003809571266174, + "learning_rate": 8.118303063196551e-05, + "loss": 0.2824, + "step": 25710 + }, + { + "epoch": 1.143263546250611, + "grad_norm": 0.20306304097175598, + "learning_rate": 8.116687159979326e-05, + "loss": 0.2822, + "step": 25720 + }, + { + "epoch": 1.1437080499622172, + "grad_norm": 0.18468409776687622, + "learning_rate": 8.115070724205508e-05, + "loss": 0.2862, + "step": 25730 + }, + { + "epoch": 1.1441525536738233, + "grad_norm": 0.23606954514980316, + "learning_rate": 8.113453756151296e-05, + "loss": 0.2816, + "step": 25740 + }, + { + "epoch": 1.1445970573854292, + "grad_norm": 0.18925753235816956, + "learning_rate": 8.111836256092995e-05, + "loss": 0.2798, + "step": 25750 + }, + { + "epoch": 1.1450415610970353, + "grad_norm": 0.1819349229335785, + "learning_rate": 8.110218224306985e-05, + "loss": 0.2786, + "step": 25760 + }, + { + "epoch": 1.1454860648086411, + "grad_norm": 0.19761665165424347, + "learning_rate": 8.108599661069745e-05, + "loss": 0.2816, + "step": 25770 + }, + { + "epoch": 1.1459305685202472, + "grad_norm": 0.22492676973342896, + "learning_rate": 8.106980566657845e-05, + "loss": 0.2853, + "step": 25780 + }, + { + "epoch": 1.146375072231853, + "grad_norm": 0.21348032355308533, + "learning_rate": 8.10536094134794e-05, + "loss": 0.281, + "step": 25790 + }, + { + "epoch": 1.1468195759434592, + "grad_norm": 0.24151462316513062, + "learning_rate": 8.103740785416783e-05, + "loss": 0.2794, + "step": 25800 + }, + { + "epoch": 1.147264079655065, + "grad_norm": 0.21526142954826355, + "learning_rate": 8.102120099141212e-05, + "loss": 0.2812, + "step": 25810 + }, + { + "epoch": 1.1477085833666711, + "grad_norm": 0.21024225652217865, + "learning_rate": 8.100498882798163e-05, + "loss": 0.2819, + "step": 25820 + }, + { + "epoch": 1.148153087078277, + "grad_norm": 0.18422110378742218, + "learning_rate": 8.09887713666465e-05, + "loss": 0.2827, + "step": 25830 + }, + { + "epoch": 1.148597590789883, + "grad_norm": 0.20185907185077667, + "learning_rate": 8.09725486101779e-05, + "loss": 0.2813, + "step": 25840 + }, + { + "epoch": 1.149042094501489, + "grad_norm": 0.23753678798675537, + "learning_rate": 8.095632056134784e-05, + "loss": 0.2827, + "step": 25850 + }, + { + "epoch": 1.149486598213095, + "grad_norm": 0.21091216802597046, + "learning_rate": 8.094008722292925e-05, + "loss": 0.2817, + "step": 25860 + }, + { + "epoch": 1.1499311019247012, + "grad_norm": 0.20453566312789917, + "learning_rate": 8.092384859769598e-05, + "loss": 0.2831, + "step": 25870 + }, + { + "epoch": 1.150375605636307, + "grad_norm": 0.1667848527431488, + "learning_rate": 8.090760468842275e-05, + "loss": 0.2798, + "step": 25880 + }, + { + "epoch": 1.1508201093479131, + "grad_norm": 0.18139928579330444, + "learning_rate": 8.089135549788521e-05, + "loss": 0.2809, + "step": 25890 + }, + { + "epoch": 1.151264613059519, + "grad_norm": 0.18492408096790314, + "learning_rate": 8.087510102885987e-05, + "loss": 0.2838, + "step": 25900 + }, + { + "epoch": 1.151709116771125, + "grad_norm": 0.20991025865077972, + "learning_rate": 8.085884128412422e-05, + "loss": 0.2778, + "step": 25910 + }, + { + "epoch": 1.152153620482731, + "grad_norm": 0.15927651524543762, + "learning_rate": 8.084257626645659e-05, + "loss": 0.2813, + "step": 25920 + }, + { + "epoch": 1.152598124194337, + "grad_norm": 0.1667761206626892, + "learning_rate": 8.08263059786362e-05, + "loss": 0.2814, + "step": 25930 + }, + { + "epoch": 1.153042627905943, + "grad_norm": 0.2039365917444229, + "learning_rate": 8.081003042344325e-05, + "loss": 0.2805, + "step": 25940 + }, + { + "epoch": 1.153487131617549, + "grad_norm": 0.21799686551094055, + "learning_rate": 8.079374960365872e-05, + "loss": 0.283, + "step": 25950 + }, + { + "epoch": 1.153931635329155, + "grad_norm": 0.1850994974374771, + "learning_rate": 8.077746352206463e-05, + "loss": 0.2812, + "step": 25960 + }, + { + "epoch": 1.154376139040761, + "grad_norm": 0.20817793905735016, + "learning_rate": 8.076117218144377e-05, + "loss": 0.282, + "step": 25970 + }, + { + "epoch": 1.154820642752367, + "grad_norm": 0.1973486840724945, + "learning_rate": 8.074487558457991e-05, + "loss": 0.2784, + "step": 25980 + }, + { + "epoch": 1.155265146463973, + "grad_norm": 0.19689375162124634, + "learning_rate": 8.072857373425768e-05, + "loss": 0.2818, + "step": 25990 + }, + { + "epoch": 1.155709650175579, + "grad_norm": 0.1801997274160385, + "learning_rate": 8.071226663326264e-05, + "loss": 0.2828, + "step": 26000 + }, + { + "epoch": 1.156154153887185, + "grad_norm": 0.1901748776435852, + "learning_rate": 8.069595428438121e-05, + "loss": 0.2829, + "step": 26010 + }, + { + "epoch": 1.156598657598791, + "grad_norm": 0.20652654767036438, + "learning_rate": 8.067963669040072e-05, + "loss": 0.2849, + "step": 26020 + }, + { + "epoch": 1.1570431613103969, + "grad_norm": 0.1872020810842514, + "learning_rate": 8.066331385410942e-05, + "loss": 0.2827, + "step": 26030 + }, + { + "epoch": 1.157487665022003, + "grad_norm": 0.18113310635089874, + "learning_rate": 8.064698577829641e-05, + "loss": 0.2808, + "step": 26040 + }, + { + "epoch": 1.157932168733609, + "grad_norm": 0.1730651706457138, + "learning_rate": 8.063065246575175e-05, + "loss": 0.2825, + "step": 26050 + }, + { + "epoch": 1.158376672445215, + "grad_norm": 0.18061292171478271, + "learning_rate": 8.061431391926631e-05, + "loss": 0.2859, + "step": 26060 + }, + { + "epoch": 1.1588211761568208, + "grad_norm": 0.18280884623527527, + "learning_rate": 8.059797014163195e-05, + "loss": 0.2793, + "step": 26070 + }, + { + "epoch": 1.1592656798684269, + "grad_norm": 0.19054275751113892, + "learning_rate": 8.058162113564133e-05, + "loss": 0.2798, + "step": 26080 + }, + { + "epoch": 1.159710183580033, + "grad_norm": 0.2279992550611496, + "learning_rate": 8.056526690408806e-05, + "loss": 0.2845, + "step": 26090 + }, + { + "epoch": 1.1601546872916388, + "grad_norm": 0.22068293392658234, + "learning_rate": 8.054890744976666e-05, + "loss": 0.2898, + "step": 26100 + }, + { + "epoch": 1.160599191003245, + "grad_norm": 0.1996166855096817, + "learning_rate": 8.053254277547248e-05, + "loss": 0.2837, + "step": 26110 + }, + { + "epoch": 1.1610436947148508, + "grad_norm": 0.18995873630046844, + "learning_rate": 8.051617288400182e-05, + "loss": 0.2825, + "step": 26120 + }, + { + "epoch": 1.161488198426457, + "grad_norm": 0.20250655710697174, + "learning_rate": 8.049979777815182e-05, + "loss": 0.2813, + "step": 26130 + }, + { + "epoch": 1.1619327021380628, + "grad_norm": 0.22407977283000946, + "learning_rate": 8.048341746072054e-05, + "loss": 0.2796, + "step": 26140 + }, + { + "epoch": 1.1623772058496689, + "grad_norm": 0.18223756551742554, + "learning_rate": 8.046703193450696e-05, + "loss": 0.2846, + "step": 26150 + }, + { + "epoch": 1.1628217095612747, + "grad_norm": 0.22004826366901398, + "learning_rate": 8.04506412023109e-05, + "loss": 0.2812, + "step": 26160 + }, + { + "epoch": 1.1632662132728808, + "grad_norm": 0.1955237090587616, + "learning_rate": 8.043424526693306e-05, + "loss": 0.2801, + "step": 26170 + }, + { + "epoch": 1.163710716984487, + "grad_norm": 0.2581331133842468, + "learning_rate": 8.04178441311751e-05, + "loss": 0.2836, + "step": 26180 + }, + { + "epoch": 1.1641552206960928, + "grad_norm": 0.272775262594223, + "learning_rate": 8.04014377978395e-05, + "loss": 0.2841, + "step": 26190 + }, + { + "epoch": 1.1645997244076989, + "grad_norm": 0.23408502340316772, + "learning_rate": 8.038502626972967e-05, + "loss": 0.2829, + "step": 26200 + }, + { + "epoch": 1.1650442281193047, + "grad_norm": 0.19437462091445923, + "learning_rate": 8.036860954964989e-05, + "loss": 0.2829, + "step": 26210 + }, + { + "epoch": 1.1654887318309108, + "grad_norm": 0.18683400750160217, + "learning_rate": 8.035218764040531e-05, + "loss": 0.2815, + "step": 26220 + }, + { + "epoch": 1.1659332355425167, + "grad_norm": 0.21842168271541595, + "learning_rate": 8.033576054480199e-05, + "loss": 0.2832, + "step": 26230 + }, + { + "epoch": 1.1663777392541228, + "grad_norm": 0.18446530401706696, + "learning_rate": 8.031932826564688e-05, + "loss": 0.2804, + "step": 26240 + }, + { + "epoch": 1.1668222429657287, + "grad_norm": 0.20216193795204163, + "learning_rate": 8.030289080574782e-05, + "loss": 0.2819, + "step": 26250 + }, + { + "epoch": 1.1672667466773348, + "grad_norm": 0.15330715477466583, + "learning_rate": 8.028644816791349e-05, + "loss": 0.2802, + "step": 26260 + }, + { + "epoch": 1.1677112503889409, + "grad_norm": 0.21893951296806335, + "learning_rate": 8.027000035495351e-05, + "loss": 0.2836, + "step": 26270 + }, + { + "epoch": 1.1681557541005467, + "grad_norm": 0.2034139335155487, + "learning_rate": 8.025354736967836e-05, + "loss": 0.2798, + "step": 26280 + }, + { + "epoch": 1.1686002578121528, + "grad_norm": 0.2108180969953537, + "learning_rate": 8.023708921489941e-05, + "loss": 0.2809, + "step": 26290 + }, + { + "epoch": 1.1690447615237587, + "grad_norm": 0.22098799049854279, + "learning_rate": 8.022062589342887e-05, + "loss": 0.2809, + "step": 26300 + }, + { + "epoch": 1.1694892652353648, + "grad_norm": 0.18232139945030212, + "learning_rate": 8.020415740807993e-05, + "loss": 0.2839, + "step": 26310 + }, + { + "epoch": 1.1699337689469707, + "grad_norm": 0.20501132309436798, + "learning_rate": 8.018768376166656e-05, + "loss": 0.2817, + "step": 26320 + }, + { + "epoch": 1.1703782726585767, + "grad_norm": 0.21338334679603577, + "learning_rate": 8.017120495700368e-05, + "loss": 0.2838, + "step": 26330 + }, + { + "epoch": 1.1708227763701826, + "grad_norm": 0.2049216777086258, + "learning_rate": 8.015472099690704e-05, + "loss": 0.2791, + "step": 26340 + }, + { + "epoch": 1.1712672800817887, + "grad_norm": 0.20826710760593414, + "learning_rate": 8.013823188419332e-05, + "loss": 0.277, + "step": 26350 + }, + { + "epoch": 1.1717117837933946, + "grad_norm": 0.18031218647956848, + "learning_rate": 8.012173762168006e-05, + "loss": 0.2803, + "step": 26360 + }, + { + "epoch": 1.1721562875050007, + "grad_norm": 0.23272264003753662, + "learning_rate": 8.010523821218567e-05, + "loss": 0.2837, + "step": 26370 + }, + { + "epoch": 1.1726007912166065, + "grad_norm": 0.23607617616653442, + "learning_rate": 8.008873365852945e-05, + "loss": 0.2818, + "step": 26380 + }, + { + "epoch": 1.1730452949282126, + "grad_norm": 0.1927766352891922, + "learning_rate": 8.007222396353157e-05, + "loss": 0.2823, + "step": 26390 + }, + { + "epoch": 1.1734897986398187, + "grad_norm": 0.17377395927906036, + "learning_rate": 8.00557091300131e-05, + "loss": 0.2796, + "step": 26400 + }, + { + "epoch": 1.1739343023514246, + "grad_norm": 0.20301254093647003, + "learning_rate": 8.003918916079597e-05, + "loss": 0.2792, + "step": 26410 + }, + { + "epoch": 1.1743788060630307, + "grad_norm": 0.1839238703250885, + "learning_rate": 8.002266405870298e-05, + "loss": 0.2803, + "step": 26420 + }, + { + "epoch": 1.1748233097746366, + "grad_norm": 0.1860237568616867, + "learning_rate": 8.000613382655782e-05, + "loss": 0.2791, + "step": 26430 + }, + { + "epoch": 1.1752678134862427, + "grad_norm": 0.19412671029567719, + "learning_rate": 7.998959846718505e-05, + "loss": 0.2787, + "step": 26440 + }, + { + "epoch": 1.1757123171978485, + "grad_norm": 0.20334355533123016, + "learning_rate": 7.997305798341012e-05, + "loss": 0.2805, + "step": 26450 + }, + { + "epoch": 1.1761568209094546, + "grad_norm": 0.20248685777187347, + "learning_rate": 7.995651237805937e-05, + "loss": 0.283, + "step": 26460 + }, + { + "epoch": 1.1766013246210605, + "grad_norm": 0.1958528310060501, + "learning_rate": 7.993996165395996e-05, + "loss": 0.2788, + "step": 26470 + }, + { + "epoch": 1.1770458283326666, + "grad_norm": 0.18931123614311218, + "learning_rate": 7.992340581393996e-05, + "loss": 0.2805, + "step": 26480 + }, + { + "epoch": 1.1774903320442727, + "grad_norm": 0.21804632246494293, + "learning_rate": 7.990684486082831e-05, + "loss": 0.281, + "step": 26490 + }, + { + "epoch": 1.1779348357558785, + "grad_norm": 0.20904210209846497, + "learning_rate": 7.989027879745482e-05, + "loss": 0.2821, + "step": 26500 + }, + { + "epoch": 1.1783793394674846, + "grad_norm": 0.20982789993286133, + "learning_rate": 7.98737076266502e-05, + "loss": 0.2801, + "step": 26510 + }, + { + "epoch": 1.1788238431790905, + "grad_norm": 0.2510005235671997, + "learning_rate": 7.985713135124598e-05, + "loss": 0.2805, + "step": 26520 + }, + { + "epoch": 1.1792683468906966, + "grad_norm": 0.24395501613616943, + "learning_rate": 7.98405499740746e-05, + "loss": 0.2791, + "step": 26530 + }, + { + "epoch": 1.1797128506023025, + "grad_norm": 0.2008044719696045, + "learning_rate": 7.98239634979694e-05, + "loss": 0.2816, + "step": 26540 + }, + { + "epoch": 1.1801573543139086, + "grad_norm": 0.23355112969875336, + "learning_rate": 7.98073719257645e-05, + "loss": 0.2819, + "step": 26550 + }, + { + "epoch": 1.1806018580255144, + "grad_norm": 0.19364093244075775, + "learning_rate": 7.979077526029499e-05, + "loss": 0.28, + "step": 26560 + }, + { + "epoch": 1.1810463617371205, + "grad_norm": 0.18820251524448395, + "learning_rate": 7.977417350439675e-05, + "loss": 0.2824, + "step": 26570 + }, + { + "epoch": 1.1814908654487266, + "grad_norm": 0.19615329802036285, + "learning_rate": 7.97575666609066e-05, + "loss": 0.283, + "step": 26580 + }, + { + "epoch": 1.1819353691603325, + "grad_norm": 0.1614908128976822, + "learning_rate": 7.974095473266216e-05, + "loss": 0.2814, + "step": 26590 + }, + { + "epoch": 1.1823798728719386, + "grad_norm": 0.19143404066562653, + "learning_rate": 7.972433772250198e-05, + "loss": 0.2802, + "step": 26600 + }, + { + "epoch": 1.1828243765835444, + "grad_norm": 0.21645575761795044, + "learning_rate": 7.970771563326544e-05, + "loss": 0.2807, + "step": 26610 + }, + { + "epoch": 1.1832688802951505, + "grad_norm": 0.21369390189647675, + "learning_rate": 7.96910884677928e-05, + "loss": 0.2805, + "step": 26620 + }, + { + "epoch": 1.1837133840067564, + "grad_norm": 0.221390962600708, + "learning_rate": 7.967445622892523e-05, + "loss": 0.2834, + "step": 26630 + }, + { + "epoch": 1.1841578877183625, + "grad_norm": 0.2248738408088684, + "learning_rate": 7.965781891950465e-05, + "loss": 0.2838, + "step": 26640 + }, + { + "epoch": 1.1846023914299684, + "grad_norm": 0.20149937272071838, + "learning_rate": 7.964117654237397e-05, + "loss": 0.2839, + "step": 26650 + }, + { + "epoch": 1.1850468951415745, + "grad_norm": 0.18159379065036774, + "learning_rate": 7.962452910037692e-05, + "loss": 0.2831, + "step": 26660 + }, + { + "epoch": 1.1854913988531803, + "grad_norm": 0.17541910707950592, + "learning_rate": 7.96078765963581e-05, + "loss": 0.2808, + "step": 26670 + }, + { + "epoch": 1.1859359025647864, + "grad_norm": 0.21176332235336304, + "learning_rate": 7.95912190331629e-05, + "loss": 0.2776, + "step": 26680 + }, + { + "epoch": 1.1863804062763923, + "grad_norm": 0.23996494710445404, + "learning_rate": 7.957455641363772e-05, + "loss": 0.2838, + "step": 26690 + }, + { + "epoch": 1.1868249099879984, + "grad_norm": 0.2101423740386963, + "learning_rate": 7.955788874062968e-05, + "loss": 0.2827, + "step": 26700 + }, + { + "epoch": 1.1872694136996045, + "grad_norm": 0.2052844911813736, + "learning_rate": 7.95412160169869e-05, + "loss": 0.2849, + "step": 26710 + }, + { + "epoch": 1.1877139174112104, + "grad_norm": 0.20877988636493683, + "learning_rate": 7.952453824555824e-05, + "loss": 0.2819, + "step": 26720 + }, + { + "epoch": 1.1881584211228164, + "grad_norm": 0.19698546826839447, + "learning_rate": 7.95078554291935e-05, + "loss": 0.2843, + "step": 26730 + }, + { + "epoch": 1.1886029248344223, + "grad_norm": 0.2096928358078003, + "learning_rate": 7.94911675707433e-05, + "loss": 0.281, + "step": 26740 + }, + { + "epoch": 1.1890474285460284, + "grad_norm": 0.1855454444885254, + "learning_rate": 7.947447467305915e-05, + "loss": 0.2809, + "step": 26750 + }, + { + "epoch": 1.1894919322576343, + "grad_norm": 0.2044970542192459, + "learning_rate": 7.94577767389934e-05, + "loss": 0.2818, + "step": 26760 + }, + { + "epoch": 1.1899364359692404, + "grad_norm": 0.204344242811203, + "learning_rate": 7.944107377139928e-05, + "loss": 0.2814, + "step": 26770 + }, + { + "epoch": 1.1903809396808462, + "grad_norm": 0.2074400782585144, + "learning_rate": 7.942436577313088e-05, + "loss": 0.2798, + "step": 26780 + }, + { + "epoch": 1.1908254433924523, + "grad_norm": 0.21547731757164001, + "learning_rate": 7.940765274704312e-05, + "loss": 0.2794, + "step": 26790 + }, + { + "epoch": 1.1912699471040584, + "grad_norm": 0.2212713211774826, + "learning_rate": 7.939093469599181e-05, + "loss": 0.2808, + "step": 26800 + }, + { + "epoch": 1.1917144508156643, + "grad_norm": 0.2097073346376419, + "learning_rate": 7.93742116228336e-05, + "loss": 0.2814, + "step": 26810 + }, + { + "epoch": 1.1921589545272704, + "grad_norm": 0.19935950636863708, + "learning_rate": 7.935748353042602e-05, + "loss": 0.2822, + "step": 26820 + }, + { + "epoch": 1.1926034582388763, + "grad_norm": 0.18372271955013275, + "learning_rate": 7.934075042162744e-05, + "loss": 0.2782, + "step": 26830 + }, + { + "epoch": 1.1930479619504823, + "grad_norm": 0.2132577896118164, + "learning_rate": 7.932401229929705e-05, + "loss": 0.2829, + "step": 26840 + }, + { + "epoch": 1.1934924656620882, + "grad_norm": 0.19327546656131744, + "learning_rate": 7.9307269166295e-05, + "loss": 0.2801, + "step": 26850 + }, + { + "epoch": 1.1939369693736943, + "grad_norm": 0.2278168797492981, + "learning_rate": 7.92905210254822e-05, + "loss": 0.2827, + "step": 26860 + }, + { + "epoch": 1.1943814730853002, + "grad_norm": 0.21591036021709442, + "learning_rate": 7.927376787972045e-05, + "loss": 0.2815, + "step": 26870 + }, + { + "epoch": 1.1948259767969063, + "grad_norm": 0.20381180942058563, + "learning_rate": 7.92570097318724e-05, + "loss": 0.2817, + "step": 26880 + }, + { + "epoch": 1.1952704805085124, + "grad_norm": 0.20364250242710114, + "learning_rate": 7.924024658480158e-05, + "loss": 0.2786, + "step": 26890 + }, + { + "epoch": 1.1957149842201182, + "grad_norm": 0.18008802831172943, + "learning_rate": 7.922347844137233e-05, + "loss": 0.2804, + "step": 26900 + }, + { + "epoch": 1.196159487931724, + "grad_norm": 0.19977280497550964, + "learning_rate": 7.92067053044499e-05, + "loss": 0.2812, + "step": 26910 + }, + { + "epoch": 1.1966039916433302, + "grad_norm": 0.2158697545528412, + "learning_rate": 7.918992717690031e-05, + "loss": 0.2799, + "step": 26920 + }, + { + "epoch": 1.1970484953549363, + "grad_norm": 0.1654617339372635, + "learning_rate": 7.917314406159053e-05, + "loss": 0.2804, + "step": 26930 + }, + { + "epoch": 1.1974929990665422, + "grad_norm": 0.2054877132177353, + "learning_rate": 7.915635596138832e-05, + "loss": 0.2822, + "step": 26940 + }, + { + "epoch": 1.1979375027781483, + "grad_norm": 0.18985795974731445, + "learning_rate": 7.913956287916228e-05, + "loss": 0.2807, + "step": 26950 + }, + { + "epoch": 1.1983820064897541, + "grad_norm": 0.17861515283584595, + "learning_rate": 7.912276481778193e-05, + "loss": 0.2812, + "step": 26960 + }, + { + "epoch": 1.1988265102013602, + "grad_norm": 0.190648153424263, + "learning_rate": 7.910596178011759e-05, + "loss": 0.2791, + "step": 26970 + }, + { + "epoch": 1.199271013912966, + "grad_norm": 0.1735813021659851, + "learning_rate": 7.908915376904043e-05, + "loss": 0.2802, + "step": 26980 + }, + { + "epoch": 1.1997155176245722, + "grad_norm": 0.2192799299955368, + "learning_rate": 7.907234078742247e-05, + "loss": 0.2779, + "step": 26990 + }, + { + "epoch": 1.200160021336178, + "grad_norm": 0.243630051612854, + "learning_rate": 7.90555228381366e-05, + "loss": 0.2835, + "step": 27000 + }, + { + "epoch": 1.2006045250477841, + "grad_norm": 0.2365555614233017, + "learning_rate": 7.903869992405656e-05, + "loss": 0.2797, + "step": 27010 + }, + { + "epoch": 1.2010490287593902, + "grad_norm": 0.23544059693813324, + "learning_rate": 7.902187204805691e-05, + "loss": 0.2785, + "step": 27020 + }, + { + "epoch": 1.201493532470996, + "grad_norm": 0.19851373136043549, + "learning_rate": 7.900503921301308e-05, + "loss": 0.2783, + "step": 27030 + }, + { + "epoch": 1.2019380361826022, + "grad_norm": 0.20048633217811584, + "learning_rate": 7.898820142180133e-05, + "loss": 0.2784, + "step": 27040 + }, + { + "epoch": 1.202382539894208, + "grad_norm": 0.2182668149471283, + "learning_rate": 7.897135867729879e-05, + "loss": 0.2795, + "step": 27050 + }, + { + "epoch": 1.2028270436058142, + "grad_norm": 0.20501922070980072, + "learning_rate": 7.89545109823834e-05, + "loss": 0.2775, + "step": 27060 + }, + { + "epoch": 1.20327154731742, + "grad_norm": 0.24694134294986725, + "learning_rate": 7.8937658339934e-05, + "loss": 0.281, + "step": 27070 + }, + { + "epoch": 1.2037160510290261, + "grad_norm": 0.2327820211648941, + "learning_rate": 7.892080075283026e-05, + "loss": 0.2773, + "step": 27080 + }, + { + "epoch": 1.204160554740632, + "grad_norm": 0.18916256725788116, + "learning_rate": 7.890393822395263e-05, + "loss": 0.2843, + "step": 27090 + }, + { + "epoch": 1.204605058452238, + "grad_norm": 0.20304447412490845, + "learning_rate": 7.88870707561825e-05, + "loss": 0.2813, + "step": 27100 + }, + { + "epoch": 1.2050495621638442, + "grad_norm": 0.19674530625343323, + "learning_rate": 7.887019835240203e-05, + "loss": 0.2812, + "step": 27110 + }, + { + "epoch": 1.20549406587545, + "grad_norm": 0.20419786870479584, + "learning_rate": 7.885332101549427e-05, + "loss": 0.2818, + "step": 27120 + }, + { + "epoch": 1.2059385695870561, + "grad_norm": 0.1768699288368225, + "learning_rate": 7.883643874834308e-05, + "loss": 0.2798, + "step": 27130 + }, + { + "epoch": 1.206383073298662, + "grad_norm": 0.17521262168884277, + "learning_rate": 7.881955155383321e-05, + "loss": 0.2795, + "step": 27140 + }, + { + "epoch": 1.206827577010268, + "grad_norm": 0.18476137518882751, + "learning_rate": 7.880265943485017e-05, + "loss": 0.2825, + "step": 27150 + }, + { + "epoch": 1.207272080721874, + "grad_norm": 0.21592754125595093, + "learning_rate": 7.878576239428038e-05, + "loss": 0.2838, + "step": 27160 + }, + { + "epoch": 1.20771658443348, + "grad_norm": 0.1966157853603363, + "learning_rate": 7.87688604350111e-05, + "loss": 0.2828, + "step": 27170 + }, + { + "epoch": 1.208161088145086, + "grad_norm": 0.22383974492549896, + "learning_rate": 7.875195355993042e-05, + "loss": 0.2869, + "step": 27180 + }, + { + "epoch": 1.208605591856692, + "grad_norm": 0.20875954627990723, + "learning_rate": 7.873504177192724e-05, + "loss": 0.2822, + "step": 27190 + }, + { + "epoch": 1.209050095568298, + "grad_norm": 0.18797610700130463, + "learning_rate": 7.87181250738913e-05, + "loss": 0.2838, + "step": 27200 + }, + { + "epoch": 1.209494599279904, + "grad_norm": 0.25810757279396057, + "learning_rate": 7.870120346871324e-05, + "loss": 0.2829, + "step": 27210 + }, + { + "epoch": 1.2099391029915099, + "grad_norm": 0.24128346145153046, + "learning_rate": 7.86842769592845e-05, + "loss": 0.2821, + "step": 27220 + }, + { + "epoch": 1.210383606703116, + "grad_norm": 0.18752031028270721, + "learning_rate": 7.866734554849732e-05, + "loss": 0.2822, + "step": 27230 + }, + { + "epoch": 1.210828110414722, + "grad_norm": 0.20327118039131165, + "learning_rate": 7.865040923924486e-05, + "loss": 0.2811, + "step": 27240 + }, + { + "epoch": 1.211272614126328, + "grad_norm": 0.17507322132587433, + "learning_rate": 7.863346803442104e-05, + "loss": 0.2815, + "step": 27250 + }, + { + "epoch": 1.211717117837934, + "grad_norm": 0.20251552760601044, + "learning_rate": 7.861652193692067e-05, + "loss": 0.2824, + "step": 27260 + }, + { + "epoch": 1.2121616215495399, + "grad_norm": 0.23801621794700623, + "learning_rate": 7.859957094963937e-05, + "loss": 0.2802, + "step": 27270 + }, + { + "epoch": 1.212606125261146, + "grad_norm": 0.21139247715473175, + "learning_rate": 7.858261507547357e-05, + "loss": 0.2797, + "step": 27280 + }, + { + "epoch": 1.2130506289727518, + "grad_norm": 0.2162880152463913, + "learning_rate": 7.856565431732061e-05, + "loss": 0.2823, + "step": 27290 + }, + { + "epoch": 1.213495132684358, + "grad_norm": 0.21971631050109863, + "learning_rate": 7.854868867807859e-05, + "loss": 0.2817, + "step": 27300 + }, + { + "epoch": 1.2139396363959638, + "grad_norm": 0.22744575142860413, + "learning_rate": 7.85317181606465e-05, + "loss": 0.2835, + "step": 27310 + }, + { + "epoch": 1.21438414010757, + "grad_norm": 0.2211182862520218, + "learning_rate": 7.85147427679241e-05, + "loss": 0.2805, + "step": 27320 + }, + { + "epoch": 1.214828643819176, + "grad_norm": 0.19031096994876862, + "learning_rate": 7.849776250281205e-05, + "loss": 0.2815, + "step": 27330 + }, + { + "epoch": 1.2152731475307819, + "grad_norm": 0.21633504331111908, + "learning_rate": 7.84807773682118e-05, + "loss": 0.2803, + "step": 27340 + }, + { + "epoch": 1.215717651242388, + "grad_norm": 0.198603093624115, + "learning_rate": 7.846378736702565e-05, + "loss": 0.2811, + "step": 27350 + }, + { + "epoch": 1.2161621549539938, + "grad_norm": 0.19378498196601868, + "learning_rate": 7.844679250215671e-05, + "loss": 0.2829, + "step": 27360 + }, + { + "epoch": 1.2166066586656, + "grad_norm": 0.20502451062202454, + "learning_rate": 7.842979277650898e-05, + "loss": 0.2793, + "step": 27370 + }, + { + "epoch": 1.2170511623772058, + "grad_norm": 0.19910554587841034, + "learning_rate": 7.84127881929872e-05, + "loss": 0.2795, + "step": 27380 + }, + { + "epoch": 1.2174956660888119, + "grad_norm": 0.22114789485931396, + "learning_rate": 7.839577875449704e-05, + "loss": 0.279, + "step": 27390 + }, + { + "epoch": 1.2179401698004177, + "grad_norm": 0.1746855527162552, + "learning_rate": 7.837876446394489e-05, + "loss": 0.2799, + "step": 27400 + }, + { + "epoch": 1.2183846735120238, + "grad_norm": 0.21938377618789673, + "learning_rate": 7.836174532423805e-05, + "loss": 0.2806, + "step": 27410 + }, + { + "epoch": 1.21882917722363, + "grad_norm": 0.22404058277606964, + "learning_rate": 7.834472133828466e-05, + "loss": 0.2809, + "step": 27420 + }, + { + "epoch": 1.2192736809352358, + "grad_norm": 0.2359585464000702, + "learning_rate": 7.832769250899359e-05, + "loss": 0.2782, + "step": 27430 + }, + { + "epoch": 1.219718184646842, + "grad_norm": 0.19353583455085754, + "learning_rate": 7.831065883927464e-05, + "loss": 0.2757, + "step": 27440 + }, + { + "epoch": 1.2201626883584478, + "grad_norm": 0.18827515840530396, + "learning_rate": 7.829362033203841e-05, + "loss": 0.2793, + "step": 27450 + }, + { + "epoch": 1.2206071920700539, + "grad_norm": 0.20551739633083344, + "learning_rate": 7.827657699019628e-05, + "loss": 0.2852, + "step": 27460 + }, + { + "epoch": 1.2210516957816597, + "grad_norm": 0.19140081107616425, + "learning_rate": 7.825952881666052e-05, + "loss": 0.2812, + "step": 27470 + }, + { + "epoch": 1.2214961994932658, + "grad_norm": 0.19470170140266418, + "learning_rate": 7.824247581434418e-05, + "loss": 0.2781, + "step": 27480 + }, + { + "epoch": 1.2219407032048717, + "grad_norm": 0.22382637858390808, + "learning_rate": 7.822541798616116e-05, + "loss": 0.2796, + "step": 27490 + }, + { + "epoch": 1.2223852069164778, + "grad_norm": 0.20992542803287506, + "learning_rate": 7.820835533502617e-05, + "loss": 0.2819, + "step": 27500 + }, + { + "epoch": 1.2228297106280837, + "grad_norm": 0.20558114349842072, + "learning_rate": 7.819128786385475e-05, + "loss": 0.2801, + "step": 27510 + }, + { + "epoch": 1.2232742143396897, + "grad_norm": 0.2053934782743454, + "learning_rate": 7.817421557556329e-05, + "loss": 0.2822, + "step": 27520 + }, + { + "epoch": 1.2237187180512956, + "grad_norm": 0.18116623163223267, + "learning_rate": 7.815713847306893e-05, + "loss": 0.2797, + "step": 27530 + }, + { + "epoch": 1.2241632217629017, + "grad_norm": 0.22534534335136414, + "learning_rate": 7.81400565592897e-05, + "loss": 0.278, + "step": 27540 + }, + { + "epoch": 1.2246077254745078, + "grad_norm": 0.21142801642417908, + "learning_rate": 7.812296983714444e-05, + "loss": 0.2816, + "step": 27550 + }, + { + "epoch": 1.2250522291861137, + "grad_norm": 0.1839839071035385, + "learning_rate": 7.810587830955281e-05, + "loss": 0.278, + "step": 27560 + }, + { + "epoch": 1.2254967328977198, + "grad_norm": 0.21555162966251373, + "learning_rate": 7.808878197943528e-05, + "loss": 0.2817, + "step": 27570 + }, + { + "epoch": 1.2259412366093256, + "grad_norm": 0.20901648700237274, + "learning_rate": 7.807168084971312e-05, + "loss": 0.2786, + "step": 27580 + }, + { + "epoch": 1.2263857403209317, + "grad_norm": 0.22274303436279297, + "learning_rate": 7.805457492330849e-05, + "loss": 0.2798, + "step": 27590 + }, + { + "epoch": 1.2268302440325376, + "grad_norm": 0.21249547600746155, + "learning_rate": 7.803746420314428e-05, + "loss": 0.2815, + "step": 27600 + }, + { + "epoch": 1.2272747477441437, + "grad_norm": 0.20451584458351135, + "learning_rate": 7.802034869214428e-05, + "loss": 0.2829, + "step": 27610 + }, + { + "epoch": 1.2277192514557496, + "grad_norm": 0.1895698457956314, + "learning_rate": 7.800322839323303e-05, + "loss": 0.2822, + "step": 27620 + }, + { + "epoch": 1.2281637551673557, + "grad_norm": 0.19332166016101837, + "learning_rate": 7.798610330933593e-05, + "loss": 0.2813, + "step": 27630 + }, + { + "epoch": 1.2286082588789617, + "grad_norm": 0.19846129417419434, + "learning_rate": 7.796897344337922e-05, + "loss": 0.2795, + "step": 27640 + }, + { + "epoch": 1.2290527625905676, + "grad_norm": 0.2338293194770813, + "learning_rate": 7.795183879828989e-05, + "loss": 0.284, + "step": 27650 + }, + { + "epoch": 1.2294972663021737, + "grad_norm": 0.21832282841205597, + "learning_rate": 7.793469937699579e-05, + "loss": 0.2814, + "step": 27660 + }, + { + "epoch": 1.2299417700137796, + "grad_norm": 0.23262491822242737, + "learning_rate": 7.791755518242558e-05, + "loss": 0.2805, + "step": 27670 + }, + { + "epoch": 1.2303862737253857, + "grad_norm": 0.22413338720798492, + "learning_rate": 7.790040621750876e-05, + "loss": 0.2808, + "step": 27680 + }, + { + "epoch": 1.2308307774369915, + "grad_norm": 0.21474917232990265, + "learning_rate": 7.788325248517558e-05, + "loss": 0.2799, + "step": 27690 + }, + { + "epoch": 1.2312752811485976, + "grad_norm": 0.21290473639965057, + "learning_rate": 7.786609398835715e-05, + "loss": 0.28, + "step": 27700 + }, + { + "epoch": 1.2317197848602035, + "grad_norm": 0.2766728103160858, + "learning_rate": 7.784893072998541e-05, + "loss": 0.281, + "step": 27710 + }, + { + "epoch": 1.2321642885718096, + "grad_norm": 0.20005422830581665, + "learning_rate": 7.783176271299306e-05, + "loss": 0.2819, + "step": 27720 + }, + { + "epoch": 1.2326087922834157, + "grad_norm": 0.21344174444675446, + "learning_rate": 7.781458994031368e-05, + "loss": 0.2813, + "step": 27730 + }, + { + "epoch": 1.2330532959950216, + "grad_norm": 0.1885332614183426, + "learning_rate": 7.779741241488161e-05, + "loss": 0.2776, + "step": 27740 + }, + { + "epoch": 1.2334977997066274, + "grad_norm": 0.2222631424665451, + "learning_rate": 7.7780230139632e-05, + "loss": 0.2814, + "step": 27750 + }, + { + "epoch": 1.2339423034182335, + "grad_norm": 0.21800372004508972, + "learning_rate": 7.776304311750087e-05, + "loss": 0.281, + "step": 27760 + }, + { + "epoch": 1.2343868071298396, + "grad_norm": 0.1757761538028717, + "learning_rate": 7.7745851351425e-05, + "loss": 0.2803, + "step": 27770 + }, + { + "epoch": 1.2348313108414455, + "grad_norm": 0.18495149910449982, + "learning_rate": 7.772865484434197e-05, + "loss": 0.2784, + "step": 27780 + }, + { + "epoch": 1.2352758145530516, + "grad_norm": 0.18621520698070526, + "learning_rate": 7.77114535991902e-05, + "loss": 0.2814, + "step": 27790 + }, + { + "epoch": 1.2357203182646574, + "grad_norm": 0.21967393159866333, + "learning_rate": 7.769424761890893e-05, + "loss": 0.2768, + "step": 27800 + }, + { + "epoch": 1.2361648219762635, + "grad_norm": 0.21037133038043976, + "learning_rate": 7.767703690643817e-05, + "loss": 0.2817, + "step": 27810 + }, + { + "epoch": 1.2366093256878694, + "grad_norm": 0.19497838616371155, + "learning_rate": 7.76598214647188e-05, + "loss": 0.2795, + "step": 27820 + }, + { + "epoch": 1.2370538293994755, + "grad_norm": 0.1958761066198349, + "learning_rate": 7.764260129669241e-05, + "loss": 0.2777, + "step": 27830 + }, + { + "epoch": 1.2374983331110814, + "grad_norm": 0.18634505569934845, + "learning_rate": 7.76253764053015e-05, + "loss": 0.2809, + "step": 27840 + }, + { + "epoch": 1.2379428368226875, + "grad_norm": 0.19127462804317474, + "learning_rate": 7.760814679348932e-05, + "loss": 0.2773, + "step": 27850 + }, + { + "epoch": 1.2383873405342936, + "grad_norm": 0.16369111835956573, + "learning_rate": 7.759091246419992e-05, + "loss": 0.2798, + "step": 27860 + }, + { + "epoch": 1.2388318442458994, + "grad_norm": 0.16851256787776947, + "learning_rate": 7.75736734203782e-05, + "loss": 0.2776, + "step": 27870 + }, + { + "epoch": 1.2392763479575055, + "grad_norm": 0.14229382574558258, + "learning_rate": 7.755642966496985e-05, + "loss": 0.2794, + "step": 27880 + }, + { + "epoch": 1.2397208516691114, + "grad_norm": 0.18582682311534882, + "learning_rate": 7.753918120092132e-05, + "loss": 0.2803, + "step": 27890 + }, + { + "epoch": 1.2401653553807175, + "grad_norm": 0.18137171864509583, + "learning_rate": 7.752192803117993e-05, + "loss": 0.2797, + "step": 27900 + }, + { + "epoch": 1.2406098590923234, + "grad_norm": 0.2143506407737732, + "learning_rate": 7.750467015869377e-05, + "loss": 0.2813, + "step": 27910 + }, + { + "epoch": 1.2410543628039294, + "grad_norm": 0.2079656720161438, + "learning_rate": 7.748740758641174e-05, + "loss": 0.2795, + "step": 27920 + }, + { + "epoch": 1.2414988665155353, + "grad_norm": 0.1959439218044281, + "learning_rate": 7.74701403172835e-05, + "loss": 0.2815, + "step": 27930 + }, + { + "epoch": 1.2419433702271414, + "grad_norm": 0.16628684103488922, + "learning_rate": 7.745286835425962e-05, + "loss": 0.2798, + "step": 27940 + }, + { + "epoch": 1.2423878739387475, + "grad_norm": 0.17392604053020477, + "learning_rate": 7.743559170029138e-05, + "loss": 0.2801, + "step": 27950 + }, + { + "epoch": 1.2428323776503534, + "grad_norm": 0.1990216225385666, + "learning_rate": 7.741831035833087e-05, + "loss": 0.2814, + "step": 27960 + }, + { + "epoch": 1.2432768813619595, + "grad_norm": 0.24279065430164337, + "learning_rate": 7.740102433133102e-05, + "loss": 0.2791, + "step": 27970 + }, + { + "epoch": 1.2437213850735653, + "grad_norm": 0.15975907444953918, + "learning_rate": 7.738373362224553e-05, + "loss": 0.2827, + "step": 27980 + }, + { + "epoch": 1.2441658887851714, + "grad_norm": 0.2114836871623993, + "learning_rate": 7.73664382340289e-05, + "loss": 0.2786, + "step": 27990 + }, + { + "epoch": 1.2446103924967773, + "grad_norm": 0.18022334575653076, + "learning_rate": 7.734913816963647e-05, + "loss": 0.2766, + "step": 28000 + }, + { + "epoch": 1.2450548962083834, + "grad_norm": 0.18195219337940216, + "learning_rate": 7.73318334320243e-05, + "loss": 0.2782, + "step": 28010 + }, + { + "epoch": 1.2454993999199893, + "grad_norm": 0.2124672830104828, + "learning_rate": 7.731452402414934e-05, + "loss": 0.2808, + "step": 28020 + }, + { + "epoch": 1.2459439036315954, + "grad_norm": 0.18415603041648865, + "learning_rate": 7.729720994896928e-05, + "loss": 0.281, + "step": 28030 + }, + { + "epoch": 1.2463884073432012, + "grad_norm": 0.20073655247688293, + "learning_rate": 7.727989120944262e-05, + "loss": 0.2812, + "step": 28040 + }, + { + "epoch": 1.2468329110548073, + "grad_norm": 0.21049976348876953, + "learning_rate": 7.726256780852865e-05, + "loss": 0.2765, + "step": 28050 + }, + { + "epoch": 1.2472774147664132, + "grad_norm": 0.19667507708072662, + "learning_rate": 7.724523974918749e-05, + "loss": 0.2774, + "step": 28060 + }, + { + "epoch": 1.2477219184780193, + "grad_norm": 0.18787164986133575, + "learning_rate": 7.722790703438002e-05, + "loss": 0.2801, + "step": 28070 + }, + { + "epoch": 1.2481664221896254, + "grad_norm": 0.19503508508205414, + "learning_rate": 7.72105696670679e-05, + "loss": 0.2789, + "step": 28080 + }, + { + "epoch": 1.2486109259012312, + "grad_norm": 0.16564421355724335, + "learning_rate": 7.719322765021364e-05, + "loss": 0.2801, + "step": 28090 + }, + { + "epoch": 1.2490554296128373, + "grad_norm": 0.22444839775562286, + "learning_rate": 7.717588098678051e-05, + "loss": 0.2822, + "step": 28100 + }, + { + "epoch": 1.2494999333244432, + "grad_norm": 0.18032948672771454, + "learning_rate": 7.715852967973258e-05, + "loss": 0.2803, + "step": 28110 + }, + { + "epoch": 1.2499444370360493, + "grad_norm": 0.19629709422588348, + "learning_rate": 7.714117373203474e-05, + "loss": 0.2811, + "step": 28120 + }, + { + "epoch": 1.2503889407476552, + "grad_norm": 0.154670849442482, + "learning_rate": 7.712381314665259e-05, + "loss": 0.2794, + "step": 28130 + }, + { + "epoch": 1.2508334444592613, + "grad_norm": 0.17922428250312805, + "learning_rate": 7.710644792655261e-05, + "loss": 0.2792, + "step": 28140 + }, + { + "epoch": 1.2512779481708671, + "grad_norm": 0.15903495252132416, + "learning_rate": 7.708907807470207e-05, + "loss": 0.2802, + "step": 28150 + }, + { + "epoch": 1.2517224518824732, + "grad_norm": 0.1782655268907547, + "learning_rate": 7.707170359406896e-05, + "loss": 0.2789, + "step": 28160 + }, + { + "epoch": 1.2521669555940793, + "grad_norm": 0.19964627921581268, + "learning_rate": 7.705432448762213e-05, + "loss": 0.2799, + "step": 28170 + }, + { + "epoch": 1.2526114593056852, + "grad_norm": 0.1990969479084015, + "learning_rate": 7.703694075833117e-05, + "loss": 0.2795, + "step": 28180 + }, + { + "epoch": 1.2530559630172913, + "grad_norm": 0.17329110205173492, + "learning_rate": 7.70195524091665e-05, + "loss": 0.2772, + "step": 28190 + }, + { + "epoch": 1.2535004667288971, + "grad_norm": 0.2176048457622528, + "learning_rate": 7.70021594430993e-05, + "loss": 0.2815, + "step": 28200 + }, + { + "epoch": 1.2539449704405032, + "grad_norm": 0.22137883305549622, + "learning_rate": 7.698476186310157e-05, + "loss": 0.2798, + "step": 28210 + }, + { + "epoch": 1.254389474152109, + "grad_norm": 0.2275976687669754, + "learning_rate": 7.696735967214608e-05, + "loss": 0.2805, + "step": 28220 + }, + { + "epoch": 1.2548339778637152, + "grad_norm": 0.2279648780822754, + "learning_rate": 7.694995287320636e-05, + "loss": 0.2818, + "step": 28230 + }, + { + "epoch": 1.255278481575321, + "grad_norm": 0.193085715174675, + "learning_rate": 7.693254146925679e-05, + "loss": 0.2815, + "step": 28240 + }, + { + "epoch": 1.2557229852869272, + "grad_norm": 0.1924104392528534, + "learning_rate": 7.691512546327251e-05, + "loss": 0.2812, + "step": 28250 + }, + { + "epoch": 1.2561674889985333, + "grad_norm": 0.21754419803619385, + "learning_rate": 7.689770485822939e-05, + "loss": 0.2782, + "step": 28260 + }, + { + "epoch": 1.2566119927101391, + "grad_norm": 0.27264171838760376, + "learning_rate": 7.688027965710416e-05, + "loss": 0.2799, + "step": 28270 + }, + { + "epoch": 1.257056496421745, + "grad_norm": 0.23016709089279175, + "learning_rate": 7.686284986287433e-05, + "loss": 0.2794, + "step": 28280 + }, + { + "epoch": 1.257501000133351, + "grad_norm": 0.18997398018836975, + "learning_rate": 7.684541547851817e-05, + "loss": 0.2805, + "step": 28290 + }, + { + "epoch": 1.2579455038449572, + "grad_norm": 0.1799359768629074, + "learning_rate": 7.68279765070147e-05, + "loss": 0.2781, + "step": 28300 + }, + { + "epoch": 1.258390007556563, + "grad_norm": 0.196841761469841, + "learning_rate": 7.68105329513438e-05, + "loss": 0.2791, + "step": 28310 + }, + { + "epoch": 1.2588345112681691, + "grad_norm": 0.18237562477588654, + "learning_rate": 7.67930848144861e-05, + "loss": 0.2793, + "step": 28320 + }, + { + "epoch": 1.259279014979775, + "grad_norm": 0.18253116309642792, + "learning_rate": 7.6775632099423e-05, + "loss": 0.2826, + "step": 28330 + }, + { + "epoch": 1.259723518691381, + "grad_norm": 0.17919544875621796, + "learning_rate": 7.675817480913667e-05, + "loss": 0.2794, + "step": 28340 + }, + { + "epoch": 1.2601680224029872, + "grad_norm": 0.18478180468082428, + "learning_rate": 7.674071294661011e-05, + "loss": 0.2816, + "step": 28350 + }, + { + "epoch": 1.260612526114593, + "grad_norm": 0.21419933438301086, + "learning_rate": 7.672324651482707e-05, + "loss": 0.2833, + "step": 28360 + }, + { + "epoch": 1.261057029826199, + "grad_norm": 0.1790054738521576, + "learning_rate": 7.670577551677209e-05, + "loss": 0.276, + "step": 28370 + }, + { + "epoch": 1.261501533537805, + "grad_norm": 0.20162643492221832, + "learning_rate": 7.668829995543047e-05, + "loss": 0.2798, + "step": 28380 + }, + { + "epoch": 1.2619460372494111, + "grad_norm": 0.17963674664497375, + "learning_rate": 7.667081983378832e-05, + "loss": 0.278, + "step": 28390 + }, + { + "epoch": 1.262390540961017, + "grad_norm": 0.19990147650241852, + "learning_rate": 7.66533351548325e-05, + "loss": 0.2801, + "step": 28400 + }, + { + "epoch": 1.262835044672623, + "grad_norm": 0.23404774069786072, + "learning_rate": 7.663584592155069e-05, + "loss": 0.2815, + "step": 28410 + }, + { + "epoch": 1.263279548384229, + "grad_norm": 0.19924990832805634, + "learning_rate": 7.661835213693129e-05, + "loss": 0.277, + "step": 28420 + }, + { + "epoch": 1.263724052095835, + "grad_norm": 0.18571241199970245, + "learning_rate": 7.660085380396353e-05, + "loss": 0.2817, + "step": 28430 + }, + { + "epoch": 1.264168555807441, + "grad_norm": 0.18984375894069672, + "learning_rate": 7.658335092563738e-05, + "loss": 0.2801, + "step": 28440 + }, + { + "epoch": 1.264613059519047, + "grad_norm": 0.16053147614002228, + "learning_rate": 7.656584350494362e-05, + "loss": 0.2779, + "step": 28450 + }, + { + "epoch": 1.2650575632306529, + "grad_norm": 0.17296123504638672, + "learning_rate": 7.654833154487378e-05, + "loss": 0.2797, + "step": 28460 + }, + { + "epoch": 1.265502066942259, + "grad_norm": 0.19944000244140625, + "learning_rate": 7.653081504842017e-05, + "loss": 0.2779, + "step": 28470 + }, + { + "epoch": 1.265946570653865, + "grad_norm": 0.21091806888580322, + "learning_rate": 7.65132940185759e-05, + "loss": 0.2803, + "step": 28480 + }, + { + "epoch": 1.266391074365471, + "grad_norm": 0.1991644650697708, + "learning_rate": 7.649576845833481e-05, + "loss": 0.2825, + "step": 28490 + }, + { + "epoch": 1.2668355780770768, + "grad_norm": 0.21186868846416473, + "learning_rate": 7.647823837069156e-05, + "loss": 0.2798, + "step": 28500 + }, + { + "epoch": 1.267280081788683, + "grad_norm": 0.20658369362354279, + "learning_rate": 7.646070375864156e-05, + "loss": 0.277, + "step": 28510 + }, + { + "epoch": 1.267724585500289, + "grad_norm": 0.18001332879066467, + "learning_rate": 7.644316462518099e-05, + "loss": 0.2791, + "step": 28520 + }, + { + "epoch": 1.2681690892118949, + "grad_norm": 0.19912093877792358, + "learning_rate": 7.642562097330679e-05, + "loss": 0.2812, + "step": 28530 + }, + { + "epoch": 1.268613592923501, + "grad_norm": 0.18071183562278748, + "learning_rate": 7.640807280601671e-05, + "loss": 0.2803, + "step": 28540 + }, + { + "epoch": 1.2690580966351068, + "grad_norm": 0.20545795559883118, + "learning_rate": 7.639052012630927e-05, + "loss": 0.2793, + "step": 28550 + }, + { + "epoch": 1.269502600346713, + "grad_norm": 0.16668088734149933, + "learning_rate": 7.63729629371837e-05, + "loss": 0.2778, + "step": 28560 + }, + { + "epoch": 1.269947104058319, + "grad_norm": 0.15887175500392914, + "learning_rate": 7.635540124164009e-05, + "loss": 0.2805, + "step": 28570 + }, + { + "epoch": 1.2703916077699249, + "grad_norm": 0.1835402548313141, + "learning_rate": 7.633783504267922e-05, + "loss": 0.281, + "step": 28580 + }, + { + "epoch": 1.2708361114815308, + "grad_norm": 0.15970632433891296, + "learning_rate": 7.632026434330269e-05, + "loss": 0.2788, + "step": 28590 + }, + { + "epoch": 1.2712806151931368, + "grad_norm": 0.21131499111652374, + "learning_rate": 7.630268914651282e-05, + "loss": 0.2838, + "step": 28600 + }, + { + "epoch": 1.271725118904743, + "grad_norm": 0.18620087206363678, + "learning_rate": 7.628510945531278e-05, + "loss": 0.2802, + "step": 28610 + }, + { + "epoch": 1.2721696226163488, + "grad_norm": 0.2188059687614441, + "learning_rate": 7.626752527270641e-05, + "loss": 0.2856, + "step": 28620 + }, + { + "epoch": 1.272614126327955, + "grad_norm": 0.18233366310596466, + "learning_rate": 7.62499366016984e-05, + "loss": 0.279, + "step": 28630 + }, + { + "epoch": 1.2730586300395608, + "grad_norm": 0.20789727568626404, + "learning_rate": 7.623234344529416e-05, + "loss": 0.2824, + "step": 28640 + }, + { + "epoch": 1.2735031337511669, + "grad_norm": 0.2459554225206375, + "learning_rate": 7.62147458064999e-05, + "loss": 0.2807, + "step": 28650 + }, + { + "epoch": 1.273947637462773, + "grad_norm": 0.19872713088989258, + "learning_rate": 7.619714368832254e-05, + "loss": 0.2787, + "step": 28660 + }, + { + "epoch": 1.2743921411743788, + "grad_norm": 0.19545434415340424, + "learning_rate": 7.61795370937698e-05, + "loss": 0.2808, + "step": 28670 + }, + { + "epoch": 1.2748366448859847, + "grad_norm": 0.21004855632781982, + "learning_rate": 7.61619260258502e-05, + "loss": 0.2776, + "step": 28680 + }, + { + "epoch": 1.2752811485975908, + "grad_norm": 0.1599510908126831, + "learning_rate": 7.614431048757298e-05, + "loss": 0.2804, + "step": 28690 + }, + { + "epoch": 1.2757256523091969, + "grad_norm": 0.21134307980537415, + "learning_rate": 7.612669048194814e-05, + "loss": 0.2804, + "step": 28700 + }, + { + "epoch": 1.2761701560208027, + "grad_norm": 0.2113795280456543, + "learning_rate": 7.610906601198646e-05, + "loss": 0.2783, + "step": 28710 + }, + { + "epoch": 1.2766146597324088, + "grad_norm": 0.18654176592826843, + "learning_rate": 7.60914370806995e-05, + "loss": 0.2801, + "step": 28720 + }, + { + "epoch": 1.2770591634440147, + "grad_norm": 0.18445177376270294, + "learning_rate": 7.607380369109953e-05, + "loss": 0.2786, + "step": 28730 + }, + { + "epoch": 1.2775036671556208, + "grad_norm": 0.18421193957328796, + "learning_rate": 7.605616584619961e-05, + "loss": 0.281, + "step": 28740 + }, + { + "epoch": 1.2779481708672267, + "grad_norm": 0.2333691418170929, + "learning_rate": 7.603852354901362e-05, + "loss": 0.2799, + "step": 28750 + }, + { + "epoch": 1.2783926745788328, + "grad_norm": 0.22063294053077698, + "learning_rate": 7.602087680255609e-05, + "loss": 0.2822, + "step": 28760 + }, + { + "epoch": 1.2788371782904386, + "grad_norm": 0.2287117838859558, + "learning_rate": 7.600322560984238e-05, + "loss": 0.2804, + "step": 28770 + }, + { + "epoch": 1.2792816820020447, + "grad_norm": 0.22997911274433136, + "learning_rate": 7.598556997388863e-05, + "loss": 0.2812, + "step": 28780 + }, + { + "epoch": 1.2797261857136508, + "grad_norm": 0.2110718935728073, + "learning_rate": 7.596790989771166e-05, + "loss": 0.2832, + "step": 28790 + }, + { + "epoch": 1.2801706894252567, + "grad_norm": 0.1918289065361023, + "learning_rate": 7.595024538432914e-05, + "loss": 0.2791, + "step": 28800 + }, + { + "epoch": 1.2806151931368626, + "grad_norm": 0.1977507621049881, + "learning_rate": 7.59325764367594e-05, + "loss": 0.2761, + "step": 28810 + }, + { + "epoch": 1.2810596968484687, + "grad_norm": 0.22768867015838623, + "learning_rate": 7.59149030580216e-05, + "loss": 0.2799, + "step": 28820 + }, + { + "epoch": 1.2815042005600747, + "grad_norm": 0.19284170866012573, + "learning_rate": 7.589722525113562e-05, + "loss": 0.2813, + "step": 28830 + }, + { + "epoch": 1.2819487042716806, + "grad_norm": 0.18934719264507294, + "learning_rate": 7.587954301912216e-05, + "loss": 0.2793, + "step": 28840 + }, + { + "epoch": 1.2823932079832867, + "grad_norm": 0.20214338600635529, + "learning_rate": 7.586185636500263e-05, + "loss": 0.2824, + "step": 28850 + }, + { + "epoch": 1.2828377116948926, + "grad_norm": 0.18550679087638855, + "learning_rate": 7.584416529179914e-05, + "loss": 0.2782, + "step": 28860 + }, + { + "epoch": 1.2832822154064987, + "grad_norm": 0.19994661211967468, + "learning_rate": 7.582646980253465e-05, + "loss": 0.2789, + "step": 28870 + }, + { + "epoch": 1.2837267191181048, + "grad_norm": 0.18046316504478455, + "learning_rate": 7.580876990023282e-05, + "loss": 0.2778, + "step": 28880 + }, + { + "epoch": 1.2841712228297106, + "grad_norm": 0.1984882652759552, + "learning_rate": 7.579106558791809e-05, + "loss": 0.2797, + "step": 28890 + }, + { + "epoch": 1.2846157265413165, + "grad_norm": 0.2032884657382965, + "learning_rate": 7.577335686861565e-05, + "loss": 0.2788, + "step": 28900 + }, + { + "epoch": 1.2850602302529226, + "grad_norm": 0.21261031925678253, + "learning_rate": 7.575564374535141e-05, + "loss": 0.2757, + "step": 28910 + }, + { + "epoch": 1.2855047339645287, + "grad_norm": 0.1989171952009201, + "learning_rate": 7.573792622115207e-05, + "loss": 0.28, + "step": 28920 + }, + { + "epoch": 1.2859492376761346, + "grad_norm": 0.19559605419635773, + "learning_rate": 7.572020429904507e-05, + "loss": 0.2792, + "step": 28930 + }, + { + "epoch": 1.2863937413877407, + "grad_norm": 0.22921375930309296, + "learning_rate": 7.570247798205861e-05, + "loss": 0.2786, + "step": 28940 + }, + { + "epoch": 1.2868382450993465, + "grad_norm": 0.18414011597633362, + "learning_rate": 7.568474727322164e-05, + "loss": 0.2787, + "step": 28950 + }, + { + "epoch": 1.2872827488109526, + "grad_norm": 0.18535926938056946, + "learning_rate": 7.566701217556384e-05, + "loss": 0.2792, + "step": 28960 + }, + { + "epoch": 1.2877272525225585, + "grad_norm": 0.2073618769645691, + "learning_rate": 7.564927269211564e-05, + "loss": 0.2767, + "step": 28970 + }, + { + "epoch": 1.2881717562341646, + "grad_norm": 0.201123908162117, + "learning_rate": 7.563152882590824e-05, + "loss": 0.2797, + "step": 28980 + }, + { + "epoch": 1.2886162599457704, + "grad_norm": 0.20167917013168335, + "learning_rate": 7.56137805799736e-05, + "loss": 0.2815, + "step": 28990 + }, + { + "epoch": 1.2890607636573765, + "grad_norm": 0.2466711550951004, + "learning_rate": 7.559602795734439e-05, + "loss": 0.2773, + "step": 29000 + }, + { + "epoch": 1.2895052673689826, + "grad_norm": 0.20390425622463226, + "learning_rate": 7.557827096105408e-05, + "loss": 0.2792, + "step": 29010 + }, + { + "epoch": 1.2899497710805885, + "grad_norm": 0.19924631714820862, + "learning_rate": 7.55605095941368e-05, + "loss": 0.2822, + "step": 29020 + }, + { + "epoch": 1.2903942747921946, + "grad_norm": 0.20401982963085175, + "learning_rate": 7.554274385962752e-05, + "loss": 0.277, + "step": 29030 + }, + { + "epoch": 1.2908387785038005, + "grad_norm": 0.1998802274465561, + "learning_rate": 7.552497376056191e-05, + "loss": 0.2812, + "step": 29040 + }, + { + "epoch": 1.2912832822154066, + "grad_norm": 0.18225093185901642, + "learning_rate": 7.550719929997639e-05, + "loss": 0.277, + "step": 29050 + }, + { + "epoch": 1.2917277859270124, + "grad_norm": 0.2094736397266388, + "learning_rate": 7.548942048090813e-05, + "loss": 0.2804, + "step": 29060 + }, + { + "epoch": 1.2921722896386185, + "grad_norm": 0.17970599234104156, + "learning_rate": 7.547163730639506e-05, + "loss": 0.2786, + "step": 29070 + }, + { + "epoch": 1.2926167933502244, + "grad_norm": 0.23119544982910156, + "learning_rate": 7.545384977947583e-05, + "loss": 0.279, + "step": 29080 + }, + { + "epoch": 1.2930612970618305, + "grad_norm": 0.28179073333740234, + "learning_rate": 7.543605790318981e-05, + "loss": 0.2831, + "step": 29090 + }, + { + "epoch": 1.2935058007734366, + "grad_norm": 0.21838819980621338, + "learning_rate": 7.54182616805772e-05, + "loss": 0.2779, + "step": 29100 + }, + { + "epoch": 1.2939503044850424, + "grad_norm": 0.21566765010356903, + "learning_rate": 7.540046111467885e-05, + "loss": 0.2815, + "step": 29110 + }, + { + "epoch": 1.2943948081966483, + "grad_norm": 0.21021370589733124, + "learning_rate": 7.53826562085364e-05, + "loss": 0.2818, + "step": 29120 + }, + { + "epoch": 1.2948393119082544, + "grad_norm": 0.21445414423942566, + "learning_rate": 7.536484696519221e-05, + "loss": 0.2792, + "step": 29130 + }, + { + "epoch": 1.2952838156198605, + "grad_norm": 0.2012384533882141, + "learning_rate": 7.534703338768942e-05, + "loss": 0.2752, + "step": 29140 + }, + { + "epoch": 1.2957283193314664, + "grad_norm": 0.20263488590717316, + "learning_rate": 7.532921547907185e-05, + "loss": 0.277, + "step": 29150 + }, + { + "epoch": 1.2961728230430725, + "grad_norm": 0.22964896261692047, + "learning_rate": 7.531139324238412e-05, + "loss": 0.2819, + "step": 29160 + }, + { + "epoch": 1.2966173267546783, + "grad_norm": 0.2500708997249603, + "learning_rate": 7.529356668067157e-05, + "loss": 0.2806, + "step": 29170 + }, + { + "epoch": 1.2970618304662844, + "grad_norm": 0.21423521637916565, + "learning_rate": 7.527573579698023e-05, + "loss": 0.2787, + "step": 29180 + }, + { + "epoch": 1.2975063341778905, + "grad_norm": 0.2369096577167511, + "learning_rate": 7.525790059435693e-05, + "loss": 0.2761, + "step": 29190 + }, + { + "epoch": 1.2979508378894964, + "grad_norm": 0.22224977612495422, + "learning_rate": 7.524006107584926e-05, + "loss": 0.2827, + "step": 29200 + }, + { + "epoch": 1.2983953416011023, + "grad_norm": 0.18308626115322113, + "learning_rate": 7.522221724450544e-05, + "loss": 0.2805, + "step": 29210 + }, + { + "epoch": 1.2988398453127084, + "grad_norm": 0.18870723247528076, + "learning_rate": 7.520436910337451e-05, + "loss": 0.2827, + "step": 29220 + }, + { + "epoch": 1.2992843490243144, + "grad_norm": 0.23381249606609344, + "learning_rate": 7.518651665550627e-05, + "loss": 0.279, + "step": 29230 + }, + { + "epoch": 1.2997288527359203, + "grad_norm": 0.18610922992229462, + "learning_rate": 7.516865990395117e-05, + "loss": 0.2769, + "step": 29240 + }, + { + "epoch": 1.3001733564475264, + "grad_norm": 0.1919599175453186, + "learning_rate": 7.515079885176047e-05, + "loss": 0.2797, + "step": 29250 + }, + { + "epoch": 1.3006178601591323, + "grad_norm": 0.1624281257390976, + "learning_rate": 7.513293350198612e-05, + "loss": 0.2784, + "step": 29260 + }, + { + "epoch": 1.3010623638707384, + "grad_norm": 0.18522217869758606, + "learning_rate": 7.511506385768081e-05, + "loss": 0.2781, + "step": 29270 + }, + { + "epoch": 1.3015068675823442, + "grad_norm": 0.1768706738948822, + "learning_rate": 7.509718992189801e-05, + "loss": 0.2782, + "step": 29280 + }, + { + "epoch": 1.3019513712939503, + "grad_norm": 0.21947632730007172, + "learning_rate": 7.507931169769182e-05, + "loss": 0.2828, + "step": 29290 + }, + { + "epoch": 1.3023958750055562, + "grad_norm": 0.2173905074596405, + "learning_rate": 7.506142918811722e-05, + "loss": 0.2816, + "step": 29300 + }, + { + "epoch": 1.3028403787171623, + "grad_norm": 0.2248533070087433, + "learning_rate": 7.504354239622978e-05, + "loss": 0.2798, + "step": 29310 + }, + { + "epoch": 1.3032848824287684, + "grad_norm": 0.2041567713022232, + "learning_rate": 7.50256513250859e-05, + "loss": 0.2822, + "step": 29320 + }, + { + "epoch": 1.3037293861403743, + "grad_norm": 0.21536897122859955, + "learning_rate": 7.500775597774265e-05, + "loss": 0.2797, + "step": 29330 + }, + { + "epoch": 1.3041738898519801, + "grad_norm": 0.20372825860977173, + "learning_rate": 7.498985635725788e-05, + "loss": 0.2797, + "step": 29340 + }, + { + "epoch": 1.3046183935635862, + "grad_norm": 0.2237229198217392, + "learning_rate": 7.497195246669012e-05, + "loss": 0.2801, + "step": 29350 + }, + { + "epoch": 1.3050628972751923, + "grad_norm": 0.15716376900672913, + "learning_rate": 7.495404430909868e-05, + "loss": 0.2808, + "step": 29360 + }, + { + "epoch": 1.3055074009867982, + "grad_norm": 0.1996854990720749, + "learning_rate": 7.493613188754356e-05, + "loss": 0.2782, + "step": 29370 + }, + { + "epoch": 1.3059519046984043, + "grad_norm": 0.18560580909252167, + "learning_rate": 7.49182152050855e-05, + "loss": 0.2799, + "step": 29380 + }, + { + "epoch": 1.3063964084100101, + "grad_norm": 0.21885457634925842, + "learning_rate": 7.490029426478598e-05, + "loss": 0.2798, + "step": 29390 + }, + { + "epoch": 1.3068409121216162, + "grad_norm": 0.2003796398639679, + "learning_rate": 7.488236906970719e-05, + "loss": 0.2784, + "step": 29400 + }, + { + "epoch": 1.3072854158332223, + "grad_norm": 0.1940920501947403, + "learning_rate": 7.486443962291207e-05, + "loss": 0.2783, + "step": 29410 + }, + { + "epoch": 1.3077299195448282, + "grad_norm": 0.22013871371746063, + "learning_rate": 7.484650592746424e-05, + "loss": 0.2822, + "step": 29420 + }, + { + "epoch": 1.308174423256434, + "grad_norm": 0.18745523691177368, + "learning_rate": 7.482856798642811e-05, + "loss": 0.2779, + "step": 29430 + }, + { + "epoch": 1.3086189269680402, + "grad_norm": 0.22527176141738892, + "learning_rate": 7.481062580286878e-05, + "loss": 0.2767, + "step": 29440 + }, + { + "epoch": 1.3090634306796463, + "grad_norm": 0.22211609780788422, + "learning_rate": 7.479267937985208e-05, + "loss": 0.2801, + "step": 29450 + }, + { + "epoch": 1.3095079343912521, + "grad_norm": 0.17457923293113708, + "learning_rate": 7.477472872044456e-05, + "loss": 0.2811, + "step": 29460 + }, + { + "epoch": 1.3099524381028582, + "grad_norm": 0.17032849788665771, + "learning_rate": 7.475677382771347e-05, + "loss": 0.2762, + "step": 29470 + }, + { + "epoch": 1.310396941814464, + "grad_norm": 0.18676045536994934, + "learning_rate": 7.473881470472683e-05, + "loss": 0.2794, + "step": 29480 + }, + { + "epoch": 1.3108414455260702, + "grad_norm": 0.18955031037330627, + "learning_rate": 7.472085135455337e-05, + "loss": 0.278, + "step": 29490 + }, + { + "epoch": 1.311285949237676, + "grad_norm": 0.1747664213180542, + "learning_rate": 7.470288378026256e-05, + "loss": 0.278, + "step": 29500 + }, + { + "epoch": 1.3117304529492821, + "grad_norm": 0.2110908329486847, + "learning_rate": 7.468491198492451e-05, + "loss": 0.277, + "step": 29510 + }, + { + "epoch": 1.312174956660888, + "grad_norm": 0.1955411285161972, + "learning_rate": 7.466693597161013e-05, + "loss": 0.2779, + "step": 29520 + }, + { + "epoch": 1.312619460372494, + "grad_norm": 0.19566112756729126, + "learning_rate": 7.464895574339104e-05, + "loss": 0.2783, + "step": 29530 + }, + { + "epoch": 1.3130639640841002, + "grad_norm": 0.17427660524845123, + "learning_rate": 7.463097130333958e-05, + "loss": 0.2737, + "step": 29540 + }, + { + "epoch": 1.313508467795706, + "grad_norm": 0.16555845737457275, + "learning_rate": 7.461298265452876e-05, + "loss": 0.2775, + "step": 29550 + }, + { + "epoch": 1.3139529715073122, + "grad_norm": 0.17223034799098969, + "learning_rate": 7.459498980003239e-05, + "loss": 0.2798, + "step": 29560 + }, + { + "epoch": 1.314397475218918, + "grad_norm": 0.1765144318342209, + "learning_rate": 7.457699274292493e-05, + "loss": 0.279, + "step": 29570 + }, + { + "epoch": 1.3148419789305241, + "grad_norm": 0.2023237645626068, + "learning_rate": 7.455899148628159e-05, + "loss": 0.2814, + "step": 29580 + }, + { + "epoch": 1.31528648264213, + "grad_norm": 0.18216362595558167, + "learning_rate": 7.45409860331783e-05, + "loss": 0.2807, + "step": 29590 + }, + { + "epoch": 1.315730986353736, + "grad_norm": 0.1807079166173935, + "learning_rate": 7.452297638669169e-05, + "loss": 0.279, + "step": 29600 + }, + { + "epoch": 1.316175490065342, + "grad_norm": 0.21060575544834137, + "learning_rate": 7.450496254989911e-05, + "loss": 0.281, + "step": 29610 + }, + { + "epoch": 1.316619993776948, + "grad_norm": 0.18998950719833374, + "learning_rate": 7.448694452587866e-05, + "loss": 0.2735, + "step": 29620 + }, + { + "epoch": 1.3170644974885541, + "grad_norm": 0.1897488832473755, + "learning_rate": 7.44689223177091e-05, + "loss": 0.2777, + "step": 29630 + }, + { + "epoch": 1.31750900120016, + "grad_norm": 0.26631247997283936, + "learning_rate": 7.445089592846994e-05, + "loss": 0.2807, + "step": 29640 + }, + { + "epoch": 1.3179535049117659, + "grad_norm": 0.2063828408718109, + "learning_rate": 7.443286536124141e-05, + "loss": 0.2806, + "step": 29650 + }, + { + "epoch": 1.318398008623372, + "grad_norm": 0.20439384877681732, + "learning_rate": 7.441483061910443e-05, + "loss": 0.28, + "step": 29660 + }, + { + "epoch": 1.318842512334978, + "grad_norm": 0.18164485692977905, + "learning_rate": 7.439679170514064e-05, + "loss": 0.279, + "step": 29670 + }, + { + "epoch": 1.319287016046584, + "grad_norm": 0.19599807262420654, + "learning_rate": 7.43787486224324e-05, + "loss": 0.2784, + "step": 29680 + }, + { + "epoch": 1.31973151975819, + "grad_norm": 0.17722928524017334, + "learning_rate": 7.436070137406276e-05, + "loss": 0.2803, + "step": 29690 + }, + { + "epoch": 1.320176023469796, + "grad_norm": 0.175860196352005, + "learning_rate": 7.434264996311556e-05, + "loss": 0.2762, + "step": 29700 + }, + { + "epoch": 1.320620527181402, + "grad_norm": 0.20774294435977936, + "learning_rate": 7.432459439267525e-05, + "loss": 0.2808, + "step": 29710 + }, + { + "epoch": 1.321065030893008, + "grad_norm": 0.2028816193342209, + "learning_rate": 7.430653466582701e-05, + "loss": 0.278, + "step": 29720 + }, + { + "epoch": 1.321509534604614, + "grad_norm": 0.227676659822464, + "learning_rate": 7.42884707856568e-05, + "loss": 0.2791, + "step": 29730 + }, + { + "epoch": 1.3219540383162198, + "grad_norm": 0.20702506601810455, + "learning_rate": 7.427040275525122e-05, + "loss": 0.2798, + "step": 29740 + }, + { + "epoch": 1.322398542027826, + "grad_norm": 0.186300128698349, + "learning_rate": 7.42523305776976e-05, + "loss": 0.2799, + "step": 29750 + }, + { + "epoch": 1.322843045739432, + "grad_norm": 0.227384552359581, + "learning_rate": 7.4234254256084e-05, + "loss": 0.278, + "step": 29760 + }, + { + "epoch": 1.3232875494510379, + "grad_norm": 0.1761137694120407, + "learning_rate": 7.421617379349915e-05, + "loss": 0.276, + "step": 29770 + }, + { + "epoch": 1.323732053162644, + "grad_norm": 0.19208505749702454, + "learning_rate": 7.41980891930325e-05, + "loss": 0.2782, + "step": 29780 + }, + { + "epoch": 1.3241765568742498, + "grad_norm": 0.18963655829429626, + "learning_rate": 7.418000045777425e-05, + "loss": 0.2777, + "step": 29790 + }, + { + "epoch": 1.324621060585856, + "grad_norm": 0.2061968296766281, + "learning_rate": 7.416190759081523e-05, + "loss": 0.2803, + "step": 29800 + }, + { + "epoch": 1.3250655642974618, + "grad_norm": 0.1882597953081131, + "learning_rate": 7.414381059524704e-05, + "loss": 0.2816, + "step": 29810 + }, + { + "epoch": 1.325510068009068, + "grad_norm": 0.2013653963804245, + "learning_rate": 7.412570947416195e-05, + "loss": 0.281, + "step": 29820 + }, + { + "epoch": 1.3259545717206738, + "grad_norm": 0.18219168484210968, + "learning_rate": 7.410760423065295e-05, + "loss": 0.2785, + "step": 29830 + }, + { + "epoch": 1.3263990754322799, + "grad_norm": 0.1917395144701004, + "learning_rate": 7.408949486781372e-05, + "loss": 0.2774, + "step": 29840 + }, + { + "epoch": 1.326843579143886, + "grad_norm": 0.1795857846736908, + "learning_rate": 7.407138138873868e-05, + "loss": 0.2756, + "step": 29850 + }, + { + "epoch": 1.3272880828554918, + "grad_norm": 0.2134808450937271, + "learning_rate": 7.405326379652292e-05, + "loss": 0.2787, + "step": 29860 + }, + { + "epoch": 1.327732586567098, + "grad_norm": 0.22174157202243805, + "learning_rate": 7.403514209426222e-05, + "loss": 0.2805, + "step": 29870 + }, + { + "epoch": 1.3281770902787038, + "grad_norm": 0.23829123377799988, + "learning_rate": 7.40170162850531e-05, + "loss": 0.2754, + "step": 29880 + }, + { + "epoch": 1.3286215939903099, + "grad_norm": 0.16666629910469055, + "learning_rate": 7.399888637199278e-05, + "loss": 0.2751, + "step": 29890 + }, + { + "epoch": 1.3290660977019157, + "grad_norm": 0.18357227742671967, + "learning_rate": 7.398075235817914e-05, + "loss": 0.2791, + "step": 29900 + }, + { + "epoch": 1.3295106014135218, + "grad_norm": 0.20200733840465546, + "learning_rate": 7.39626142467108e-05, + "loss": 0.2794, + "step": 29910 + }, + { + "epoch": 1.3299551051251277, + "grad_norm": 0.19994397461414337, + "learning_rate": 7.394447204068706e-05, + "loss": 0.2756, + "step": 29920 + }, + { + "epoch": 1.3303996088367338, + "grad_norm": 0.21429121494293213, + "learning_rate": 7.392632574320793e-05, + "loss": 0.2801, + "step": 29930 + }, + { + "epoch": 1.33084411254834, + "grad_norm": 0.1887032389640808, + "learning_rate": 7.390817535737411e-05, + "loss": 0.2779, + "step": 29940 + }, + { + "epoch": 1.3312886162599458, + "grad_norm": 0.1812911480665207, + "learning_rate": 7.389002088628703e-05, + "loss": 0.2778, + "step": 29950 + }, + { + "epoch": 1.3317331199715516, + "grad_norm": 0.2084679752588272, + "learning_rate": 7.387186233304877e-05, + "loss": 0.2804, + "step": 29960 + }, + { + "epoch": 1.3321776236831577, + "grad_norm": 0.20966993272304535, + "learning_rate": 7.385369970076212e-05, + "loss": 0.2771, + "step": 29970 + }, + { + "epoch": 1.3326221273947638, + "grad_norm": 0.21672362089157104, + "learning_rate": 7.38355329925306e-05, + "loss": 0.2764, + "step": 29980 + }, + { + "epoch": 1.3330666311063697, + "grad_norm": 0.154769629240036, + "learning_rate": 7.381736221145838e-05, + "loss": 0.2749, + "step": 29990 + }, + { + "epoch": 1.3335111348179758, + "grad_norm": 0.1701272577047348, + "learning_rate": 7.37991873606504e-05, + "loss": 0.2776, + "step": 30000 + }, + { + "epoch": 1.3339556385295817, + "grad_norm": 0.21054388582706451, + "learning_rate": 7.378100844321218e-05, + "loss": 0.2781, + "step": 30010 + }, + { + "epoch": 1.3344001422411877, + "grad_norm": 0.21122942864894867, + "learning_rate": 7.376282546225004e-05, + "loss": 0.2763, + "step": 30020 + }, + { + "epoch": 1.3348446459527938, + "grad_norm": 0.2138872593641281, + "learning_rate": 7.374463842087094e-05, + "loss": 0.2799, + "step": 30030 + }, + { + "epoch": 1.3352891496643997, + "grad_norm": 0.19339346885681152, + "learning_rate": 7.372644732218254e-05, + "loss": 0.2806, + "step": 30040 + }, + { + "epoch": 1.3357336533760056, + "grad_norm": 0.20542386174201965, + "learning_rate": 7.370825216929322e-05, + "loss": 0.2776, + "step": 30050 + }, + { + "epoch": 1.3361781570876117, + "grad_norm": 0.21184493601322174, + "learning_rate": 7.369005296531205e-05, + "loss": 0.2757, + "step": 30060 + }, + { + "epoch": 1.3366226607992178, + "grad_norm": 0.19971583783626556, + "learning_rate": 7.367184971334873e-05, + "loss": 0.2761, + "step": 30070 + }, + { + "epoch": 1.3370671645108236, + "grad_norm": 0.17790378630161285, + "learning_rate": 7.365364241651371e-05, + "loss": 0.2758, + "step": 30080 + }, + { + "epoch": 1.3375116682224297, + "grad_norm": 0.17367416620254517, + "learning_rate": 7.363543107791815e-05, + "loss": 0.2809, + "step": 30090 + }, + { + "epoch": 1.3379561719340356, + "grad_norm": 0.2173595428466797, + "learning_rate": 7.361721570067384e-05, + "loss": 0.2813, + "step": 30100 + }, + { + "epoch": 1.3384006756456417, + "grad_norm": 0.1819869875907898, + "learning_rate": 7.359899628789331e-05, + "loss": 0.2807, + "step": 30110 + }, + { + "epoch": 1.3388451793572476, + "grad_norm": 0.20596115291118622, + "learning_rate": 7.358077284268974e-05, + "loss": 0.2782, + "step": 30120 + }, + { + "epoch": 1.3392896830688537, + "grad_norm": 0.2137671411037445, + "learning_rate": 7.356254536817702e-05, + "loss": 0.2806, + "step": 30130 + }, + { + "epoch": 1.3397341867804595, + "grad_norm": 0.16867093741893768, + "learning_rate": 7.354431386746973e-05, + "loss": 0.28, + "step": 30140 + }, + { + "epoch": 1.3401786904920656, + "grad_norm": 0.1923588067293167, + "learning_rate": 7.352607834368316e-05, + "loss": 0.2773, + "step": 30150 + }, + { + "epoch": 1.3406231942036717, + "grad_norm": 0.2138230949640274, + "learning_rate": 7.350783879993324e-05, + "loss": 0.2783, + "step": 30160 + }, + { + "epoch": 1.3410676979152776, + "grad_norm": 0.2064148187637329, + "learning_rate": 7.348959523933658e-05, + "loss": 0.2776, + "step": 30170 + }, + { + "epoch": 1.3415122016268834, + "grad_norm": 0.268045574426651, + "learning_rate": 7.347134766501057e-05, + "loss": 0.2799, + "step": 30180 + }, + { + "epoch": 1.3419567053384895, + "grad_norm": 0.17003920674324036, + "learning_rate": 7.345309608007315e-05, + "loss": 0.2773, + "step": 30190 + }, + { + "epoch": 1.3424012090500956, + "grad_norm": 0.18039269745349884, + "learning_rate": 7.343484048764308e-05, + "loss": 0.2781, + "step": 30200 + }, + { + "epoch": 1.3428457127617015, + "grad_norm": 0.2085830122232437, + "learning_rate": 7.341658089083972e-05, + "loss": 0.2781, + "step": 30210 + }, + { + "epoch": 1.3432902164733076, + "grad_norm": 0.1988387107849121, + "learning_rate": 7.339831729278313e-05, + "loss": 0.2784, + "step": 30220 + }, + { + "epoch": 1.3437347201849135, + "grad_norm": 0.18563584983348846, + "learning_rate": 7.338004969659404e-05, + "loss": 0.2797, + "step": 30230 + }, + { + "epoch": 1.3441792238965196, + "grad_norm": 0.2246815264225006, + "learning_rate": 7.336177810539391e-05, + "loss": 0.278, + "step": 30240 + }, + { + "epoch": 1.3446237276081257, + "grad_norm": 0.20847226679325104, + "learning_rate": 7.334350252230485e-05, + "loss": 0.2754, + "step": 30250 + }, + { + "epoch": 1.3450682313197315, + "grad_norm": 0.20162121951580048, + "learning_rate": 7.332522295044965e-05, + "loss": 0.2775, + "step": 30260 + }, + { + "epoch": 1.3455127350313374, + "grad_norm": 0.17102688550949097, + "learning_rate": 7.33069393929518e-05, + "loss": 0.2767, + "step": 30270 + }, + { + "epoch": 1.3459572387429435, + "grad_norm": 0.17291919887065887, + "learning_rate": 7.328865185293545e-05, + "loss": 0.2764, + "step": 30280 + }, + { + "epoch": 1.3464017424545496, + "grad_norm": 0.17330095171928406, + "learning_rate": 7.327036033352546e-05, + "loss": 0.277, + "step": 30290 + }, + { + "epoch": 1.3468462461661554, + "grad_norm": 0.17227241396903992, + "learning_rate": 7.325206483784733e-05, + "loss": 0.277, + "step": 30300 + }, + { + "epoch": 1.3472907498777615, + "grad_norm": 0.1997208148241043, + "learning_rate": 7.323376536902724e-05, + "loss": 0.2784, + "step": 30310 + }, + { + "epoch": 1.3477352535893674, + "grad_norm": 0.18696768581867218, + "learning_rate": 7.321546193019213e-05, + "loss": 0.2755, + "step": 30320 + }, + { + "epoch": 1.3481797573009735, + "grad_norm": 0.20337893068790436, + "learning_rate": 7.31971545244695e-05, + "loss": 0.2789, + "step": 30330 + }, + { + "epoch": 1.3486242610125794, + "grad_norm": 0.17288313806056976, + "learning_rate": 7.31788431549876e-05, + "loss": 0.2779, + "step": 30340 + }, + { + "epoch": 1.3490687647241855, + "grad_norm": 0.15537843108177185, + "learning_rate": 7.316052782487534e-05, + "loss": 0.2772, + "step": 30350 + }, + { + "epoch": 1.3495132684357913, + "grad_norm": 0.20431172847747803, + "learning_rate": 7.314220853726234e-05, + "loss": 0.2792, + "step": 30360 + }, + { + "epoch": 1.3499577721473974, + "grad_norm": 0.1844140589237213, + "learning_rate": 7.312388529527884e-05, + "loss": 0.2786, + "step": 30370 + }, + { + "epoch": 1.3504022758590035, + "grad_norm": 0.17510661482810974, + "learning_rate": 7.310555810205577e-05, + "loss": 0.2797, + "step": 30380 + }, + { + "epoch": 1.3508467795706094, + "grad_norm": 0.15512438118457794, + "learning_rate": 7.308722696072476e-05, + "loss": 0.2773, + "step": 30390 + }, + { + "epoch": 1.3512912832822155, + "grad_norm": 0.17918629944324493, + "learning_rate": 7.306889187441811e-05, + "loss": 0.281, + "step": 30400 + }, + { + "epoch": 1.3517357869938214, + "grad_norm": 0.19159354269504547, + "learning_rate": 7.305055284626876e-05, + "loss": 0.2789, + "step": 30410 + }, + { + "epoch": 1.3521802907054274, + "grad_norm": 0.19638966023921967, + "learning_rate": 7.303220987941037e-05, + "loss": 0.28, + "step": 30420 + }, + { + "epoch": 1.3526247944170333, + "grad_norm": 0.16614319384098053, + "learning_rate": 7.301386297697726e-05, + "loss": 0.2778, + "step": 30430 + }, + { + "epoch": 1.3530692981286394, + "grad_norm": 0.22330160439014435, + "learning_rate": 7.299551214210438e-05, + "loss": 0.2777, + "step": 30440 + }, + { + "epoch": 1.3535138018402453, + "grad_norm": 0.1882018893957138, + "learning_rate": 7.297715737792738e-05, + "loss": 0.2765, + "step": 30450 + }, + { + "epoch": 1.3539583055518514, + "grad_norm": 0.19930320978164673, + "learning_rate": 7.295879868758265e-05, + "loss": 0.2758, + "step": 30460 + }, + { + "epoch": 1.3544028092634575, + "grad_norm": 0.17728132009506226, + "learning_rate": 7.294043607420713e-05, + "loss": 0.2748, + "step": 30470 + }, + { + "epoch": 1.3548473129750633, + "grad_norm": 0.19158309698104858, + "learning_rate": 7.292206954093852e-05, + "loss": 0.2784, + "step": 30480 + }, + { + "epoch": 1.3552918166866692, + "grad_norm": 0.21349294483661652, + "learning_rate": 7.290369909091515e-05, + "loss": 0.2759, + "step": 30490 + }, + { + "epoch": 1.3557363203982753, + "grad_norm": 0.18600107729434967, + "learning_rate": 7.2885324727276e-05, + "loss": 0.2771, + "step": 30500 + }, + { + "epoch": 1.3561808241098814, + "grad_norm": 0.18088579177856445, + "learning_rate": 7.286694645316076e-05, + "loss": 0.2795, + "step": 30510 + }, + { + "epoch": 1.3566253278214873, + "grad_norm": 0.18952640891075134, + "learning_rate": 7.284856427170982e-05, + "loss": 0.2775, + "step": 30520 + }, + { + "epoch": 1.3570698315330934, + "grad_norm": 0.1913730502128601, + "learning_rate": 7.283017818606414e-05, + "loss": 0.2811, + "step": 30530 + }, + { + "epoch": 1.3575143352446992, + "grad_norm": 0.22082118690013885, + "learning_rate": 7.28117881993654e-05, + "loss": 0.2773, + "step": 30540 + }, + { + "epoch": 1.3579588389563053, + "grad_norm": 0.18535998463630676, + "learning_rate": 7.279339431475598e-05, + "loss": 0.2771, + "step": 30550 + }, + { + "epoch": 1.3584033426679114, + "grad_norm": 0.2150890976190567, + "learning_rate": 7.277499653537887e-05, + "loss": 0.2791, + "step": 30560 + }, + { + "epoch": 1.3588478463795173, + "grad_norm": 0.18236036598682404, + "learning_rate": 7.275659486437776e-05, + "loss": 0.2771, + "step": 30570 + }, + { + "epoch": 1.3592923500911231, + "grad_norm": 0.21350963413715363, + "learning_rate": 7.273818930489695e-05, + "loss": 0.2754, + "step": 30580 + }, + { + "epoch": 1.3597368538027292, + "grad_norm": 0.18552008271217346, + "learning_rate": 7.271977986008151e-05, + "loss": 0.2799, + "step": 30590 + }, + { + "epoch": 1.3601813575143353, + "grad_norm": 0.17103387415409088, + "learning_rate": 7.270136653307705e-05, + "loss": 0.2795, + "step": 30600 + }, + { + "epoch": 1.3606258612259412, + "grad_norm": 0.20085586607456207, + "learning_rate": 7.268294932702994e-05, + "loss": 0.2757, + "step": 30610 + }, + { + "epoch": 1.3610703649375473, + "grad_norm": 0.16261066496372223, + "learning_rate": 7.266452824508719e-05, + "loss": 0.2739, + "step": 30620 + }, + { + "epoch": 1.3615148686491532, + "grad_norm": 0.14819900691509247, + "learning_rate": 7.264610329039643e-05, + "loss": 0.2734, + "step": 30630 + }, + { + "epoch": 1.3619593723607593, + "grad_norm": 0.15027761459350586, + "learning_rate": 7.262767446610599e-05, + "loss": 0.2809, + "step": 30640 + }, + { + "epoch": 1.3624038760723651, + "grad_norm": 0.16246414184570312, + "learning_rate": 7.260924177536485e-05, + "loss": 0.2767, + "step": 30650 + }, + { + "epoch": 1.3628483797839712, + "grad_norm": 0.16481654345989227, + "learning_rate": 7.259080522132265e-05, + "loss": 0.2771, + "step": 30660 + }, + { + "epoch": 1.363292883495577, + "grad_norm": 0.21603909134864807, + "learning_rate": 7.257236480712972e-05, + "loss": 0.2768, + "step": 30670 + }, + { + "epoch": 1.3637373872071832, + "grad_norm": 0.19363121688365936, + "learning_rate": 7.255392053593697e-05, + "loss": 0.2776, + "step": 30680 + }, + { + "epoch": 1.3641818909187893, + "grad_norm": 0.16315826773643494, + "learning_rate": 7.253547241089607e-05, + "loss": 0.2769, + "step": 30690 + }, + { + "epoch": 1.3646263946303951, + "grad_norm": 0.1590886116027832, + "learning_rate": 7.251702043515927e-05, + "loss": 0.2784, + "step": 30700 + }, + { + "epoch": 1.3650708983420012, + "grad_norm": 0.18879251182079315, + "learning_rate": 7.249856461187952e-05, + "loss": 0.2787, + "step": 30710 + }, + { + "epoch": 1.365515402053607, + "grad_norm": 0.2270498424768448, + "learning_rate": 7.248010494421042e-05, + "loss": 0.2774, + "step": 30720 + }, + { + "epoch": 1.3659599057652132, + "grad_norm": 0.2545563578605652, + "learning_rate": 7.246164143530622e-05, + "loss": 0.2782, + "step": 30730 + }, + { + "epoch": 1.366404409476819, + "grad_norm": 0.15863288938999176, + "learning_rate": 7.244317408832181e-05, + "loss": 0.2763, + "step": 30740 + }, + { + "epoch": 1.3668489131884252, + "grad_norm": 0.165414959192276, + "learning_rate": 7.242470290641279e-05, + "loss": 0.2768, + "step": 30750 + }, + { + "epoch": 1.367293416900031, + "grad_norm": 0.18731378018856049, + "learning_rate": 7.240622789273536e-05, + "loss": 0.276, + "step": 30760 + }, + { + "epoch": 1.3677379206116371, + "grad_norm": 0.18678642809391022, + "learning_rate": 7.238774905044638e-05, + "loss": 0.2812, + "step": 30770 + }, + { + "epoch": 1.3681824243232432, + "grad_norm": 0.1906880885362625, + "learning_rate": 7.236926638270341e-05, + "loss": 0.2781, + "step": 30780 + }, + { + "epoch": 1.368626928034849, + "grad_norm": 0.2194930762052536, + "learning_rate": 7.23507798926646e-05, + "loss": 0.2772, + "step": 30790 + }, + { + "epoch": 1.369071431746455, + "grad_norm": 0.17114673554897308, + "learning_rate": 7.23322895834888e-05, + "loss": 0.2785, + "step": 30800 + }, + { + "epoch": 1.369515935458061, + "grad_norm": 0.184239462018013, + "learning_rate": 7.231379545833552e-05, + "loss": 0.2792, + "step": 30810 + }, + { + "epoch": 1.3699604391696671, + "grad_norm": 0.1740555465221405, + "learning_rate": 7.229529752036487e-05, + "loss": 0.2772, + "step": 30820 + }, + { + "epoch": 1.370404942881273, + "grad_norm": 0.1774008572101593, + "learning_rate": 7.227679577273765e-05, + "loss": 0.2806, + "step": 30830 + }, + { + "epoch": 1.370849446592879, + "grad_norm": 0.1933773159980774, + "learning_rate": 7.225829021861529e-05, + "loss": 0.2805, + "step": 30840 + }, + { + "epoch": 1.371293950304485, + "grad_norm": 0.18081115186214447, + "learning_rate": 7.223978086115992e-05, + "loss": 0.278, + "step": 30850 + }, + { + "epoch": 1.371738454016091, + "grad_norm": 0.19320380687713623, + "learning_rate": 7.222126770353425e-05, + "loss": 0.274, + "step": 30860 + }, + { + "epoch": 1.3721829577276972, + "grad_norm": 0.22279489040374756, + "learning_rate": 7.22027507489017e-05, + "loss": 0.2804, + "step": 30870 + }, + { + "epoch": 1.372627461439303, + "grad_norm": 0.16695678234100342, + "learning_rate": 7.218423000042627e-05, + "loss": 0.2766, + "step": 30880 + }, + { + "epoch": 1.373071965150909, + "grad_norm": 0.15637612342834473, + "learning_rate": 7.216570546127268e-05, + "loss": 0.2783, + "step": 30890 + }, + { + "epoch": 1.373516468862515, + "grad_norm": 0.2053099423646927, + "learning_rate": 7.214717713460626e-05, + "loss": 0.277, + "step": 30900 + }, + { + "epoch": 1.373960972574121, + "grad_norm": 0.19881369173526764, + "learning_rate": 7.2128645023593e-05, + "loss": 0.2752, + "step": 30910 + }, + { + "epoch": 1.374405476285727, + "grad_norm": 0.1767936646938324, + "learning_rate": 7.211010913139951e-05, + "loss": 0.2782, + "step": 30920 + }, + { + "epoch": 1.374849979997333, + "grad_norm": 0.1616320013999939, + "learning_rate": 7.209156946119308e-05, + "loss": 0.2778, + "step": 30930 + }, + { + "epoch": 1.375294483708939, + "grad_norm": 0.13614903390407562, + "learning_rate": 7.207302601614166e-05, + "loss": 0.2777, + "step": 30940 + }, + { + "epoch": 1.375738987420545, + "grad_norm": 0.16758732497692108, + "learning_rate": 7.205447879941378e-05, + "loss": 0.2787, + "step": 30950 + }, + { + "epoch": 1.3761834911321509, + "grad_norm": 0.18119801580905914, + "learning_rate": 7.203592781417866e-05, + "loss": 0.2781, + "step": 30960 + }, + { + "epoch": 1.376627994843757, + "grad_norm": 0.2002464085817337, + "learning_rate": 7.201737306360617e-05, + "loss": 0.2774, + "step": 30970 + }, + { + "epoch": 1.3770724985553628, + "grad_norm": 0.18762600421905518, + "learning_rate": 7.19988145508668e-05, + "loss": 0.2798, + "step": 30980 + }, + { + "epoch": 1.377517002266969, + "grad_norm": 0.2060868889093399, + "learning_rate": 7.198025227913168e-05, + "loss": 0.2788, + "step": 30990 + }, + { + "epoch": 1.377961505978575, + "grad_norm": 0.19325947761535645, + "learning_rate": 7.196168625157261e-05, + "loss": 0.2792, + "step": 31000 + }, + { + "epoch": 1.378406009690181, + "grad_norm": 0.15645916759967804, + "learning_rate": 7.194311647136201e-05, + "loss": 0.2769, + "step": 31010 + }, + { + "epoch": 1.3788505134017868, + "grad_norm": 0.19063787162303925, + "learning_rate": 7.192454294167297e-05, + "loss": 0.2758, + "step": 31020 + }, + { + "epoch": 1.3792950171133929, + "grad_norm": 0.2212025225162506, + "learning_rate": 7.190596566567917e-05, + "loss": 0.2804, + "step": 31030 + }, + { + "epoch": 1.379739520824999, + "grad_norm": 0.21328631043434143, + "learning_rate": 7.188738464655496e-05, + "loss": 0.28, + "step": 31040 + }, + { + "epoch": 1.3801840245366048, + "grad_norm": 0.19700518250465393, + "learning_rate": 7.186879988747533e-05, + "loss": 0.2765, + "step": 31050 + }, + { + "epoch": 1.380628528248211, + "grad_norm": 0.22270286083221436, + "learning_rate": 7.185021139161592e-05, + "loss": 0.279, + "step": 31060 + }, + { + "epoch": 1.3810730319598168, + "grad_norm": 0.24414047598838806, + "learning_rate": 7.1831619162153e-05, + "loss": 0.2771, + "step": 31070 + }, + { + "epoch": 1.3815175356714229, + "grad_norm": 0.2407885640859604, + "learning_rate": 7.181302320226345e-05, + "loss": 0.2742, + "step": 31080 + }, + { + "epoch": 1.381962039383029, + "grad_norm": 0.20462368428707123, + "learning_rate": 7.179442351512482e-05, + "loss": 0.2773, + "step": 31090 + }, + { + "epoch": 1.3824065430946348, + "grad_norm": 0.18741117417812347, + "learning_rate": 7.177582010391528e-05, + "loss": 0.2767, + "step": 31100 + }, + { + "epoch": 1.3828510468062407, + "grad_norm": 0.19336718320846558, + "learning_rate": 7.175721297181366e-05, + "loss": 0.279, + "step": 31110 + }, + { + "epoch": 1.3832955505178468, + "grad_norm": 0.1899518370628357, + "learning_rate": 7.173860212199942e-05, + "loss": 0.276, + "step": 31120 + }, + { + "epoch": 1.383740054229453, + "grad_norm": 0.1909460723400116, + "learning_rate": 7.171998755765263e-05, + "loss": 0.2773, + "step": 31130 + }, + { + "epoch": 1.3841845579410588, + "grad_norm": 0.21562302112579346, + "learning_rate": 7.170136928195398e-05, + "loss": 0.2765, + "step": 31140 + }, + { + "epoch": 1.3846290616526649, + "grad_norm": 0.1581682413816452, + "learning_rate": 7.168274729808489e-05, + "loss": 0.2777, + "step": 31150 + }, + { + "epoch": 1.3850735653642707, + "grad_norm": 0.17922236025333405, + "learning_rate": 7.166412160922728e-05, + "loss": 0.28, + "step": 31160 + }, + { + "epoch": 1.3855180690758768, + "grad_norm": 0.20933718979358673, + "learning_rate": 7.164549221856382e-05, + "loss": 0.2748, + "step": 31170 + }, + { + "epoch": 1.3859625727874827, + "grad_norm": 0.2081807404756546, + "learning_rate": 7.162685912927775e-05, + "loss": 0.2771, + "step": 31180 + }, + { + "epoch": 1.3864070764990888, + "grad_norm": 0.1934681087732315, + "learning_rate": 7.160822234455294e-05, + "loss": 0.278, + "step": 31190 + }, + { + "epoch": 1.3868515802106947, + "grad_norm": 0.17910023033618927, + "learning_rate": 7.158958186757391e-05, + "loss": 0.2795, + "step": 31200 + }, + { + "epoch": 1.3872960839223007, + "grad_norm": 0.1756516546010971, + "learning_rate": 7.157093770152582e-05, + "loss": 0.2745, + "step": 31210 + }, + { + "epoch": 1.3877405876339068, + "grad_norm": 0.19743558764457703, + "learning_rate": 7.155228984959446e-05, + "loss": 0.2775, + "step": 31220 + }, + { + "epoch": 1.3881850913455127, + "grad_norm": 0.1760208010673523, + "learning_rate": 7.153363831496621e-05, + "loss": 0.2779, + "step": 31230 + }, + { + "epoch": 1.3886295950571188, + "grad_norm": 0.1856737732887268, + "learning_rate": 7.151498310082811e-05, + "loss": 0.2789, + "step": 31240 + }, + { + "epoch": 1.3890740987687247, + "grad_norm": 0.17165210843086243, + "learning_rate": 7.149632421036784e-05, + "loss": 0.2785, + "step": 31250 + }, + { + "epoch": 1.3895186024803308, + "grad_norm": 0.17590220272541046, + "learning_rate": 7.147766164677369e-05, + "loss": 0.2755, + "step": 31260 + }, + { + "epoch": 1.3899631061919366, + "grad_norm": 0.23232941329479218, + "learning_rate": 7.145899541323459e-05, + "loss": 0.2758, + "step": 31270 + }, + { + "epoch": 1.3904076099035427, + "grad_norm": 0.2371327131986618, + "learning_rate": 7.144032551294007e-05, + "loss": 0.2797, + "step": 31280 + }, + { + "epoch": 1.3908521136151486, + "grad_norm": 0.19572877883911133, + "learning_rate": 7.14216519490803e-05, + "loss": 0.2753, + "step": 31290 + }, + { + "epoch": 1.3912966173267547, + "grad_norm": 0.1880975216627121, + "learning_rate": 7.140297472484609e-05, + "loss": 0.2758, + "step": 31300 + }, + { + "epoch": 1.3917411210383608, + "grad_norm": 0.18463556468486786, + "learning_rate": 7.138429384342891e-05, + "loss": 0.2757, + "step": 31310 + }, + { + "epoch": 1.3921856247499667, + "grad_norm": 0.19913314282894135, + "learning_rate": 7.136560930802074e-05, + "loss": 0.2784, + "step": 31320 + }, + { + "epoch": 1.3926301284615725, + "grad_norm": 0.17930886149406433, + "learning_rate": 7.134692112181431e-05, + "loss": 0.276, + "step": 31330 + }, + { + "epoch": 1.3930746321731786, + "grad_norm": 0.18408054113388062, + "learning_rate": 7.13282292880029e-05, + "loss": 0.2744, + "step": 31340 + }, + { + "epoch": 1.3935191358847847, + "grad_norm": 0.1922573447227478, + "learning_rate": 7.130953380978043e-05, + "loss": 0.2752, + "step": 31350 + }, + { + "epoch": 1.3939636395963906, + "grad_norm": 0.19373643398284912, + "learning_rate": 7.129083469034144e-05, + "loss": 0.2758, + "step": 31360 + }, + { + "epoch": 1.3944081433079967, + "grad_norm": 0.19177615642547607, + "learning_rate": 7.127213193288112e-05, + "loss": 0.2771, + "step": 31370 + }, + { + "epoch": 1.3948526470196025, + "grad_norm": 0.18987074494361877, + "learning_rate": 7.125342554059522e-05, + "loss": 0.2746, + "step": 31380 + }, + { + "epoch": 1.3952971507312086, + "grad_norm": 0.19551429152488708, + "learning_rate": 7.12347155166802e-05, + "loss": 0.2794, + "step": 31390 + }, + { + "epoch": 1.3957416544428147, + "grad_norm": 0.17361366748809814, + "learning_rate": 7.121600186433306e-05, + "loss": 0.282, + "step": 31400 + }, + { + "epoch": 1.3961861581544206, + "grad_norm": 0.20477229356765747, + "learning_rate": 7.119728458675148e-05, + "loss": 0.2767, + "step": 31410 + }, + { + "epoch": 1.3966306618660265, + "grad_norm": 0.2284156233072281, + "learning_rate": 7.117856368713369e-05, + "loss": 0.2799, + "step": 31420 + }, + { + "epoch": 1.3970751655776326, + "grad_norm": 0.17218011617660522, + "learning_rate": 7.115983916867861e-05, + "loss": 0.2784, + "step": 31430 + }, + { + "epoch": 1.3975196692892387, + "grad_norm": 0.1740858405828476, + "learning_rate": 7.114111103458574e-05, + "loss": 0.2763, + "step": 31440 + }, + { + "epoch": 1.3979641730008445, + "grad_norm": 0.20536872744560242, + "learning_rate": 7.11223792880552e-05, + "loss": 0.2775, + "step": 31450 + }, + { + "epoch": 1.3984086767124506, + "grad_norm": 0.1797041893005371, + "learning_rate": 7.110364393228773e-05, + "loss": 0.2787, + "step": 31460 + }, + { + "epoch": 1.3988531804240565, + "grad_norm": 0.17965157330036163, + "learning_rate": 7.108490497048471e-05, + "loss": 0.2751, + "step": 31470 + }, + { + "epoch": 1.3992976841356626, + "grad_norm": 0.16194437444210052, + "learning_rate": 7.10661624058481e-05, + "loss": 0.2791, + "step": 31480 + }, + { + "epoch": 1.3997421878472684, + "grad_norm": 0.1905069649219513, + "learning_rate": 7.10474162415805e-05, + "loss": 0.2768, + "step": 31490 + }, + { + "epoch": 1.4001866915588745, + "grad_norm": 0.1828354001045227, + "learning_rate": 7.102866648088511e-05, + "loss": 0.2768, + "step": 31500 + }, + { + "epoch": 1.4006311952704804, + "grad_norm": 0.1982453167438507, + "learning_rate": 7.100991312696576e-05, + "loss": 0.2792, + "step": 31510 + }, + { + "epoch": 1.4010756989820865, + "grad_norm": 0.1905430406332016, + "learning_rate": 7.099115618302686e-05, + "loss": 0.2747, + "step": 31520 + }, + { + "epoch": 1.4015202026936926, + "grad_norm": 0.16568714380264282, + "learning_rate": 7.097239565227349e-05, + "loss": 0.2778, + "step": 31530 + }, + { + "epoch": 1.4019647064052985, + "grad_norm": 0.2369290292263031, + "learning_rate": 7.09536315379113e-05, + "loss": 0.2762, + "step": 31540 + }, + { + "epoch": 1.4024092101169046, + "grad_norm": 0.2397650182247162, + "learning_rate": 7.093486384314656e-05, + "loss": 0.2723, + "step": 31550 + }, + { + "epoch": 1.4028537138285104, + "grad_norm": 0.21187342703342438, + "learning_rate": 7.091609257118616e-05, + "loss": 0.2767, + "step": 31560 + }, + { + "epoch": 1.4032982175401165, + "grad_norm": 0.18366441130638123, + "learning_rate": 7.08973177252376e-05, + "loss": 0.2773, + "step": 31570 + }, + { + "epoch": 1.4037427212517224, + "grad_norm": 0.2114124596118927, + "learning_rate": 7.087853930850898e-05, + "loss": 0.2768, + "step": 31580 + }, + { + "epoch": 1.4041872249633285, + "grad_norm": 0.19627156853675842, + "learning_rate": 7.085975732420903e-05, + "loss": 0.2789, + "step": 31590 + }, + { + "epoch": 1.4046317286749344, + "grad_norm": 0.21603555977344513, + "learning_rate": 7.084097177554706e-05, + "loss": 0.2762, + "step": 31600 + }, + { + "epoch": 1.4050762323865404, + "grad_norm": 0.2160969078540802, + "learning_rate": 7.082218266573301e-05, + "loss": 0.2769, + "step": 31610 + }, + { + "epoch": 1.4055207360981465, + "grad_norm": 0.2388114482164383, + "learning_rate": 7.080338999797743e-05, + "loss": 0.2765, + "step": 31620 + }, + { + "epoch": 1.4059652398097524, + "grad_norm": 0.19811224937438965, + "learning_rate": 7.07845937754915e-05, + "loss": 0.2756, + "step": 31630 + }, + { + "epoch": 1.4064097435213583, + "grad_norm": 0.18465055525302887, + "learning_rate": 7.076579400148693e-05, + "loss": 0.2781, + "step": 31640 + }, + { + "epoch": 1.4068542472329644, + "grad_norm": 0.17145636677742004, + "learning_rate": 7.074699067917611e-05, + "loss": 0.2768, + "step": 31650 + }, + { + "epoch": 1.4072987509445705, + "grad_norm": 0.1549624651670456, + "learning_rate": 7.072818381177201e-05, + "loss": 0.2754, + "step": 31660 + }, + { + "epoch": 1.4077432546561763, + "grad_norm": 0.190046489238739, + "learning_rate": 7.070937340248823e-05, + "loss": 0.276, + "step": 31670 + }, + { + "epoch": 1.4081877583677824, + "grad_norm": 0.20564307272434235, + "learning_rate": 7.069055945453893e-05, + "loss": 0.2738, + "step": 31680 + }, + { + "epoch": 1.4086322620793883, + "grad_norm": 0.215445414185524, + "learning_rate": 7.067174197113892e-05, + "loss": 0.2793, + "step": 31690 + }, + { + "epoch": 1.4090767657909944, + "grad_norm": 0.18963785469532013, + "learning_rate": 7.065292095550355e-05, + "loss": 0.2768, + "step": 31700 + }, + { + "epoch": 1.4095212695026005, + "grad_norm": 0.17812514305114746, + "learning_rate": 7.063409641084887e-05, + "loss": 0.2774, + "step": 31710 + }, + { + "epoch": 1.4099657732142064, + "grad_norm": 0.2537398040294647, + "learning_rate": 7.061526834039145e-05, + "loss": 0.2797, + "step": 31720 + }, + { + "epoch": 1.4104102769258122, + "grad_norm": 0.216045081615448, + "learning_rate": 7.05964367473485e-05, + "loss": 0.2775, + "step": 31730 + }, + { + "epoch": 1.4108547806374183, + "grad_norm": 0.21879984438419342, + "learning_rate": 7.057760163493783e-05, + "loss": 0.2769, + "step": 31740 + }, + { + "epoch": 1.4112992843490244, + "grad_norm": 0.23200976848602295, + "learning_rate": 7.055876300637783e-05, + "loss": 0.2779, + "step": 31750 + }, + { + "epoch": 1.4117437880606303, + "grad_norm": 0.22580456733703613, + "learning_rate": 7.053992086488753e-05, + "loss": 0.2766, + "step": 31760 + }, + { + "epoch": 1.4121882917722364, + "grad_norm": 0.18913060426712036, + "learning_rate": 7.052107521368651e-05, + "loss": 0.2722, + "step": 31770 + }, + { + "epoch": 1.4126327954838422, + "grad_norm": 0.17309309542179108, + "learning_rate": 7.0502226055995e-05, + "loss": 0.2788, + "step": 31780 + }, + { + "epoch": 1.4130772991954483, + "grad_norm": 0.18016637861728668, + "learning_rate": 7.048337339503379e-05, + "loss": 0.2748, + "step": 31790 + }, + { + "epoch": 1.4135218029070542, + "grad_norm": 0.1726388931274414, + "learning_rate": 7.046451723402427e-05, + "loss": 0.2753, + "step": 31800 + }, + { + "epoch": 1.4139663066186603, + "grad_norm": 0.1914331167936325, + "learning_rate": 7.044565757618848e-05, + "loss": 0.2754, + "step": 31810 + }, + { + "epoch": 1.4144108103302662, + "grad_norm": 0.20308569073677063, + "learning_rate": 7.042679442474899e-05, + "loss": 0.2778, + "step": 31820 + }, + { + "epoch": 1.4148553140418723, + "grad_norm": 0.2237032800912857, + "learning_rate": 7.040792778292902e-05, + "loss": 0.2747, + "step": 31830 + }, + { + "epoch": 1.4152998177534784, + "grad_norm": 0.19265620410442352, + "learning_rate": 7.038905765395234e-05, + "loss": 0.277, + "step": 31840 + }, + { + "epoch": 1.4157443214650842, + "grad_norm": 0.16256925463676453, + "learning_rate": 7.037018404104334e-05, + "loss": 0.2742, + "step": 31850 + }, + { + "epoch": 1.41618882517669, + "grad_norm": 0.19416922330856323, + "learning_rate": 7.035130694742702e-05, + "loss": 0.2774, + "step": 31860 + }, + { + "epoch": 1.4166333288882962, + "grad_norm": 0.17918649315834045, + "learning_rate": 7.033242637632897e-05, + "loss": 0.2779, + "step": 31870 + }, + { + "epoch": 1.4170778325999023, + "grad_norm": 0.18264822661876678, + "learning_rate": 7.031354233097534e-05, + "loss": 0.2751, + "step": 31880 + }, + { + "epoch": 1.4175223363115081, + "grad_norm": 0.17992721498012543, + "learning_rate": 7.029465481459289e-05, + "loss": 0.2746, + "step": 31890 + }, + { + "epoch": 1.4179668400231142, + "grad_norm": 0.22647646069526672, + "learning_rate": 7.027576383040898e-05, + "loss": 0.2777, + "step": 31900 + }, + { + "epoch": 1.41841134373472, + "grad_norm": 0.19583138823509216, + "learning_rate": 7.025686938165159e-05, + "loss": 0.2741, + "step": 31910 + }, + { + "epoch": 1.4188558474463262, + "grad_norm": 0.21241623163223267, + "learning_rate": 7.023797147154924e-05, + "loss": 0.278, + "step": 31920 + }, + { + "epoch": 1.4193003511579323, + "grad_norm": 0.20078490674495697, + "learning_rate": 7.021907010333111e-05, + "loss": 0.2777, + "step": 31930 + }, + { + "epoch": 1.4197448548695382, + "grad_norm": 0.1477861851453781, + "learning_rate": 7.020016528022685e-05, + "loss": 0.2766, + "step": 31940 + }, + { + "epoch": 1.420189358581144, + "grad_norm": 0.20293660461902618, + "learning_rate": 7.018125700546683e-05, + "loss": 0.2767, + "step": 31950 + }, + { + "epoch": 1.4206338622927501, + "grad_norm": 0.18454167246818542, + "learning_rate": 7.016234528228196e-05, + "loss": 0.2771, + "step": 31960 + }, + { + "epoch": 1.4210783660043562, + "grad_norm": 0.17221517860889435, + "learning_rate": 7.014343011390372e-05, + "loss": 0.2738, + "step": 31970 + }, + { + "epoch": 1.421522869715962, + "grad_norm": 0.16617368161678314, + "learning_rate": 7.01245115035642e-05, + "loss": 0.2779, + "step": 31980 + }, + { + "epoch": 1.4219673734275682, + "grad_norm": 0.18178188800811768, + "learning_rate": 7.010558945449606e-05, + "loss": 0.2768, + "step": 31990 + }, + { + "epoch": 1.422411877139174, + "grad_norm": 0.18239064514636993, + "learning_rate": 7.008666396993258e-05, + "loss": 0.2792, + "step": 32000 + }, + { + "epoch": 1.4228563808507801, + "grad_norm": 0.2307618409395218, + "learning_rate": 7.006773505310759e-05, + "loss": 0.2754, + "step": 32010 + }, + { + "epoch": 1.423300884562386, + "grad_norm": 0.2191735804080963, + "learning_rate": 7.004880270725553e-05, + "loss": 0.274, + "step": 32020 + }, + { + "epoch": 1.423745388273992, + "grad_norm": 0.22717252373695374, + "learning_rate": 7.002986693561144e-05, + "loss": 0.2792, + "step": 32030 + }, + { + "epoch": 1.424189891985598, + "grad_norm": 0.2124316543340683, + "learning_rate": 7.001092774141089e-05, + "loss": 0.2729, + "step": 32040 + }, + { + "epoch": 1.424634395697204, + "grad_norm": 0.17624373733997345, + "learning_rate": 6.999198512789009e-05, + "loss": 0.2758, + "step": 32050 + }, + { + "epoch": 1.4250788994088102, + "grad_norm": 0.18157228827476501, + "learning_rate": 6.997303909828584e-05, + "loss": 0.276, + "step": 32060 + }, + { + "epoch": 1.425523403120416, + "grad_norm": 0.1731891930103302, + "learning_rate": 6.995408965583544e-05, + "loss": 0.2775, + "step": 32070 + }, + { + "epoch": 1.4259679068320221, + "grad_norm": 0.2054079920053482, + "learning_rate": 6.993513680377688e-05, + "loss": 0.275, + "step": 32080 + }, + { + "epoch": 1.426412410543628, + "grad_norm": 0.2095879167318344, + "learning_rate": 6.991618054534868e-05, + "loss": 0.2765, + "step": 32090 + }, + { + "epoch": 1.426856914255234, + "grad_norm": 0.21302355825901031, + "learning_rate": 6.989722088378991e-05, + "loss": 0.2763, + "step": 32100 + }, + { + "epoch": 1.42730141796684, + "grad_norm": 0.15515850484371185, + "learning_rate": 6.987825782234027e-05, + "loss": 0.2764, + "step": 32110 + }, + { + "epoch": 1.427745921678446, + "grad_norm": 0.18965646624565125, + "learning_rate": 6.985929136424006e-05, + "loss": 0.2777, + "step": 32120 + }, + { + "epoch": 1.428190425390052, + "grad_norm": 0.1925138384103775, + "learning_rate": 6.984032151273012e-05, + "loss": 0.2757, + "step": 32130 + }, + { + "epoch": 1.428634929101658, + "grad_norm": 0.1790185421705246, + "learning_rate": 6.982134827105186e-05, + "loss": 0.2764, + "step": 32140 + }, + { + "epoch": 1.429079432813264, + "grad_norm": 0.1824866086244583, + "learning_rate": 6.980237164244729e-05, + "loss": 0.2741, + "step": 32150 + }, + { + "epoch": 1.42952393652487, + "grad_norm": 0.17225518822669983, + "learning_rate": 6.9783391630159e-05, + "loss": 0.2766, + "step": 32160 + }, + { + "epoch": 1.4299684402364758, + "grad_norm": 0.20109736919403076, + "learning_rate": 6.976440823743015e-05, + "loss": 0.2752, + "step": 32170 + }, + { + "epoch": 1.430412943948082, + "grad_norm": 0.18755799531936646, + "learning_rate": 6.974542146750451e-05, + "loss": 0.2775, + "step": 32180 + }, + { + "epoch": 1.430857447659688, + "grad_norm": 0.19743028283119202, + "learning_rate": 6.972643132362637e-05, + "loss": 0.2743, + "step": 32190 + }, + { + "epoch": 1.431301951371294, + "grad_norm": 0.18236029148101807, + "learning_rate": 6.970743780904064e-05, + "loss": 0.2738, + "step": 32200 + }, + { + "epoch": 1.4317464550829, + "grad_norm": 0.18415233492851257, + "learning_rate": 6.968844092699277e-05, + "loss": 0.2776, + "step": 32210 + }, + { + "epoch": 1.4321909587945059, + "grad_norm": 0.1724114716053009, + "learning_rate": 6.966944068072883e-05, + "loss": 0.2775, + "step": 32220 + }, + { + "epoch": 1.432635462506112, + "grad_norm": 0.18066447973251343, + "learning_rate": 6.965043707349545e-05, + "loss": 0.2752, + "step": 32230 + }, + { + "epoch": 1.433079966217718, + "grad_norm": 0.19894492626190186, + "learning_rate": 6.963143010853982e-05, + "loss": 0.2776, + "step": 32240 + }, + { + "epoch": 1.433524469929324, + "grad_norm": 0.19954445958137512, + "learning_rate": 6.961241978910971e-05, + "loss": 0.2794, + "step": 32250 + }, + { + "epoch": 1.4339689736409298, + "grad_norm": 0.20689640939235687, + "learning_rate": 6.959340611845344e-05, + "loss": 0.2744, + "step": 32260 + }, + { + "epoch": 1.4344134773525359, + "grad_norm": 0.20740346610546112, + "learning_rate": 6.957438909981995e-05, + "loss": 0.2774, + "step": 32270 + }, + { + "epoch": 1.434857981064142, + "grad_norm": 0.1781422644853592, + "learning_rate": 6.955536873645872e-05, + "loss": 0.2742, + "step": 32280 + }, + { + "epoch": 1.4353024847757478, + "grad_norm": 0.17532363533973694, + "learning_rate": 6.953634503161982e-05, + "loss": 0.2765, + "step": 32290 + }, + { + "epoch": 1.435746988487354, + "grad_norm": 0.18830356001853943, + "learning_rate": 6.951731798855387e-05, + "loss": 0.2739, + "step": 32300 + }, + { + "epoch": 1.4361914921989598, + "grad_norm": 0.18929524719715118, + "learning_rate": 6.949828761051208e-05, + "loss": 0.2755, + "step": 32310 + }, + { + "epoch": 1.436635995910566, + "grad_norm": 0.22791996598243713, + "learning_rate": 6.947925390074622e-05, + "loss": 0.2756, + "step": 32320 + }, + { + "epoch": 1.4370804996221718, + "grad_norm": 0.23061682283878326, + "learning_rate": 6.946021686250863e-05, + "loss": 0.2769, + "step": 32330 + }, + { + "epoch": 1.4375250033337779, + "grad_norm": 0.2041943371295929, + "learning_rate": 6.94411764990522e-05, + "loss": 0.2758, + "step": 32340 + }, + { + "epoch": 1.4379695070453837, + "grad_norm": 0.18774305284023285, + "learning_rate": 6.942213281363044e-05, + "loss": 0.2741, + "step": 32350 + }, + { + "epoch": 1.4384140107569898, + "grad_norm": 0.1956353634595871, + "learning_rate": 6.940308580949737e-05, + "loss": 0.2762, + "step": 32360 + }, + { + "epoch": 1.438858514468596, + "grad_norm": 0.17418086528778076, + "learning_rate": 6.93840354899076e-05, + "loss": 0.2772, + "step": 32370 + }, + { + "epoch": 1.4393030181802018, + "grad_norm": 0.19702070951461792, + "learning_rate": 6.936498185811633e-05, + "loss": 0.276, + "step": 32380 + }, + { + "epoch": 1.4397475218918077, + "grad_norm": 0.2280888557434082, + "learning_rate": 6.93459249173793e-05, + "loss": 0.2774, + "step": 32390 + }, + { + "epoch": 1.4401920256034138, + "grad_norm": 0.18917737901210785, + "learning_rate": 6.932686467095279e-05, + "loss": 0.2761, + "step": 32400 + }, + { + "epoch": 1.4406365293150198, + "grad_norm": 0.19558411836624146, + "learning_rate": 6.930780112209373e-05, + "loss": 0.2759, + "step": 32410 + }, + { + "epoch": 1.4410810330266257, + "grad_norm": 0.20500768721103668, + "learning_rate": 6.92887342740595e-05, + "loss": 0.2738, + "step": 32420 + }, + { + "epoch": 1.4415255367382318, + "grad_norm": 0.16882914304733276, + "learning_rate": 6.926966413010816e-05, + "loss": 0.277, + "step": 32430 + }, + { + "epoch": 1.4419700404498377, + "grad_norm": 0.19496884942054749, + "learning_rate": 6.925059069349824e-05, + "loss": 0.276, + "step": 32440 + }, + { + "epoch": 1.4424145441614438, + "grad_norm": 0.18825049698352814, + "learning_rate": 6.923151396748886e-05, + "loss": 0.2741, + "step": 32450 + }, + { + "epoch": 1.4428590478730499, + "grad_norm": 0.19504688680171967, + "learning_rate": 6.921243395533974e-05, + "loss": 0.2744, + "step": 32460 + }, + { + "epoch": 1.4433035515846557, + "grad_norm": 0.1880597621202469, + "learning_rate": 6.919335066031109e-05, + "loss": 0.2786, + "step": 32470 + }, + { + "epoch": 1.4437480552962616, + "grad_norm": 0.19831585884094238, + "learning_rate": 6.917426408566379e-05, + "loss": 0.2749, + "step": 32480 + }, + { + "epoch": 1.4441925590078677, + "grad_norm": 0.21685002744197845, + "learning_rate": 6.915517423465916e-05, + "loss": 0.2785, + "step": 32490 + }, + { + "epoch": 1.4446370627194738, + "grad_norm": 0.19186389446258545, + "learning_rate": 6.913608111055914e-05, + "loss": 0.2757, + "step": 32500 + }, + { + "epoch": 1.4450815664310797, + "grad_norm": 0.2407567799091339, + "learning_rate": 6.911698471662623e-05, + "loss": 0.2776, + "step": 32510 + }, + { + "epoch": 1.4455260701426857, + "grad_norm": 0.19730070233345032, + "learning_rate": 6.90978850561235e-05, + "loss": 0.2748, + "step": 32520 + }, + { + "epoch": 1.4459705738542916, + "grad_norm": 0.20907680690288544, + "learning_rate": 6.907878213231454e-05, + "loss": 0.2768, + "step": 32530 + }, + { + "epoch": 1.4464150775658977, + "grad_norm": 0.20133768022060394, + "learning_rate": 6.90596759484635e-05, + "loss": 0.2779, + "step": 32540 + }, + { + "epoch": 1.4468595812775038, + "grad_norm": 0.19199536740779877, + "learning_rate": 6.904056650783514e-05, + "loss": 0.2772, + "step": 32550 + }, + { + "epoch": 1.4473040849891097, + "grad_norm": 0.20374278724193573, + "learning_rate": 6.902145381369471e-05, + "loss": 0.2761, + "step": 32560 + }, + { + "epoch": 1.4477485887007155, + "grad_norm": 0.19548113644123077, + "learning_rate": 6.900233786930808e-05, + "loss": 0.2759, + "step": 32570 + }, + { + "epoch": 1.4481930924123216, + "grad_norm": 0.23096248507499695, + "learning_rate": 6.898321867794161e-05, + "loss": 0.2802, + "step": 32580 + }, + { + "epoch": 1.4486375961239277, + "grad_norm": 0.1884605586528778, + "learning_rate": 6.896409624286226e-05, + "loss": 0.2759, + "step": 32590 + }, + { + "epoch": 1.4490820998355336, + "grad_norm": 0.20684608817100525, + "learning_rate": 6.894497056733754e-05, + "loss": 0.2777, + "step": 32600 + }, + { + "epoch": 1.4495266035471397, + "grad_norm": 0.18089069426059723, + "learning_rate": 6.89258416546355e-05, + "loss": 0.2733, + "step": 32610 + }, + { + "epoch": 1.4499711072587456, + "grad_norm": 0.23017166554927826, + "learning_rate": 6.890670950802474e-05, + "loss": 0.2737, + "step": 32620 + }, + { + "epoch": 1.4504156109703517, + "grad_norm": 0.18112140893936157, + "learning_rate": 6.88875741307744e-05, + "loss": 0.274, + "step": 32630 + }, + { + "epoch": 1.4508601146819575, + "grad_norm": 0.22001343965530396, + "learning_rate": 6.886843552615425e-05, + "loss": 0.2799, + "step": 32640 + }, + { + "epoch": 1.4513046183935636, + "grad_norm": 0.17436231672763824, + "learning_rate": 6.884929369743451e-05, + "loss": 0.2778, + "step": 32650 + }, + { + "epoch": 1.4517491221051695, + "grad_norm": 0.18597887456417084, + "learning_rate": 6.8830148647886e-05, + "loss": 0.2731, + "step": 32660 + }, + { + "epoch": 1.4521936258167756, + "grad_norm": 0.21939505636692047, + "learning_rate": 6.88110003807801e-05, + "loss": 0.2765, + "step": 32670 + }, + { + "epoch": 1.4526381295283817, + "grad_norm": 0.18304653465747833, + "learning_rate": 6.87918488993887e-05, + "loss": 0.2753, + "step": 32680 + }, + { + "epoch": 1.4530826332399875, + "grad_norm": 0.17621846497058868, + "learning_rate": 6.877269420698431e-05, + "loss": 0.2746, + "step": 32690 + }, + { + "epoch": 1.4535271369515934, + "grad_norm": 0.18302032351493835, + "learning_rate": 6.875353630683989e-05, + "loss": 0.274, + "step": 32700 + }, + { + "epoch": 1.4539716406631995, + "grad_norm": 0.1719108521938324, + "learning_rate": 6.873437520222905e-05, + "loss": 0.277, + "step": 32710 + }, + { + "epoch": 1.4544161443748056, + "grad_norm": 0.18783828616142273, + "learning_rate": 6.871521089642585e-05, + "loss": 0.2785, + "step": 32720 + }, + { + "epoch": 1.4548606480864115, + "grad_norm": 0.1692952960729599, + "learning_rate": 6.869604339270498e-05, + "loss": 0.2777, + "step": 32730 + }, + { + "epoch": 1.4553051517980176, + "grad_norm": 0.18230684101581573, + "learning_rate": 6.867687269434164e-05, + "loss": 0.2781, + "step": 32740 + }, + { + "epoch": 1.4557496555096234, + "grad_norm": 0.1813063621520996, + "learning_rate": 6.865769880461156e-05, + "loss": 0.2761, + "step": 32750 + }, + { + "epoch": 1.4561941592212295, + "grad_norm": 0.2082670032978058, + "learning_rate": 6.863852172679104e-05, + "loss": 0.2767, + "step": 32760 + }, + { + "epoch": 1.4566386629328356, + "grad_norm": 0.17928001284599304, + "learning_rate": 6.861934146415693e-05, + "loss": 0.2766, + "step": 32770 + }, + { + "epoch": 1.4570831666444415, + "grad_norm": 0.20097149908542633, + "learning_rate": 6.86001580199866e-05, + "loss": 0.2764, + "step": 32780 + }, + { + "epoch": 1.4575276703560474, + "grad_norm": 0.2126709222793579, + "learning_rate": 6.858097139755798e-05, + "loss": 0.2748, + "step": 32790 + }, + { + "epoch": 1.4579721740676534, + "grad_norm": 0.2206454873085022, + "learning_rate": 6.856178160014955e-05, + "loss": 0.278, + "step": 32800 + }, + { + "epoch": 1.4584166777792595, + "grad_norm": 0.1967640370130539, + "learning_rate": 6.85425886310403e-05, + "loss": 0.2769, + "step": 32810 + }, + { + "epoch": 1.4588611814908654, + "grad_norm": 0.16467995941638947, + "learning_rate": 6.852339249350979e-05, + "loss": 0.276, + "step": 32820 + }, + { + "epoch": 1.4593056852024715, + "grad_norm": 0.18462003767490387, + "learning_rate": 6.850419319083812e-05, + "loss": 0.2761, + "step": 32830 + }, + { + "epoch": 1.4597501889140774, + "grad_norm": 0.19508497416973114, + "learning_rate": 6.848499072630592e-05, + "loss": 0.2745, + "step": 32840 + }, + { + "epoch": 1.4601946926256835, + "grad_norm": 0.16319969296455383, + "learning_rate": 6.846578510319439e-05, + "loss": 0.2777, + "step": 32850 + }, + { + "epoch": 1.4606391963372893, + "grad_norm": 0.21033839881420135, + "learning_rate": 6.844657632478519e-05, + "loss": 0.277, + "step": 32860 + }, + { + "epoch": 1.4610837000488954, + "grad_norm": 0.2015879601240158, + "learning_rate": 6.842736439436063e-05, + "loss": 0.2742, + "step": 32870 + }, + { + "epoch": 1.4615282037605013, + "grad_norm": 0.1889578104019165, + "learning_rate": 6.84081493152035e-05, + "loss": 0.2746, + "step": 32880 + }, + { + "epoch": 1.4619727074721074, + "grad_norm": 0.21052177250385284, + "learning_rate": 6.83889310905971e-05, + "loss": 0.2766, + "step": 32890 + }, + { + "epoch": 1.4624172111837135, + "grad_norm": 0.18287745118141174, + "learning_rate": 6.836970972382533e-05, + "loss": 0.276, + "step": 32900 + }, + { + "epoch": 1.4628617148953194, + "grad_norm": 0.17397545278072357, + "learning_rate": 6.835048521817257e-05, + "loss": 0.2756, + "step": 32910 + }, + { + "epoch": 1.4633062186069254, + "grad_norm": 0.17085453867912292, + "learning_rate": 6.833125757692379e-05, + "loss": 0.2756, + "step": 32920 + }, + { + "epoch": 1.4637507223185313, + "grad_norm": 0.18619666993618011, + "learning_rate": 6.831202680336441e-05, + "loss": 0.2744, + "step": 32930 + }, + { + "epoch": 1.4641952260301374, + "grad_norm": 0.17902208864688873, + "learning_rate": 6.829279290078052e-05, + "loss": 0.2752, + "step": 32940 + }, + { + "epoch": 1.4646397297417433, + "grad_norm": 0.20511724054813385, + "learning_rate": 6.827355587245863e-05, + "loss": 0.2766, + "step": 32950 + }, + { + "epoch": 1.4650842334533494, + "grad_norm": 0.18590356409549713, + "learning_rate": 6.82543157216858e-05, + "loss": 0.2744, + "step": 32960 + }, + { + "epoch": 1.4655287371649552, + "grad_norm": 0.1706106811761856, + "learning_rate": 6.823507245174969e-05, + "loss": 0.2783, + "step": 32970 + }, + { + "epoch": 1.4659732408765613, + "grad_norm": 0.20204460620880127, + "learning_rate": 6.821582606593841e-05, + "loss": 0.277, + "step": 32980 + }, + { + "epoch": 1.4664177445881674, + "grad_norm": 0.19859980046749115, + "learning_rate": 6.81965765675407e-05, + "loss": 0.2772, + "step": 32990 + }, + { + "epoch": 1.4668622482997733, + "grad_norm": 0.20619803667068481, + "learning_rate": 6.81773239598457e-05, + "loss": 0.2742, + "step": 33000 + }, + { + "epoch": 1.4673067520113792, + "grad_norm": 0.18264813721179962, + "learning_rate": 6.815806824614319e-05, + "loss": 0.2743, + "step": 33010 + }, + { + "epoch": 1.4677512557229853, + "grad_norm": 0.1536542922258377, + "learning_rate": 6.813880942972343e-05, + "loss": 0.2738, + "step": 33020 + }, + { + "epoch": 1.4681957594345914, + "grad_norm": 0.18914389610290527, + "learning_rate": 6.811954751387726e-05, + "loss": 0.2762, + "step": 33030 + }, + { + "epoch": 1.4686402631461972, + "grad_norm": 0.15505529940128326, + "learning_rate": 6.810028250189598e-05, + "loss": 0.2783, + "step": 33040 + }, + { + "epoch": 1.4690847668578033, + "grad_norm": 0.17012983560562134, + "learning_rate": 6.808101439707147e-05, + "loss": 0.2735, + "step": 33050 + }, + { + "epoch": 1.4695292705694092, + "grad_norm": 0.20296888053417206, + "learning_rate": 6.806174320269609e-05, + "loss": 0.2746, + "step": 33060 + }, + { + "epoch": 1.4699737742810153, + "grad_norm": 0.19269369542598724, + "learning_rate": 6.804246892206281e-05, + "loss": 0.2726, + "step": 33070 + }, + { + "epoch": 1.4704182779926214, + "grad_norm": 0.22036796808242798, + "learning_rate": 6.802319155846506e-05, + "loss": 0.2723, + "step": 33080 + }, + { + "epoch": 1.4708627817042272, + "grad_norm": 0.16081000864505768, + "learning_rate": 6.800391111519679e-05, + "loss": 0.2716, + "step": 33090 + }, + { + "epoch": 1.471307285415833, + "grad_norm": 0.18869765102863312, + "learning_rate": 6.798462759555253e-05, + "loss": 0.2759, + "step": 33100 + }, + { + "epoch": 1.4717517891274392, + "grad_norm": 0.17264463007450104, + "learning_rate": 6.79653410028273e-05, + "loss": 0.2755, + "step": 33110 + }, + { + "epoch": 1.4721962928390453, + "grad_norm": 0.17798900604248047, + "learning_rate": 6.794605134031663e-05, + "loss": 0.275, + "step": 33120 + }, + { + "epoch": 1.4726407965506512, + "grad_norm": 0.1691431999206543, + "learning_rate": 6.792675861131661e-05, + "loss": 0.2769, + "step": 33130 + }, + { + "epoch": 1.4730853002622573, + "grad_norm": 0.2078339010477066, + "learning_rate": 6.790746281912386e-05, + "loss": 0.2794, + "step": 33140 + }, + { + "epoch": 1.4735298039738631, + "grad_norm": 0.18709008395671844, + "learning_rate": 6.788816396703546e-05, + "loss": 0.2797, + "step": 33150 + }, + { + "epoch": 1.4739743076854692, + "grad_norm": 0.20137694478034973, + "learning_rate": 6.78688620583491e-05, + "loss": 0.2744, + "step": 33160 + }, + { + "epoch": 1.474418811397075, + "grad_norm": 0.20717482268810272, + "learning_rate": 6.784955709636292e-05, + "loss": 0.2735, + "step": 33170 + }, + { + "epoch": 1.4748633151086812, + "grad_norm": 0.19125552475452423, + "learning_rate": 6.783024908437564e-05, + "loss": 0.2763, + "step": 33180 + }, + { + "epoch": 1.475307818820287, + "grad_norm": 0.199490487575531, + "learning_rate": 6.781093802568641e-05, + "loss": 0.2757, + "step": 33190 + }, + { + "epoch": 1.4757523225318931, + "grad_norm": 0.20653140544891357, + "learning_rate": 6.779162392359504e-05, + "loss": 0.2767, + "step": 33200 + }, + { + "epoch": 1.4761968262434992, + "grad_norm": 0.19022367894649506, + "learning_rate": 6.777230678140172e-05, + "loss": 0.2768, + "step": 33210 + }, + { + "epoch": 1.476641329955105, + "grad_norm": 0.1823049783706665, + "learning_rate": 6.775298660240726e-05, + "loss": 0.2749, + "step": 33220 + }, + { + "epoch": 1.477085833666711, + "grad_norm": 0.19460399448871613, + "learning_rate": 6.773366338991292e-05, + "loss": 0.2774, + "step": 33230 + }, + { + "epoch": 1.477530337378317, + "grad_norm": 0.2064390778541565, + "learning_rate": 6.771433714722052e-05, + "loss": 0.2757, + "step": 33240 + }, + { + "epoch": 1.4779748410899232, + "grad_norm": 0.21351785957813263, + "learning_rate": 6.769500787763239e-05, + "loss": 0.2742, + "step": 33250 + }, + { + "epoch": 1.478419344801529, + "grad_norm": 0.16929034888744354, + "learning_rate": 6.76756755844514e-05, + "loss": 0.275, + "step": 33260 + }, + { + "epoch": 1.4788638485131351, + "grad_norm": 0.18593135476112366, + "learning_rate": 6.765634027098087e-05, + "loss": 0.279, + "step": 33270 + }, + { + "epoch": 1.479308352224741, + "grad_norm": 0.19666755199432373, + "learning_rate": 6.763700194052468e-05, + "loss": 0.2779, + "step": 33280 + }, + { + "epoch": 1.479752855936347, + "grad_norm": 0.20288176834583282, + "learning_rate": 6.761766059638723e-05, + "loss": 0.2756, + "step": 33290 + }, + { + "epoch": 1.4801973596479532, + "grad_norm": 0.1794430911540985, + "learning_rate": 6.759831624187345e-05, + "loss": 0.2773, + "step": 33300 + }, + { + "epoch": 1.480641863359559, + "grad_norm": 0.16766251623630524, + "learning_rate": 6.757896888028871e-05, + "loss": 0.2773, + "step": 33310 + }, + { + "epoch": 1.481086367071165, + "grad_norm": 0.21082773804664612, + "learning_rate": 6.7559618514939e-05, + "loss": 0.277, + "step": 33320 + }, + { + "epoch": 1.481530870782771, + "grad_norm": 0.20014965534210205, + "learning_rate": 6.754026514913073e-05, + "loss": 0.2752, + "step": 33330 + }, + { + "epoch": 1.481975374494377, + "grad_norm": 0.1939535140991211, + "learning_rate": 6.752090878617087e-05, + "loss": 0.2777, + "step": 33340 + }, + { + "epoch": 1.482419878205983, + "grad_norm": 0.19826999306678772, + "learning_rate": 6.75015494293669e-05, + "loss": 0.276, + "step": 33350 + }, + { + "epoch": 1.482864381917589, + "grad_norm": 0.17001274228096008, + "learning_rate": 6.74821870820268e-05, + "loss": 0.2747, + "step": 33360 + }, + { + "epoch": 1.483308885629195, + "grad_norm": 0.2404899150133133, + "learning_rate": 6.746282174745907e-05, + "loss": 0.2788, + "step": 33370 + }, + { + "epoch": 1.483753389340801, + "grad_norm": 0.18796534836292267, + "learning_rate": 6.744345342897271e-05, + "loss": 0.2751, + "step": 33380 + }, + { + "epoch": 1.4841978930524071, + "grad_norm": 0.16652226448059082, + "learning_rate": 6.742408212987724e-05, + "loss": 0.2752, + "step": 33390 + }, + { + "epoch": 1.484642396764013, + "grad_norm": 0.2012079656124115, + "learning_rate": 6.740470785348269e-05, + "loss": 0.2754, + "step": 33400 + }, + { + "epoch": 1.4850869004756189, + "grad_norm": 0.17599546909332275, + "learning_rate": 6.738533060309958e-05, + "loss": 0.2748, + "step": 33410 + }, + { + "epoch": 1.485531404187225, + "grad_norm": 0.16336975991725922, + "learning_rate": 6.736595038203894e-05, + "loss": 0.2746, + "step": 33420 + }, + { + "epoch": 1.485975907898831, + "grad_norm": 0.17753735184669495, + "learning_rate": 6.734656719361236e-05, + "loss": 0.2766, + "step": 33430 + }, + { + "epoch": 1.486420411610437, + "grad_norm": 0.1896440088748932, + "learning_rate": 6.732718104113189e-05, + "loss": 0.2729, + "step": 33440 + }, + { + "epoch": 1.486864915322043, + "grad_norm": 0.16824543476104736, + "learning_rate": 6.730779192791006e-05, + "loss": 0.2747, + "step": 33450 + }, + { + "epoch": 1.4873094190336489, + "grad_norm": 0.19617655873298645, + "learning_rate": 6.728839985725997e-05, + "loss": 0.2768, + "step": 33460 + }, + { + "epoch": 1.487753922745255, + "grad_norm": 0.18452021479606628, + "learning_rate": 6.726900483249517e-05, + "loss": 0.2747, + "step": 33470 + }, + { + "epoch": 1.4881984264568608, + "grad_norm": 0.15805989503860474, + "learning_rate": 6.724960685692976e-05, + "loss": 0.2711, + "step": 33480 + }, + { + "epoch": 1.488642930168467, + "grad_norm": 0.19285938143730164, + "learning_rate": 6.723020593387833e-05, + "loss": 0.2792, + "step": 33490 + }, + { + "epoch": 1.4890874338800728, + "grad_norm": 0.1850818544626236, + "learning_rate": 6.721080206665593e-05, + "loss": 0.2764, + "step": 33500 + }, + { + "epoch": 1.489531937591679, + "grad_norm": 0.18349255621433258, + "learning_rate": 6.719139525857819e-05, + "loss": 0.2773, + "step": 33510 + }, + { + "epoch": 1.489976441303285, + "grad_norm": 0.16005782783031464, + "learning_rate": 6.717198551296117e-05, + "loss": 0.2761, + "step": 33520 + }, + { + "epoch": 1.4904209450148909, + "grad_norm": 0.1982518881559372, + "learning_rate": 6.715257283312148e-05, + "loss": 0.2745, + "step": 33530 + }, + { + "epoch": 1.4908654487264967, + "grad_norm": 0.18703846633434296, + "learning_rate": 6.713315722237623e-05, + "loss": 0.2731, + "step": 33540 + }, + { + "epoch": 1.4913099524381028, + "grad_norm": 0.18339216709136963, + "learning_rate": 6.7113738684043e-05, + "loss": 0.2737, + "step": 33550 + }, + { + "epoch": 1.491754456149709, + "grad_norm": 0.1859813779592514, + "learning_rate": 6.709431722143989e-05, + "loss": 0.2738, + "step": 33560 + }, + { + "epoch": 1.4921989598613148, + "grad_norm": 0.17610593140125275, + "learning_rate": 6.70748928378855e-05, + "loss": 0.2744, + "step": 33570 + }, + { + "epoch": 1.4926434635729209, + "grad_norm": 0.15453089773654938, + "learning_rate": 6.705546553669891e-05, + "loss": 0.2751, + "step": 33580 + }, + { + "epoch": 1.4930879672845268, + "grad_norm": 0.22044122219085693, + "learning_rate": 6.703603532119974e-05, + "loss": 0.2768, + "step": 33590 + }, + { + "epoch": 1.4935324709961328, + "grad_norm": 0.195725679397583, + "learning_rate": 6.701660219470808e-05, + "loss": 0.2744, + "step": 33600 + }, + { + "epoch": 1.493976974707739, + "grad_norm": 0.18762175738811493, + "learning_rate": 6.69971661605445e-05, + "loss": 0.2775, + "step": 33610 + }, + { + "epoch": 1.4944214784193448, + "grad_norm": 0.20287956297397614, + "learning_rate": 6.697772722203008e-05, + "loss": 0.2752, + "step": 33620 + }, + { + "epoch": 1.4948659821309507, + "grad_norm": 0.20184768736362457, + "learning_rate": 6.695828538248643e-05, + "loss": 0.2757, + "step": 33630 + }, + { + "epoch": 1.4953104858425568, + "grad_norm": 0.20921310782432556, + "learning_rate": 6.693884064523563e-05, + "loss": 0.2757, + "step": 33640 + }, + { + "epoch": 1.4957549895541629, + "grad_norm": 0.18104718625545502, + "learning_rate": 6.691939301360023e-05, + "loss": 0.2775, + "step": 33650 + }, + { + "epoch": 1.4961994932657687, + "grad_norm": 0.23145100474357605, + "learning_rate": 6.689994249090333e-05, + "loss": 0.2731, + "step": 33660 + }, + { + "epoch": 1.4966439969773748, + "grad_norm": 0.1816316694021225, + "learning_rate": 6.688048908046845e-05, + "loss": 0.2742, + "step": 33670 + }, + { + "epoch": 1.4970885006889807, + "grad_norm": 0.14502371847629547, + "learning_rate": 6.686103278561969e-05, + "loss": 0.2728, + "step": 33680 + }, + { + "epoch": 1.4975330044005868, + "grad_norm": 0.16701501607894897, + "learning_rate": 6.684157360968156e-05, + "loss": 0.2748, + "step": 33690 + }, + { + "epoch": 1.4979775081121927, + "grad_norm": 0.18884946405887604, + "learning_rate": 6.682211155597911e-05, + "loss": 0.277, + "step": 33700 + }, + { + "epoch": 1.4984220118237987, + "grad_norm": 0.19863677024841309, + "learning_rate": 6.680264662783789e-05, + "loss": 0.2749, + "step": 33710 + }, + { + "epoch": 1.4988665155354046, + "grad_norm": 0.17093218863010406, + "learning_rate": 6.678317882858391e-05, + "loss": 0.2749, + "step": 33720 + }, + { + "epoch": 1.4993110192470107, + "grad_norm": 0.24851904809474945, + "learning_rate": 6.67637081615437e-05, + "loss": 0.275, + "step": 33730 + }, + { + "epoch": 1.4997555229586168, + "grad_norm": 0.1869278997182846, + "learning_rate": 6.674423463004427e-05, + "loss": 0.2765, + "step": 33740 + }, + { + "epoch": 1.5002000266702227, + "grad_norm": 0.19008758664131165, + "learning_rate": 6.672475823741308e-05, + "loss": 0.2738, + "step": 33750 + }, + { + "epoch": 1.5006445303818285, + "grad_norm": 0.1624784767627716, + "learning_rate": 6.670527898697811e-05, + "loss": 0.2736, + "step": 33760 + }, + { + "epoch": 1.5010890340934346, + "grad_norm": 0.2023814618587494, + "learning_rate": 6.668579688206788e-05, + "loss": 0.2753, + "step": 33770 + }, + { + "epoch": 1.5015335378050407, + "grad_norm": 0.1674167960882187, + "learning_rate": 6.666631192601131e-05, + "loss": 0.275, + "step": 33780 + }, + { + "epoch": 1.5019780415166468, + "grad_norm": 0.17083705961704254, + "learning_rate": 6.664682412213785e-05, + "loss": 0.2748, + "step": 33790 + }, + { + "epoch": 1.5024225452282527, + "grad_norm": 0.16217108070850372, + "learning_rate": 6.662733347377745e-05, + "loss": 0.2764, + "step": 33800 + }, + { + "epoch": 1.5028670489398586, + "grad_norm": 0.19204705953598022, + "learning_rate": 6.660783998426051e-05, + "loss": 0.2738, + "step": 33810 + }, + { + "epoch": 1.5033115526514647, + "grad_norm": 0.18303996324539185, + "learning_rate": 6.658834365691794e-05, + "loss": 0.2754, + "step": 33820 + }, + { + "epoch": 1.5037560563630707, + "grad_norm": 0.2089681327342987, + "learning_rate": 6.656884449508115e-05, + "loss": 0.2718, + "step": 33830 + }, + { + "epoch": 1.5042005600746766, + "grad_norm": 0.18005073070526123, + "learning_rate": 6.654934250208198e-05, + "loss": 0.2745, + "step": 33840 + }, + { + "epoch": 1.5046450637862825, + "grad_norm": 0.20266519486904144, + "learning_rate": 6.65298376812528e-05, + "loss": 0.2743, + "step": 33850 + }, + { + "epoch": 1.5050895674978886, + "grad_norm": 0.16704924404621124, + "learning_rate": 6.651033003592646e-05, + "loss": 0.2746, + "step": 33860 + }, + { + "epoch": 1.5055340712094947, + "grad_norm": 0.17827199399471283, + "learning_rate": 6.649081956943626e-05, + "loss": 0.2721, + "step": 33870 + }, + { + "epoch": 1.5059785749211005, + "grad_norm": 0.21596522629261017, + "learning_rate": 6.647130628511604e-05, + "loss": 0.2748, + "step": 33880 + }, + { + "epoch": 1.5064230786327066, + "grad_norm": 0.18610674142837524, + "learning_rate": 6.645179018630005e-05, + "loss": 0.2786, + "step": 33890 + }, + { + "epoch": 1.5068675823443125, + "grad_norm": 0.16874347627162933, + "learning_rate": 6.643227127632309e-05, + "loss": 0.2752, + "step": 33900 + }, + { + "epoch": 1.5073120860559186, + "grad_norm": 0.15773968398571014, + "learning_rate": 6.641274955852038e-05, + "loss": 0.2714, + "step": 33910 + }, + { + "epoch": 1.5077565897675247, + "grad_norm": 0.15786480903625488, + "learning_rate": 6.639322503622768e-05, + "loss": 0.2736, + "step": 33920 + }, + { + "epoch": 1.5082010934791306, + "grad_norm": 0.21371020376682281, + "learning_rate": 6.637369771278116e-05, + "loss": 0.2752, + "step": 33930 + }, + { + "epoch": 1.5086455971907364, + "grad_norm": 0.22928328812122345, + "learning_rate": 6.635416759151751e-05, + "loss": 0.2774, + "step": 33940 + }, + { + "epoch": 1.5090901009023425, + "grad_norm": 0.1927756518125534, + "learning_rate": 6.633463467577394e-05, + "loss": 0.2763, + "step": 33950 + }, + { + "epoch": 1.5095346046139486, + "grad_norm": 0.21540270745754242, + "learning_rate": 6.631509896888803e-05, + "loss": 0.2746, + "step": 33960 + }, + { + "epoch": 1.5099791083255545, + "grad_norm": 0.21568530797958374, + "learning_rate": 6.629556047419794e-05, + "loss": 0.2768, + "step": 33970 + }, + { + "epoch": 1.5104236120371604, + "grad_norm": 0.21736609935760498, + "learning_rate": 6.627601919504223e-05, + "loss": 0.2732, + "step": 33980 + }, + { + "epoch": 1.5108681157487664, + "grad_norm": 0.20238971710205078, + "learning_rate": 6.625647513476001e-05, + "loss": 0.2764, + "step": 33990 + }, + { + "epoch": 1.5113126194603725, + "grad_norm": 0.15596939623355865, + "learning_rate": 6.62369282966908e-05, + "loss": 0.2774, + "step": 34000 + }, + { + "epoch": 1.5117571231719786, + "grad_norm": 0.18097306787967682, + "learning_rate": 6.621737868417464e-05, + "loss": 0.2776, + "step": 34010 + }, + { + "epoch": 1.5122016268835845, + "grad_norm": 0.20099319517612457, + "learning_rate": 6.619782630055198e-05, + "loss": 0.2775, + "step": 34020 + }, + { + "epoch": 1.5126461305951904, + "grad_norm": 0.21920278668403625, + "learning_rate": 6.617827114916382e-05, + "loss": 0.2759, + "step": 34030 + }, + { + "epoch": 1.5130906343067965, + "grad_norm": 0.18775896728038788, + "learning_rate": 6.615871323335161e-05, + "loss": 0.2761, + "step": 34040 + }, + { + "epoch": 1.5135351380184026, + "grad_norm": 0.1995992511510849, + "learning_rate": 6.613915255645725e-05, + "loss": 0.2718, + "step": 34050 + }, + { + "epoch": 1.5139796417300084, + "grad_norm": 0.20031654834747314, + "learning_rate": 6.611958912182312e-05, + "loss": 0.2766, + "step": 34060 + }, + { + "epoch": 1.5144241454416143, + "grad_norm": 0.16742180287837982, + "learning_rate": 6.610002293279207e-05, + "loss": 0.2762, + "step": 34070 + }, + { + "epoch": 1.5148686491532204, + "grad_norm": 0.16135916113853455, + "learning_rate": 6.608045399270746e-05, + "loss": 0.2764, + "step": 34080 + }, + { + "epoch": 1.5153131528648265, + "grad_norm": 0.19404080510139465, + "learning_rate": 6.606088230491304e-05, + "loss": 0.2715, + "step": 34090 + }, + { + "epoch": 1.5157576565764326, + "grad_norm": 0.18804875016212463, + "learning_rate": 6.604130787275312e-05, + "loss": 0.2743, + "step": 34100 + }, + { + "epoch": 1.5162021602880384, + "grad_norm": 0.16840897500514984, + "learning_rate": 6.602173069957242e-05, + "loss": 0.2745, + "step": 34110 + }, + { + "epoch": 1.5166466639996443, + "grad_norm": 0.19458694756031036, + "learning_rate": 6.600215078871612e-05, + "loss": 0.273, + "step": 34120 + }, + { + "epoch": 1.5170911677112504, + "grad_norm": 0.20252814888954163, + "learning_rate": 6.598256814352992e-05, + "loss": 0.2722, + "step": 34130 + }, + { + "epoch": 1.5175356714228565, + "grad_norm": 0.18672922253608704, + "learning_rate": 6.596298276735995e-05, + "loss": 0.2762, + "step": 34140 + }, + { + "epoch": 1.5179801751344624, + "grad_norm": 0.20998528599739075, + "learning_rate": 6.594339466355282e-05, + "loss": 0.2754, + "step": 34150 + }, + { + "epoch": 1.5184246788460682, + "grad_norm": 0.17450085282325745, + "learning_rate": 6.592380383545558e-05, + "loss": 0.2739, + "step": 34160 + }, + { + "epoch": 1.5188691825576743, + "grad_norm": 0.1779717057943344, + "learning_rate": 6.590421028641577e-05, + "loss": 0.278, + "step": 34170 + }, + { + "epoch": 1.5193136862692804, + "grad_norm": 0.19307149946689606, + "learning_rate": 6.588461401978143e-05, + "loss": 0.2736, + "step": 34180 + }, + { + "epoch": 1.5197581899808863, + "grad_norm": 0.178066685795784, + "learning_rate": 6.586501503890099e-05, + "loss": 0.2734, + "step": 34190 + }, + { + "epoch": 1.5202026936924922, + "grad_norm": 0.16900183260440826, + "learning_rate": 6.584541334712338e-05, + "loss": 0.2759, + "step": 34200 + }, + { + "epoch": 1.5206471974040983, + "grad_norm": 0.22744673490524292, + "learning_rate": 6.582580894779802e-05, + "loss": 0.2807, + "step": 34210 + }, + { + "epoch": 1.5210917011157044, + "grad_norm": 0.19994473457336426, + "learning_rate": 6.580620184427473e-05, + "loss": 0.2724, + "step": 34220 + }, + { + "epoch": 1.5215362048273104, + "grad_norm": 0.1897166520357132, + "learning_rate": 6.578659203990385e-05, + "loss": 0.2721, + "step": 34230 + }, + { + "epoch": 1.5219807085389163, + "grad_norm": 0.23694084584712982, + "learning_rate": 6.576697953803615e-05, + "loss": 0.2727, + "step": 34240 + }, + { + "epoch": 1.5224252122505222, + "grad_norm": 0.24059735238552094, + "learning_rate": 6.57473643420229e-05, + "loss": 0.2744, + "step": 34250 + }, + { + "epoch": 1.5228697159621283, + "grad_norm": 0.1877051591873169, + "learning_rate": 6.572774645521574e-05, + "loss": 0.275, + "step": 34260 + }, + { + "epoch": 1.5233142196737344, + "grad_norm": 0.17307600378990173, + "learning_rate": 6.570812588096688e-05, + "loss": 0.2781, + "step": 34270 + }, + { + "epoch": 1.5237587233853402, + "grad_norm": 0.16990123689174652, + "learning_rate": 6.568850262262893e-05, + "loss": 0.2742, + "step": 34280 + }, + { + "epoch": 1.5242032270969461, + "grad_norm": 0.15348856151103973, + "learning_rate": 6.566887668355497e-05, + "loss": 0.2742, + "step": 34290 + }, + { + "epoch": 1.5246477308085522, + "grad_norm": 0.16872969269752502, + "learning_rate": 6.564924806709851e-05, + "loss": 0.2732, + "step": 34300 + }, + { + "epoch": 1.5250922345201583, + "grad_norm": 0.20555265247821808, + "learning_rate": 6.562961677661359e-05, + "loss": 0.2744, + "step": 34310 + }, + { + "epoch": 1.5255367382317644, + "grad_norm": 0.18643629550933838, + "learning_rate": 6.56099828154546e-05, + "loss": 0.2738, + "step": 34320 + }, + { + "epoch": 1.5259812419433703, + "grad_norm": 0.17266497015953064, + "learning_rate": 6.55903461869765e-05, + "loss": 0.2735, + "step": 34330 + }, + { + "epoch": 1.5264257456549761, + "grad_norm": 0.18544524908065796, + "learning_rate": 6.557070689453465e-05, + "loss": 0.276, + "step": 34340 + }, + { + "epoch": 1.5268702493665822, + "grad_norm": 0.17656581103801727, + "learning_rate": 6.555106494148482e-05, + "loss": 0.2749, + "step": 34350 + }, + { + "epoch": 1.5273147530781883, + "grad_norm": 0.16552166640758514, + "learning_rate": 6.553142033118333e-05, + "loss": 0.2725, + "step": 34360 + }, + { + "epoch": 1.5277592567897942, + "grad_norm": 0.18616116046905518, + "learning_rate": 6.551177306698688e-05, + "loss": 0.2749, + "step": 34370 + }, + { + "epoch": 1.5282037605014, + "grad_norm": 0.19902808964252472, + "learning_rate": 6.549212315225267e-05, + "loss": 0.2732, + "step": 34380 + }, + { + "epoch": 1.5286482642130061, + "grad_norm": 0.1983863115310669, + "learning_rate": 6.547247059033833e-05, + "loss": 0.2694, + "step": 34390 + }, + { + "epoch": 1.5290927679246122, + "grad_norm": 0.1913273185491562, + "learning_rate": 6.545281538460193e-05, + "loss": 0.2729, + "step": 34400 + }, + { + "epoch": 1.529537271636218, + "grad_norm": 0.2036716789007187, + "learning_rate": 6.543315753840202e-05, + "loss": 0.2754, + "step": 34410 + }, + { + "epoch": 1.5299817753478242, + "grad_norm": 0.16770300269126892, + "learning_rate": 6.541349705509758e-05, + "loss": 0.2759, + "step": 34420 + }, + { + "epoch": 1.53042627905943, + "grad_norm": 0.16954387724399567, + "learning_rate": 6.539383393804805e-05, + "loss": 0.275, + "step": 34430 + }, + { + "epoch": 1.5308707827710362, + "grad_norm": 0.18479149043560028, + "learning_rate": 6.537416819061333e-05, + "loss": 0.276, + "step": 34440 + }, + { + "epoch": 1.5313152864826423, + "grad_norm": 0.17801862955093384, + "learning_rate": 6.535449981615375e-05, + "loss": 0.2755, + "step": 34450 + }, + { + "epoch": 1.5317597901942481, + "grad_norm": 0.17292271554470062, + "learning_rate": 6.53348288180301e-05, + "loss": 0.2747, + "step": 34460 + }, + { + "epoch": 1.532204293905854, + "grad_norm": 0.16550005972385406, + "learning_rate": 6.531515519960361e-05, + "loss": 0.2753, + "step": 34470 + }, + { + "epoch": 1.53264879761746, + "grad_norm": 0.2082495242357254, + "learning_rate": 6.529547896423597e-05, + "loss": 0.2749, + "step": 34480 + }, + { + "epoch": 1.5330933013290662, + "grad_norm": 0.19444820284843445, + "learning_rate": 6.52758001152893e-05, + "loss": 0.2743, + "step": 34490 + }, + { + "epoch": 1.533537805040672, + "grad_norm": 0.1796576976776123, + "learning_rate": 6.525611865612618e-05, + "loss": 0.2721, + "step": 34500 + }, + { + "epoch": 1.533982308752278, + "grad_norm": 0.18644876778125763, + "learning_rate": 6.523643459010966e-05, + "loss": 0.2761, + "step": 34510 + }, + { + "epoch": 1.534426812463884, + "grad_norm": 0.18503816425800323, + "learning_rate": 6.521674792060317e-05, + "loss": 0.2757, + "step": 34520 + }, + { + "epoch": 1.53487131617549, + "grad_norm": 0.23048894107341766, + "learning_rate": 6.519705865097063e-05, + "loss": 0.2757, + "step": 34530 + }, + { + "epoch": 1.5353158198870962, + "grad_norm": 0.2045324146747589, + "learning_rate": 6.517736678457641e-05, + "loss": 0.2742, + "step": 34540 + }, + { + "epoch": 1.535760323598702, + "grad_norm": 0.18053725361824036, + "learning_rate": 6.515767232478534e-05, + "loss": 0.2769, + "step": 34550 + }, + { + "epoch": 1.536204827310308, + "grad_norm": 0.25593671202659607, + "learning_rate": 6.51379752749626e-05, + "loss": 0.2764, + "step": 34560 + }, + { + "epoch": 1.536649331021914, + "grad_norm": 0.19561491906642914, + "learning_rate": 6.511827563847393e-05, + "loss": 0.2748, + "step": 34570 + }, + { + "epoch": 1.5370938347335201, + "grad_norm": 0.2086084634065628, + "learning_rate": 6.509857341868542e-05, + "loss": 0.2744, + "step": 34580 + }, + { + "epoch": 1.537538338445126, + "grad_norm": 0.19562828540802002, + "learning_rate": 6.507886861896367e-05, + "loss": 0.2768, + "step": 34590 + }, + { + "epoch": 1.5379828421567319, + "grad_norm": 0.19745036959648132, + "learning_rate": 6.505916124267567e-05, + "loss": 0.2721, + "step": 34600 + }, + { + "epoch": 1.538427345868338, + "grad_norm": 0.1889752894639969, + "learning_rate": 6.503945129318891e-05, + "loss": 0.2751, + "step": 34610 + }, + { + "epoch": 1.538871849579944, + "grad_norm": 0.19222460687160492, + "learning_rate": 6.501973877387122e-05, + "loss": 0.2768, + "step": 34620 + }, + { + "epoch": 1.5393163532915501, + "grad_norm": 0.19163966178894043, + "learning_rate": 6.500002368809098e-05, + "loss": 0.2724, + "step": 34630 + }, + { + "epoch": 1.539760857003156, + "grad_norm": 0.19379670917987823, + "learning_rate": 6.498030603921694e-05, + "loss": 0.2723, + "step": 34640 + }, + { + "epoch": 1.5402053607147619, + "grad_norm": 0.20301799476146698, + "learning_rate": 6.496058583061832e-05, + "loss": 0.2764, + "step": 34650 + }, + { + "epoch": 1.540649864426368, + "grad_norm": 0.17827823758125305, + "learning_rate": 6.494086306566475e-05, + "loss": 0.2759, + "step": 34660 + }, + { + "epoch": 1.541094368137974, + "grad_norm": 0.16489847004413605, + "learning_rate": 6.492113774772632e-05, + "loss": 0.2752, + "step": 34670 + }, + { + "epoch": 1.54153887184958, + "grad_norm": 0.2106565535068512, + "learning_rate": 6.490140988017354e-05, + "loss": 0.2733, + "step": 34680 + }, + { + "epoch": 1.5419833755611858, + "grad_norm": 0.20308257639408112, + "learning_rate": 6.488167946637736e-05, + "loss": 0.2728, + "step": 34690 + }, + { + "epoch": 1.542427879272792, + "grad_norm": 0.2096354365348816, + "learning_rate": 6.486194650970915e-05, + "loss": 0.2742, + "step": 34700 + }, + { + "epoch": 1.542872382984398, + "grad_norm": 0.16826677322387695, + "learning_rate": 6.48422110135408e-05, + "loss": 0.2748, + "step": 34710 + }, + { + "epoch": 1.5433168866960039, + "grad_norm": 0.1855151504278183, + "learning_rate": 6.482247298124451e-05, + "loss": 0.2737, + "step": 34720 + }, + { + "epoch": 1.54376139040761, + "grad_norm": 0.19639693200588226, + "learning_rate": 6.480273241619297e-05, + "loss": 0.2757, + "step": 34730 + }, + { + "epoch": 1.5442058941192158, + "grad_norm": 0.19471345841884613, + "learning_rate": 6.478298932175933e-05, + "loss": 0.2759, + "step": 34740 + }, + { + "epoch": 1.544650397830822, + "grad_norm": 0.18546244502067566, + "learning_rate": 6.476324370131712e-05, + "loss": 0.2744, + "step": 34750 + }, + { + "epoch": 1.545094901542428, + "grad_norm": 0.17475438117980957, + "learning_rate": 6.474349555824036e-05, + "loss": 0.2737, + "step": 34760 + }, + { + "epoch": 1.5455394052540339, + "grad_norm": 0.15330472588539124, + "learning_rate": 6.472374489590342e-05, + "loss": 0.2724, + "step": 34770 + }, + { + "epoch": 1.5459839089656398, + "grad_norm": 0.18398374319076538, + "learning_rate": 6.470399171768118e-05, + "loss": 0.2744, + "step": 34780 + }, + { + "epoch": 1.5464284126772458, + "grad_norm": 0.1617831140756607, + "learning_rate": 6.468423602694891e-05, + "loss": 0.2738, + "step": 34790 + }, + { + "epoch": 1.546872916388852, + "grad_norm": 0.18586580455303192, + "learning_rate": 6.466447782708232e-05, + "loss": 0.2767, + "step": 34800 + }, + { + "epoch": 1.5473174201004578, + "grad_norm": 0.19252443313598633, + "learning_rate": 6.464471712145754e-05, + "loss": 0.2719, + "step": 34810 + }, + { + "epoch": 1.5477619238120637, + "grad_norm": 0.18680062890052795, + "learning_rate": 6.462495391345114e-05, + "loss": 0.2738, + "step": 34820 + }, + { + "epoch": 1.5482064275236698, + "grad_norm": 0.2007714956998825, + "learning_rate": 6.46051882064401e-05, + "loss": 0.271, + "step": 34830 + }, + { + "epoch": 1.5486509312352759, + "grad_norm": 0.16846376657485962, + "learning_rate": 6.458542000380186e-05, + "loss": 0.2747, + "step": 34840 + }, + { + "epoch": 1.549095434946882, + "grad_norm": 0.16455316543579102, + "learning_rate": 6.456564930891424e-05, + "loss": 0.2734, + "step": 34850 + }, + { + "epoch": 1.5495399386584878, + "grad_norm": 0.15731365978717804, + "learning_rate": 6.454587612515555e-05, + "loss": 0.2719, + "step": 34860 + }, + { + "epoch": 1.5499844423700937, + "grad_norm": 0.15577787160873413, + "learning_rate": 6.452610045590444e-05, + "loss": 0.2735, + "step": 34870 + }, + { + "epoch": 1.5504289460816998, + "grad_norm": 0.1821681261062622, + "learning_rate": 6.450632230454005e-05, + "loss": 0.2711, + "step": 34880 + }, + { + "epoch": 1.5508734497933059, + "grad_norm": 0.1708340048789978, + "learning_rate": 6.448654167444195e-05, + "loss": 0.2713, + "step": 34890 + }, + { + "epoch": 1.5513179535049118, + "grad_norm": 0.14854083955287933, + "learning_rate": 6.446675856899005e-05, + "loss": 0.271, + "step": 34900 + }, + { + "epoch": 1.5517624572165176, + "grad_norm": 0.18729835748672485, + "learning_rate": 6.444697299156481e-05, + "loss": 0.2741, + "step": 34910 + }, + { + "epoch": 1.5522069609281237, + "grad_norm": 0.1855032742023468, + "learning_rate": 6.442718494554701e-05, + "loss": 0.2752, + "step": 34920 + }, + { + "epoch": 1.5526514646397298, + "grad_norm": 0.2062237709760666, + "learning_rate": 6.440739443431787e-05, + "loss": 0.2751, + "step": 34930 + }, + { + "epoch": 1.553095968351336, + "grad_norm": 0.1714373677968979, + "learning_rate": 6.438760146125906e-05, + "loss": 0.2729, + "step": 34940 + }, + { + "epoch": 1.5535404720629418, + "grad_norm": 0.15693843364715576, + "learning_rate": 6.436780602975267e-05, + "loss": 0.2715, + "step": 34950 + }, + { + "epoch": 1.5539849757745476, + "grad_norm": 0.19769997894763947, + "learning_rate": 6.43480081431812e-05, + "loss": 0.2745, + "step": 34960 + }, + { + "epoch": 1.5544294794861537, + "grad_norm": 0.1851397454738617, + "learning_rate": 6.432820780492756e-05, + "loss": 0.2733, + "step": 34970 + }, + { + "epoch": 1.5548739831977598, + "grad_norm": 0.17255577445030212, + "learning_rate": 6.430840501837506e-05, + "loss": 0.277, + "step": 34980 + }, + { + "epoch": 1.5553184869093657, + "grad_norm": 0.1385740488767624, + "learning_rate": 6.428859978690748e-05, + "loss": 0.2712, + "step": 34990 + }, + { + "epoch": 1.5557629906209716, + "grad_norm": 0.16265138983726501, + "learning_rate": 6.426879211390901e-05, + "loss": 0.2741, + "step": 35000 + }, + { + "epoch": 1.5562074943325777, + "grad_norm": 0.18366490304470062, + "learning_rate": 6.424898200276422e-05, + "loss": 0.2735, + "step": 35010 + }, + { + "epoch": 1.5566519980441837, + "grad_norm": 0.18708401918411255, + "learning_rate": 6.42291694568581e-05, + "loss": 0.274, + "step": 35020 + }, + { + "epoch": 1.5570965017557896, + "grad_norm": 0.192853644490242, + "learning_rate": 6.42093544795761e-05, + "loss": 0.2731, + "step": 35030 + }, + { + "epoch": 1.5575410054673955, + "grad_norm": 0.21818365156650543, + "learning_rate": 6.418953707430403e-05, + "loss": 0.2762, + "step": 35040 + }, + { + "epoch": 1.5579855091790016, + "grad_norm": 0.17380768060684204, + "learning_rate": 6.416971724442819e-05, + "loss": 0.2756, + "step": 35050 + }, + { + "epoch": 1.5584300128906077, + "grad_norm": 0.16791407763957977, + "learning_rate": 6.414989499333519e-05, + "loss": 0.2752, + "step": 35060 + }, + { + "epoch": 1.5588745166022138, + "grad_norm": 0.15499451756477356, + "learning_rate": 6.413007032441214e-05, + "loss": 0.2737, + "step": 35070 + }, + { + "epoch": 1.5593190203138196, + "grad_norm": 0.18127906322479248, + "learning_rate": 6.411024324104653e-05, + "loss": 0.273, + "step": 35080 + }, + { + "epoch": 1.5597635240254255, + "grad_norm": 0.19037578999996185, + "learning_rate": 6.409041374662628e-05, + "loss": 0.272, + "step": 35090 + }, + { + "epoch": 1.5602080277370316, + "grad_norm": 0.19522671401500702, + "learning_rate": 6.407058184453967e-05, + "loss": 0.2743, + "step": 35100 + }, + { + "epoch": 1.5606525314486377, + "grad_norm": 0.19298729300498962, + "learning_rate": 6.405074753817548e-05, + "loss": 0.2721, + "step": 35110 + }, + { + "epoch": 1.5610970351602436, + "grad_norm": 0.21039487421512604, + "learning_rate": 6.40309108309228e-05, + "loss": 0.2743, + "step": 35120 + }, + { + "epoch": 1.5615415388718494, + "grad_norm": 0.2032345086336136, + "learning_rate": 6.401107172617122e-05, + "loss": 0.2733, + "step": 35130 + }, + { + "epoch": 1.5619860425834555, + "grad_norm": 0.1808764785528183, + "learning_rate": 6.399123022731068e-05, + "loss": 0.2755, + "step": 35140 + }, + { + "epoch": 1.5624305462950616, + "grad_norm": 0.18543687462806702, + "learning_rate": 6.397138633773157e-05, + "loss": 0.2741, + "step": 35150 + }, + { + "epoch": 1.5628750500066677, + "grad_norm": 0.2120916098356247, + "learning_rate": 6.395154006082463e-05, + "loss": 0.2725, + "step": 35160 + }, + { + "epoch": 1.5633195537182736, + "grad_norm": 0.208845317363739, + "learning_rate": 6.393169139998109e-05, + "loss": 0.2735, + "step": 35170 + }, + { + "epoch": 1.5637640574298795, + "grad_norm": 0.20714281499385834, + "learning_rate": 6.39118403585925e-05, + "loss": 0.2722, + "step": 35180 + }, + { + "epoch": 1.5642085611414855, + "grad_norm": 0.18409262597560883, + "learning_rate": 6.38919869400509e-05, + "loss": 0.2738, + "step": 35190 + }, + { + "epoch": 1.5646530648530916, + "grad_norm": 0.17057634890079498, + "learning_rate": 6.387213114774865e-05, + "loss": 0.2717, + "step": 35200 + }, + { + "epoch": 1.5650975685646975, + "grad_norm": 0.18395066261291504, + "learning_rate": 6.385227298507863e-05, + "loss": 0.2735, + "step": 35210 + }, + { + "epoch": 1.5655420722763034, + "grad_norm": 0.16620267927646637, + "learning_rate": 6.3832412455434e-05, + "loss": 0.2745, + "step": 35220 + }, + { + "epoch": 1.5659865759879095, + "grad_norm": 0.19695422053337097, + "learning_rate": 6.381254956220841e-05, + "loss": 0.2743, + "step": 35230 + }, + { + "epoch": 1.5664310796995156, + "grad_norm": 0.18326374888420105, + "learning_rate": 6.379268430879586e-05, + "loss": 0.2705, + "step": 35240 + }, + { + "epoch": 1.5668755834111214, + "grad_norm": 0.20258906483650208, + "learning_rate": 6.37728166985908e-05, + "loss": 0.2751, + "step": 35250 + }, + { + "epoch": 1.5673200871227275, + "grad_norm": 0.17957398295402527, + "learning_rate": 6.375294673498804e-05, + "loss": 0.2735, + "step": 35260 + }, + { + "epoch": 1.5677645908343334, + "grad_norm": 0.2002374678850174, + "learning_rate": 6.373307442138284e-05, + "loss": 0.2729, + "step": 35270 + }, + { + "epoch": 1.5682090945459395, + "grad_norm": 0.2178662270307541, + "learning_rate": 6.371319976117081e-05, + "loss": 0.2752, + "step": 35280 + }, + { + "epoch": 1.5686535982575456, + "grad_norm": 0.19380812346935272, + "learning_rate": 6.3693322757748e-05, + "loss": 0.2739, + "step": 35290 + }, + { + "epoch": 1.5690981019691514, + "grad_norm": 0.193237766623497, + "learning_rate": 6.367344341451086e-05, + "loss": 0.2759, + "step": 35300 + }, + { + "epoch": 1.5695426056807573, + "grad_norm": 0.21210838854312897, + "learning_rate": 6.36535617348562e-05, + "loss": 0.2758, + "step": 35310 + }, + { + "epoch": 1.5699871093923634, + "grad_norm": 0.18735161423683167, + "learning_rate": 6.363367772218128e-05, + "loss": 0.2741, + "step": 35320 + }, + { + "epoch": 1.5704316131039695, + "grad_norm": 0.18327488005161285, + "learning_rate": 6.36137913798837e-05, + "loss": 0.2724, + "step": 35330 + }, + { + "epoch": 1.5708761168155754, + "grad_norm": 0.20089203119277954, + "learning_rate": 6.359390271136151e-05, + "loss": 0.2715, + "step": 35340 + }, + { + "epoch": 1.5713206205271812, + "grad_norm": 0.1830112487077713, + "learning_rate": 6.357401172001314e-05, + "loss": 0.27, + "step": 35350 + }, + { + "epoch": 1.5717651242387873, + "grad_norm": 0.19870224595069885, + "learning_rate": 6.355411840923742e-05, + "loss": 0.2741, + "step": 35360 + }, + { + "epoch": 1.5722096279503934, + "grad_norm": 0.2000361531972885, + "learning_rate": 6.353422278243358e-05, + "loss": 0.2788, + "step": 35370 + }, + { + "epoch": 1.5726541316619995, + "grad_norm": 0.17159543931484222, + "learning_rate": 6.351432484300121e-05, + "loss": 0.2729, + "step": 35380 + }, + { + "epoch": 1.5730986353736054, + "grad_norm": 0.18393008410930634, + "learning_rate": 6.349442459434036e-05, + "loss": 0.2738, + "step": 35390 + }, + { + "epoch": 1.5735431390852113, + "grad_norm": 0.1698843538761139, + "learning_rate": 6.34745220398514e-05, + "loss": 0.2715, + "step": 35400 + }, + { + "epoch": 1.5739876427968174, + "grad_norm": 0.18862470984458923, + "learning_rate": 6.345461718293518e-05, + "loss": 0.2737, + "step": 35410 + }, + { + "epoch": 1.5744321465084234, + "grad_norm": 0.1432826668024063, + "learning_rate": 6.343471002699286e-05, + "loss": 0.2726, + "step": 35420 + }, + { + "epoch": 1.5748766502200293, + "grad_norm": 0.15150229632854462, + "learning_rate": 6.341480057542602e-05, + "loss": 0.2727, + "step": 35430 + }, + { + "epoch": 1.5753211539316352, + "grad_norm": 0.19597174227237701, + "learning_rate": 6.339488883163667e-05, + "loss": 0.2742, + "step": 35440 + }, + { + "epoch": 1.5757656576432413, + "grad_norm": 0.14888468384742737, + "learning_rate": 6.337497479902716e-05, + "loss": 0.2735, + "step": 35450 + }, + { + "epoch": 1.5762101613548474, + "grad_norm": 0.1794658750295639, + "learning_rate": 6.335505848100027e-05, + "loss": 0.2728, + "step": 35460 + }, + { + "epoch": 1.5766546650664535, + "grad_norm": 0.1650031954050064, + "learning_rate": 6.333513988095915e-05, + "loss": 0.2736, + "step": 35470 + }, + { + "epoch": 1.5770991687780593, + "grad_norm": 0.15855352580547333, + "learning_rate": 6.331521900230735e-05, + "loss": 0.2718, + "step": 35480 + }, + { + "epoch": 1.5775436724896652, + "grad_norm": 0.18296696245670319, + "learning_rate": 6.329529584844878e-05, + "loss": 0.2728, + "step": 35490 + }, + { + "epoch": 1.5779881762012713, + "grad_norm": 0.1613931804895401, + "learning_rate": 6.327537042278777e-05, + "loss": 0.2741, + "step": 35500 + }, + { + "epoch": 1.5784326799128774, + "grad_norm": 0.17288784682750702, + "learning_rate": 6.325544272872905e-05, + "loss": 0.2707, + "step": 35510 + }, + { + "epoch": 1.5788771836244833, + "grad_norm": 0.21762464940547943, + "learning_rate": 6.323551276967771e-05, + "loss": 0.2702, + "step": 35520 + }, + { + "epoch": 1.5793216873360891, + "grad_norm": 0.1884748786687851, + "learning_rate": 6.321558054903922e-05, + "loss": 0.2718, + "step": 35530 + }, + { + "epoch": 1.5797661910476952, + "grad_norm": 0.194665789604187, + "learning_rate": 6.319564607021947e-05, + "loss": 0.2737, + "step": 35540 + }, + { + "epoch": 1.5802106947593013, + "grad_norm": 0.18341831862926483, + "learning_rate": 6.31757093366247e-05, + "loss": 0.273, + "step": 35550 + }, + { + "epoch": 1.5806551984709072, + "grad_norm": 0.18925155699253082, + "learning_rate": 6.315577035166154e-05, + "loss": 0.2736, + "step": 35560 + }, + { + "epoch": 1.5810997021825133, + "grad_norm": 0.20887154340744019, + "learning_rate": 6.313582911873708e-05, + "loss": 0.2733, + "step": 35570 + }, + { + "epoch": 1.5815442058941191, + "grad_norm": 0.1966438740491867, + "learning_rate": 6.311588564125865e-05, + "loss": 0.2738, + "step": 35580 + }, + { + "epoch": 1.5819887096057252, + "grad_norm": 0.1986958533525467, + "learning_rate": 6.30959399226341e-05, + "loss": 0.2727, + "step": 35590 + }, + { + "epoch": 1.5824332133173313, + "grad_norm": 0.17438887059688568, + "learning_rate": 6.30759919662716e-05, + "loss": 0.2733, + "step": 35600 + }, + { + "epoch": 1.5828777170289372, + "grad_norm": 0.16056686639785767, + "learning_rate": 6.30560417755797e-05, + "loss": 0.2747, + "step": 35610 + }, + { + "epoch": 1.583322220740543, + "grad_norm": 0.1421469897031784, + "learning_rate": 6.303608935396735e-05, + "loss": 0.2754, + "step": 35620 + }, + { + "epoch": 1.5837667244521492, + "grad_norm": 0.15716269612312317, + "learning_rate": 6.301613470484386e-05, + "loss": 0.2756, + "step": 35630 + }, + { + "epoch": 1.5842112281637553, + "grad_norm": 0.1658746600151062, + "learning_rate": 6.299617783161893e-05, + "loss": 0.2746, + "step": 35640 + }, + { + "epoch": 1.5846557318753611, + "grad_norm": 0.20010437071323395, + "learning_rate": 6.297621873770266e-05, + "loss": 0.274, + "step": 35650 + }, + { + "epoch": 1.585100235586967, + "grad_norm": 0.16103088855743408, + "learning_rate": 6.29562574265055e-05, + "loss": 0.2714, + "step": 35660 + }, + { + "epoch": 1.585544739298573, + "grad_norm": 0.18156416714191437, + "learning_rate": 6.293629390143834e-05, + "loss": 0.2719, + "step": 35670 + }, + { + "epoch": 1.5859892430101792, + "grad_norm": 0.1734149158000946, + "learning_rate": 6.291632816591232e-05, + "loss": 0.2744, + "step": 35680 + }, + { + "epoch": 1.5864337467217853, + "grad_norm": 0.17964942753314972, + "learning_rate": 6.28963602233391e-05, + "loss": 0.2728, + "step": 35690 + }, + { + "epoch": 1.5868782504333911, + "grad_norm": 0.1863485723733902, + "learning_rate": 6.287639007713062e-05, + "loss": 0.2735, + "step": 35700 + }, + { + "epoch": 1.587322754144997, + "grad_norm": 0.19849489629268646, + "learning_rate": 6.285641773069926e-05, + "loss": 0.2705, + "step": 35710 + }, + { + "epoch": 1.587767257856603, + "grad_norm": 0.20568382740020752, + "learning_rate": 6.283644318745773e-05, + "loss": 0.2735, + "step": 35720 + }, + { + "epoch": 1.5882117615682092, + "grad_norm": 0.21916361153125763, + "learning_rate": 6.281646645081912e-05, + "loss": 0.2744, + "step": 35730 + }, + { + "epoch": 1.588656265279815, + "grad_norm": 0.18518628180027008, + "learning_rate": 6.279648752419693e-05, + "loss": 0.272, + "step": 35740 + }, + { + "epoch": 1.589100768991421, + "grad_norm": 0.16341829299926758, + "learning_rate": 6.2776506411005e-05, + "loss": 0.2748, + "step": 35750 + }, + { + "epoch": 1.589545272703027, + "grad_norm": 0.1698753535747528, + "learning_rate": 6.275652311465758e-05, + "loss": 0.2729, + "step": 35760 + }, + { + "epoch": 1.5899897764146331, + "grad_norm": 0.20402942597866058, + "learning_rate": 6.273653763856926e-05, + "loss": 0.2726, + "step": 35770 + }, + { + "epoch": 1.5904342801262392, + "grad_norm": 0.1595955193042755, + "learning_rate": 6.271654998615501e-05, + "loss": 0.2735, + "step": 35780 + }, + { + "epoch": 1.590878783837845, + "grad_norm": 0.1861637830734253, + "learning_rate": 6.269656016083013e-05, + "loss": 0.2716, + "step": 35790 + }, + { + "epoch": 1.591323287549451, + "grad_norm": 0.2056669443845749, + "learning_rate": 6.267656816601038e-05, + "loss": 0.276, + "step": 35800 + }, + { + "epoch": 1.591767791261057, + "grad_norm": 0.18944720923900604, + "learning_rate": 6.265657400511185e-05, + "loss": 0.2707, + "step": 35810 + }, + { + "epoch": 1.5922122949726631, + "grad_norm": 0.17047585546970367, + "learning_rate": 6.263657768155098e-05, + "loss": 0.2713, + "step": 35820 + }, + { + "epoch": 1.592656798684269, + "grad_norm": 0.1688685417175293, + "learning_rate": 6.261657919874457e-05, + "loss": 0.2725, + "step": 35830 + }, + { + "epoch": 1.5931013023958749, + "grad_norm": 0.17079026997089386, + "learning_rate": 6.259657856010986e-05, + "loss": 0.2736, + "step": 35840 + }, + { + "epoch": 1.593545806107481, + "grad_norm": 0.18308112025260925, + "learning_rate": 6.257657576906439e-05, + "loss": 0.2797, + "step": 35850 + }, + { + "epoch": 1.593990309819087, + "grad_norm": 0.21194833517074585, + "learning_rate": 6.255657082902609e-05, + "loss": 0.2726, + "step": 35860 + }, + { + "epoch": 1.594434813530693, + "grad_norm": 0.195952907204628, + "learning_rate": 6.253656374341325e-05, + "loss": 0.2748, + "step": 35870 + }, + { + "epoch": 1.5948793172422988, + "grad_norm": 0.21490685641765594, + "learning_rate": 6.251655451564457e-05, + "loss": 0.2731, + "step": 35880 + }, + { + "epoch": 1.595323820953905, + "grad_norm": 0.1746848225593567, + "learning_rate": 6.249654314913902e-05, + "loss": 0.2713, + "step": 35890 + }, + { + "epoch": 1.595768324665511, + "grad_norm": 0.16716453433036804, + "learning_rate": 6.247652964731604e-05, + "loss": 0.2763, + "step": 35900 + }, + { + "epoch": 1.596212828377117, + "grad_norm": 0.19129648804664612, + "learning_rate": 6.245651401359537e-05, + "loss": 0.2733, + "step": 35910 + }, + { + "epoch": 1.596657332088723, + "grad_norm": 0.189659982919693, + "learning_rate": 6.243649625139715e-05, + "loss": 0.2726, + "step": 35920 + }, + { + "epoch": 1.5971018358003288, + "grad_norm": 0.21662046015262604, + "learning_rate": 6.241647636414185e-05, + "loss": 0.2723, + "step": 35930 + }, + { + "epoch": 1.597546339511935, + "grad_norm": 0.1877111792564392, + "learning_rate": 6.239645435525034e-05, + "loss": 0.2774, + "step": 35940 + }, + { + "epoch": 1.597990843223541, + "grad_norm": 0.19247159361839294, + "learning_rate": 6.237643022814381e-05, + "loss": 0.2709, + "step": 35950 + }, + { + "epoch": 1.5984353469351469, + "grad_norm": 0.20862708985805511, + "learning_rate": 6.235640398624386e-05, + "loss": 0.2741, + "step": 35960 + }, + { + "epoch": 1.5988798506467528, + "grad_norm": 0.20917603373527527, + "learning_rate": 6.233637563297243e-05, + "loss": 0.271, + "step": 35970 + }, + { + "epoch": 1.5993243543583588, + "grad_norm": 0.18749932944774628, + "learning_rate": 6.23163451717518e-05, + "loss": 0.2743, + "step": 35980 + }, + { + "epoch": 1.599768858069965, + "grad_norm": 0.19105860590934753, + "learning_rate": 6.229631260600463e-05, + "loss": 0.2716, + "step": 35990 + }, + { + "epoch": 1.600213361781571, + "grad_norm": 0.2211247831583023, + "learning_rate": 6.227627793915392e-05, + "loss": 0.272, + "step": 36000 + }, + { + "epoch": 1.600657865493177, + "grad_norm": 0.19004112482070923, + "learning_rate": 6.225624117462309e-05, + "loss": 0.2728, + "step": 36010 + }, + { + "epoch": 1.6011023692047828, + "grad_norm": 0.20967857539653778, + "learning_rate": 6.223620231583586e-05, + "loss": 0.2717, + "step": 36020 + }, + { + "epoch": 1.6015468729163889, + "grad_norm": 0.19819089770317078, + "learning_rate": 6.221616136621629e-05, + "loss": 0.2708, + "step": 36030 + }, + { + "epoch": 1.601991376627995, + "grad_norm": 0.18764415383338928, + "learning_rate": 6.219611832918887e-05, + "loss": 0.2714, + "step": 36040 + }, + { + "epoch": 1.6024358803396008, + "grad_norm": 0.19970957934856415, + "learning_rate": 6.217607320817838e-05, + "loss": 0.2713, + "step": 36050 + }, + { + "epoch": 1.6028803840512067, + "grad_norm": 0.20241610705852509, + "learning_rate": 6.215602600661001e-05, + "loss": 0.2742, + "step": 36060 + }, + { + "epoch": 1.6033248877628128, + "grad_norm": 0.1937139481306076, + "learning_rate": 6.213597672790925e-05, + "loss": 0.2737, + "step": 36070 + }, + { + "epoch": 1.6037693914744189, + "grad_norm": 0.1796075999736786, + "learning_rate": 6.2115925375502e-05, + "loss": 0.272, + "step": 36080 + }, + { + "epoch": 1.6042138951860248, + "grad_norm": 0.2193130999803543, + "learning_rate": 6.209587195281447e-05, + "loss": 0.2744, + "step": 36090 + }, + { + "epoch": 1.6046583988976308, + "grad_norm": 0.20481784641742706, + "learning_rate": 6.207581646327324e-05, + "loss": 0.2737, + "step": 36100 + }, + { + "epoch": 1.6051029026092367, + "grad_norm": 0.21536493301391602, + "learning_rate": 6.205575891030526e-05, + "loss": 0.2753, + "step": 36110 + }, + { + "epoch": 1.6055474063208428, + "grad_norm": 0.16847865283489227, + "learning_rate": 6.203569929733781e-05, + "loss": 0.275, + "step": 36120 + }, + { + "epoch": 1.605991910032449, + "grad_norm": 0.19514857232570648, + "learning_rate": 6.201563762779852e-05, + "loss": 0.2767, + "step": 36130 + }, + { + "epoch": 1.6064364137440548, + "grad_norm": 0.1984069049358368, + "learning_rate": 6.199557390511538e-05, + "loss": 0.2724, + "step": 36140 + }, + { + "epoch": 1.6068809174556606, + "grad_norm": 0.17558176815509796, + "learning_rate": 6.197550813271675e-05, + "loss": 0.2736, + "step": 36150 + }, + { + "epoch": 1.6073254211672667, + "grad_norm": 0.2018265426158905, + "learning_rate": 6.195544031403131e-05, + "loss": 0.2713, + "step": 36160 + }, + { + "epoch": 1.6077699248788728, + "grad_norm": 0.18385744094848633, + "learning_rate": 6.19353704524881e-05, + "loss": 0.2715, + "step": 36170 + }, + { + "epoch": 1.6082144285904787, + "grad_norm": 0.17079541087150574, + "learning_rate": 6.191529855151652e-05, + "loss": 0.2747, + "step": 36180 + }, + { + "epoch": 1.6086589323020846, + "grad_norm": 0.19398416578769684, + "learning_rate": 6.189522461454629e-05, + "loss": 0.2723, + "step": 36190 + }, + { + "epoch": 1.6091034360136907, + "grad_norm": 0.1847253441810608, + "learning_rate": 6.187514864500752e-05, + "loss": 0.2725, + "step": 36200 + }, + { + "epoch": 1.6095479397252968, + "grad_norm": 0.17903926968574524, + "learning_rate": 6.185507064633062e-05, + "loss": 0.2744, + "step": 36210 + }, + { + "epoch": 1.6099924434369028, + "grad_norm": 0.1504022181034088, + "learning_rate": 6.18349906219464e-05, + "loss": 0.2719, + "step": 36220 + }, + { + "epoch": 1.6104369471485087, + "grad_norm": 0.15358814597129822, + "learning_rate": 6.181490857528596e-05, + "loss": 0.2691, + "step": 36230 + }, + { + "epoch": 1.6108814508601146, + "grad_norm": 0.1799965500831604, + "learning_rate": 6.179482450978077e-05, + "loss": 0.2718, + "step": 36240 + }, + { + "epoch": 1.6113259545717207, + "grad_norm": 0.18837939202785492, + "learning_rate": 6.177473842886269e-05, + "loss": 0.2726, + "step": 36250 + }, + { + "epoch": 1.6117704582833268, + "grad_norm": 0.17468783259391785, + "learning_rate": 6.175465033596382e-05, + "loss": 0.2726, + "step": 36260 + }, + { + "epoch": 1.6122149619949326, + "grad_norm": 0.2128366231918335, + "learning_rate": 6.173456023451671e-05, + "loss": 0.2733, + "step": 36270 + }, + { + "epoch": 1.6126594657065385, + "grad_norm": 0.19006125628948212, + "learning_rate": 6.171446812795422e-05, + "loss": 0.2749, + "step": 36280 + }, + { + "epoch": 1.6131039694181446, + "grad_norm": 0.2078080028295517, + "learning_rate": 6.169437401970949e-05, + "loss": 0.2725, + "step": 36290 + }, + { + "epoch": 1.6135484731297507, + "grad_norm": 0.1957048624753952, + "learning_rate": 6.16742779132161e-05, + "loss": 0.2702, + "step": 36300 + }, + { + "epoch": 1.6139929768413568, + "grad_norm": 0.18097719550132751, + "learning_rate": 6.165417981190789e-05, + "loss": 0.272, + "step": 36310 + }, + { + "epoch": 1.6144374805529627, + "grad_norm": 0.19651757180690765, + "learning_rate": 6.16340797192191e-05, + "loss": 0.2741, + "step": 36320 + }, + { + "epoch": 1.6148819842645685, + "grad_norm": 0.17475250363349915, + "learning_rate": 6.161397763858427e-05, + "loss": 0.2752, + "step": 36330 + }, + { + "epoch": 1.6153264879761746, + "grad_norm": 0.16702309250831604, + "learning_rate": 6.159387357343834e-05, + "loss": 0.2728, + "step": 36340 + }, + { + "epoch": 1.6157709916877807, + "grad_norm": 0.17857635021209717, + "learning_rate": 6.157376752721648e-05, + "loss": 0.2719, + "step": 36350 + }, + { + "epoch": 1.6162154953993866, + "grad_norm": 0.19172194600105286, + "learning_rate": 6.155365950335428e-05, + "loss": 0.2734, + "step": 36360 + }, + { + "epoch": 1.6166599991109925, + "grad_norm": 0.18968777358531952, + "learning_rate": 6.153354950528768e-05, + "loss": 0.2715, + "step": 36370 + }, + { + "epoch": 1.6171045028225985, + "grad_norm": 0.18112584948539734, + "learning_rate": 6.151343753645293e-05, + "loss": 0.2734, + "step": 36380 + }, + { + "epoch": 1.6175490065342046, + "grad_norm": 0.1811489462852478, + "learning_rate": 6.149332360028657e-05, + "loss": 0.2707, + "step": 36390 + }, + { + "epoch": 1.6179935102458105, + "grad_norm": 0.20996378362178802, + "learning_rate": 6.147320770022555e-05, + "loss": 0.274, + "step": 36400 + }, + { + "epoch": 1.6184380139574166, + "grad_norm": 0.20555834472179413, + "learning_rate": 6.145308983970715e-05, + "loss": 0.276, + "step": 36410 + }, + { + "epoch": 1.6188825176690225, + "grad_norm": 0.1818380057811737, + "learning_rate": 6.143297002216892e-05, + "loss": 0.2717, + "step": 36420 + }, + { + "epoch": 1.6193270213806286, + "grad_norm": 0.2027517855167389, + "learning_rate": 6.141284825104882e-05, + "loss": 0.2755, + "step": 36430 + }, + { + "epoch": 1.6197715250922347, + "grad_norm": 0.18895547091960907, + "learning_rate": 6.13927245297851e-05, + "loss": 0.2741, + "step": 36440 + }, + { + "epoch": 1.6202160288038405, + "grad_norm": 0.17359377443790436, + "learning_rate": 6.137259886181633e-05, + "loss": 0.2743, + "step": 36450 + }, + { + "epoch": 1.6206605325154464, + "grad_norm": 0.21762323379516602, + "learning_rate": 6.135247125058145e-05, + "loss": 0.2725, + "step": 36460 + }, + { + "epoch": 1.6211050362270525, + "grad_norm": 0.1885450929403305, + "learning_rate": 6.133234169951974e-05, + "loss": 0.2743, + "step": 36470 + }, + { + "epoch": 1.6215495399386586, + "grad_norm": 0.19055888056755066, + "learning_rate": 6.131221021207078e-05, + "loss": 0.2734, + "step": 36480 + }, + { + "epoch": 1.6219940436502644, + "grad_norm": 0.20171663165092468, + "learning_rate": 6.129207679167448e-05, + "loss": 0.2732, + "step": 36490 + }, + { + "epoch": 1.6224385473618703, + "grad_norm": 0.17799989879131317, + "learning_rate": 6.127194144177109e-05, + "loss": 0.2723, + "step": 36500 + }, + { + "epoch": 1.6228830510734764, + "grad_norm": 0.19944514334201813, + "learning_rate": 6.125180416580118e-05, + "loss": 0.2718, + "step": 36510 + }, + { + "epoch": 1.6233275547850825, + "grad_norm": 0.17759284377098083, + "learning_rate": 6.123166496720571e-05, + "loss": 0.2708, + "step": 36520 + }, + { + "epoch": 1.6237720584966886, + "grad_norm": 0.18808822333812714, + "learning_rate": 6.121152384942588e-05, + "loss": 0.2723, + "step": 36530 + }, + { + "epoch": 1.6242165622082945, + "grad_norm": 0.2240743190050125, + "learning_rate": 6.119138081590324e-05, + "loss": 0.2752, + "step": 36540 + }, + { + "epoch": 1.6246610659199003, + "grad_norm": 0.234266459941864, + "learning_rate": 6.117123587007971e-05, + "loss": 0.2747, + "step": 36550 + }, + { + "epoch": 1.6251055696315064, + "grad_norm": 0.2006576657295227, + "learning_rate": 6.11510890153975e-05, + "loss": 0.2738, + "step": 36560 + }, + { + "epoch": 1.6255500733431125, + "grad_norm": 0.18541337549686432, + "learning_rate": 6.113094025529916e-05, + "loss": 0.2716, + "step": 36570 + }, + { + "epoch": 1.6259945770547184, + "grad_norm": 0.19427144527435303, + "learning_rate": 6.111078959322757e-05, + "loss": 0.2719, + "step": 36580 + }, + { + "epoch": 1.6264390807663243, + "grad_norm": 0.16812795400619507, + "learning_rate": 6.109063703262592e-05, + "loss": 0.2714, + "step": 36590 + }, + { + "epoch": 1.6268835844779304, + "grad_norm": 0.20084147155284882, + "learning_rate": 6.107048257693772e-05, + "loss": 0.2729, + "step": 36600 + }, + { + "epoch": 1.6273280881895364, + "grad_norm": 0.17922158539295197, + "learning_rate": 6.105032622960683e-05, + "loss": 0.2718, + "step": 36610 + }, + { + "epoch": 1.6277725919011425, + "grad_norm": 0.16510888934135437, + "learning_rate": 6.103016799407743e-05, + "loss": 0.2721, + "step": 36620 + }, + { + "epoch": 1.6282170956127484, + "grad_norm": 0.17182500660419464, + "learning_rate": 6.1010007873793984e-05, + "loss": 0.2717, + "step": 36630 + }, + { + "epoch": 1.6286615993243543, + "grad_norm": 0.17164170742034912, + "learning_rate": 6.098984587220131e-05, + "loss": 0.2723, + "step": 36640 + }, + { + "epoch": 1.6291061030359604, + "grad_norm": 0.17588217556476593, + "learning_rate": 6.096968199274456e-05, + "loss": 0.2745, + "step": 36650 + }, + { + "epoch": 1.6295506067475665, + "grad_norm": 0.19806408882141113, + "learning_rate": 6.0949516238869166e-05, + "loss": 0.2753, + "step": 36660 + }, + { + "epoch": 1.6299951104591723, + "grad_norm": 0.19028058648109436, + "learning_rate": 6.092934861402092e-05, + "loss": 0.273, + "step": 36670 + }, + { + "epoch": 1.6304396141707782, + "grad_norm": 0.18201155960559845, + "learning_rate": 6.0909179121645924e-05, + "loss": 0.273, + "step": 36680 + }, + { + "epoch": 1.6308841178823843, + "grad_norm": 0.16975320875644684, + "learning_rate": 6.0889007765190576e-05, + "loss": 0.2727, + "step": 36690 + }, + { + "epoch": 1.6313286215939904, + "grad_norm": 0.19974347949028015, + "learning_rate": 6.086883454810162e-05, + "loss": 0.2712, + "step": 36700 + }, + { + "epoch": 1.6317731253055963, + "grad_norm": 0.1805645078420639, + "learning_rate": 6.0848659473826084e-05, + "loss": 0.2728, + "step": 36710 + }, + { + "epoch": 1.6322176290172021, + "grad_norm": 0.15758028626441956, + "learning_rate": 6.082848254581138e-05, + "loss": 0.2696, + "step": 36720 + }, + { + "epoch": 1.6326621327288082, + "grad_norm": 0.222767636179924, + "learning_rate": 6.080830376750517e-05, + "loss": 0.2705, + "step": 36730 + }, + { + "epoch": 1.6331066364404143, + "grad_norm": 0.2057676613330841, + "learning_rate": 6.0788123142355445e-05, + "loss": 0.2735, + "step": 36740 + }, + { + "epoch": 1.6335511401520204, + "grad_norm": 0.17827756702899933, + "learning_rate": 6.076794067381052e-05, + "loss": 0.2746, + "step": 36750 + }, + { + "epoch": 1.6339956438636263, + "grad_norm": 0.2118443250656128, + "learning_rate": 6.074775636531905e-05, + "loss": 0.2691, + "step": 36760 + }, + { + "epoch": 1.6344401475752321, + "grad_norm": 0.16333124041557312, + "learning_rate": 6.072757022032997e-05, + "loss": 0.2731, + "step": 36770 + }, + { + "epoch": 1.6348846512868382, + "grad_norm": 0.18623778223991394, + "learning_rate": 6.070738224229253e-05, + "loss": 0.2732, + "step": 36780 + }, + { + "epoch": 1.6353291549984443, + "grad_norm": 0.17987388372421265, + "learning_rate": 6.0687192434656314e-05, + "loss": 0.2732, + "step": 36790 + }, + { + "epoch": 1.6357736587100502, + "grad_norm": 0.16977857053279877, + "learning_rate": 6.066700080087121e-05, + "loss": 0.2728, + "step": 36800 + }, + { + "epoch": 1.636218162421656, + "grad_norm": 0.1425679475069046, + "learning_rate": 6.0646807344387424e-05, + "loss": 0.2758, + "step": 36810 + }, + { + "epoch": 1.6366626661332622, + "grad_norm": 0.18605726957321167, + "learning_rate": 6.062661206865543e-05, + "loss": 0.273, + "step": 36820 + }, + { + "epoch": 1.6371071698448683, + "grad_norm": 0.20058505237102509, + "learning_rate": 6.06064149771261e-05, + "loss": 0.2698, + "step": 36830 + }, + { + "epoch": 1.6375516735564744, + "grad_norm": 0.1722363829612732, + "learning_rate": 6.058621607325051e-05, + "loss": 0.2726, + "step": 36840 + }, + { + "epoch": 1.6379961772680802, + "grad_norm": 0.1963963806629181, + "learning_rate": 6.056601536048014e-05, + "loss": 0.2737, + "step": 36850 + }, + { + "epoch": 1.638440680979686, + "grad_norm": 0.22676414251327515, + "learning_rate": 6.0545812842266725e-05, + "loss": 0.274, + "step": 36860 + }, + { + "epoch": 1.6388851846912922, + "grad_norm": 0.16064786911010742, + "learning_rate": 6.052560852206232e-05, + "loss": 0.2693, + "step": 36870 + }, + { + "epoch": 1.6393296884028983, + "grad_norm": 0.14513401687145233, + "learning_rate": 6.05054024033193e-05, + "loss": 0.2717, + "step": 36880 + }, + { + "epoch": 1.6397741921145041, + "grad_norm": 0.19125410914421082, + "learning_rate": 6.048519448949032e-05, + "loss": 0.274, + "step": 36890 + }, + { + "epoch": 1.64021869582611, + "grad_norm": 0.20526550710201263, + "learning_rate": 6.046498478402839e-05, + "loss": 0.2723, + "step": 36900 + }, + { + "epoch": 1.640663199537716, + "grad_norm": 0.17118553817272186, + "learning_rate": 6.044477329038677e-05, + "loss": 0.2725, + "step": 36910 + }, + { + "epoch": 1.6411077032493222, + "grad_norm": 0.1878269612789154, + "learning_rate": 6.042456001201906e-05, + "loss": 0.272, + "step": 36920 + }, + { + "epoch": 1.641552206960928, + "grad_norm": 0.1863107681274414, + "learning_rate": 6.040434495237917e-05, + "loss": 0.2731, + "step": 36930 + }, + { + "epoch": 1.6419967106725342, + "grad_norm": 0.2160511463880539, + "learning_rate": 6.0384128114921256e-05, + "loss": 0.2738, + "step": 36940 + }, + { + "epoch": 1.64244121438414, + "grad_norm": 0.22798582911491394, + "learning_rate": 6.036390950309987e-05, + "loss": 0.2722, + "step": 36950 + }, + { + "epoch": 1.6428857180957461, + "grad_norm": 0.20711484551429749, + "learning_rate": 6.0343689120369805e-05, + "loss": 0.271, + "step": 36960 + }, + { + "epoch": 1.6433302218073522, + "grad_norm": 0.1812049001455307, + "learning_rate": 6.032346697018616e-05, + "loss": 0.2709, + "step": 36970 + }, + { + "epoch": 1.643774725518958, + "grad_norm": 0.19427739083766937, + "learning_rate": 6.0303243056004375e-05, + "loss": 0.2708, + "step": 36980 + }, + { + "epoch": 1.644219229230564, + "grad_norm": 0.19371497631072998, + "learning_rate": 6.0283017381280136e-05, + "loss": 0.275, + "step": 36990 + }, + { + "epoch": 1.64466373294217, + "grad_norm": 0.15367721021175385, + "learning_rate": 6.026278994946948e-05, + "loss": 0.2704, + "step": 37000 + }, + { + "epoch": 1.6451082366537761, + "grad_norm": 0.1820140928030014, + "learning_rate": 6.02425607640287e-05, + "loss": 0.2714, + "step": 37010 + }, + { + "epoch": 1.645552740365382, + "grad_norm": 0.15976612269878387, + "learning_rate": 6.022232982841441e-05, + "loss": 0.2707, + "step": 37020 + }, + { + "epoch": 1.6459972440769879, + "grad_norm": 0.16923528909683228, + "learning_rate": 6.020209714608355e-05, + "loss": 0.2741, + "step": 37030 + }, + { + "epoch": 1.646441747788594, + "grad_norm": 0.16888760030269623, + "learning_rate": 6.018186272049331e-05, + "loss": 0.274, + "step": 37040 + }, + { + "epoch": 1.6468862515002, + "grad_norm": 0.17968451976776123, + "learning_rate": 6.01616265551012e-05, + "loss": 0.2735, + "step": 37050 + }, + { + "epoch": 1.6473307552118062, + "grad_norm": 0.18508858978748322, + "learning_rate": 6.014138865336503e-05, + "loss": 0.2723, + "step": 37060 + }, + { + "epoch": 1.647775258923412, + "grad_norm": 0.16607818007469177, + "learning_rate": 6.0121149018742905e-05, + "loss": 0.2734, + "step": 37070 + }, + { + "epoch": 1.648219762635018, + "grad_norm": 0.1726800948381424, + "learning_rate": 6.010090765469325e-05, + "loss": 0.2745, + "step": 37080 + }, + { + "epoch": 1.648664266346624, + "grad_norm": 0.23285000026226044, + "learning_rate": 6.008066456467473e-05, + "loss": 0.2737, + "step": 37090 + }, + { + "epoch": 1.64910877005823, + "grad_norm": 0.2228371798992157, + "learning_rate": 6.0060419752146335e-05, + "loss": 0.2736, + "step": 37100 + }, + { + "epoch": 1.649553273769836, + "grad_norm": 0.21786463260650635, + "learning_rate": 6.0040173220567353e-05, + "loss": 0.2705, + "step": 37110 + }, + { + "epoch": 1.6499977774814418, + "grad_norm": 0.22025136649608612, + "learning_rate": 6.001992497339737e-05, + "loss": 0.2712, + "step": 37120 + }, + { + "epoch": 1.650442281193048, + "grad_norm": 0.15450231730937958, + "learning_rate": 5.999967501409626e-05, + "loss": 0.274, + "step": 37130 + }, + { + "epoch": 1.650886784904654, + "grad_norm": 0.19266358017921448, + "learning_rate": 5.997942334612418e-05, + "loss": 0.2753, + "step": 37140 + }, + { + "epoch": 1.65133128861626, + "grad_norm": 0.17511260509490967, + "learning_rate": 5.995916997294158e-05, + "loss": 0.2727, + "step": 37150 + }, + { + "epoch": 1.651775792327866, + "grad_norm": 0.15349449217319489, + "learning_rate": 5.9938914898009214e-05, + "loss": 0.2726, + "step": 37160 + }, + { + "epoch": 1.6522202960394718, + "grad_norm": 0.1872551143169403, + "learning_rate": 5.991865812478813e-05, + "loss": 0.2742, + "step": 37170 + }, + { + "epoch": 1.652664799751078, + "grad_norm": 0.17932872474193573, + "learning_rate": 5.989839965673964e-05, + "loss": 0.2726, + "step": 37180 + }, + { + "epoch": 1.653109303462684, + "grad_norm": 0.17251485586166382, + "learning_rate": 5.987813949732539e-05, + "loss": 0.273, + "step": 37190 + }, + { + "epoch": 1.65355380717429, + "grad_norm": 0.19700740277767181, + "learning_rate": 5.9857877650007255e-05, + "loss": 0.2737, + "step": 37200 + }, + { + "epoch": 1.6539983108858958, + "grad_norm": 0.19179430603981018, + "learning_rate": 5.983761411824744e-05, + "loss": 0.2713, + "step": 37210 + }, + { + "epoch": 1.6544428145975019, + "grad_norm": 0.2074359804391861, + "learning_rate": 5.981734890550844e-05, + "loss": 0.2744, + "step": 37220 + }, + { + "epoch": 1.654887318309108, + "grad_norm": 0.22995665669441223, + "learning_rate": 5.979708201525301e-05, + "loss": 0.2769, + "step": 37230 + }, + { + "epoch": 1.6553318220207138, + "grad_norm": 0.16160140931606293, + "learning_rate": 5.977681345094422e-05, + "loss": 0.2715, + "step": 37240 + }, + { + "epoch": 1.65577632573232, + "grad_norm": 0.187421977519989, + "learning_rate": 5.97565432160454e-05, + "loss": 0.2738, + "step": 37250 + }, + { + "epoch": 1.6562208294439258, + "grad_norm": 0.23240628838539124, + "learning_rate": 5.9736271314020186e-05, + "loss": 0.2732, + "step": 37260 + }, + { + "epoch": 1.6566653331555319, + "grad_norm": 0.16059304773807526, + "learning_rate": 5.971599774833251e-05, + "loss": 0.2738, + "step": 37270 + }, + { + "epoch": 1.657109836867138, + "grad_norm": 0.19479450583457947, + "learning_rate": 5.9695722522446525e-05, + "loss": 0.2728, + "step": 37280 + }, + { + "epoch": 1.6575543405787438, + "grad_norm": 0.18236735463142395, + "learning_rate": 5.9675445639826765e-05, + "loss": 0.2723, + "step": 37290 + }, + { + "epoch": 1.6579988442903497, + "grad_norm": 0.16632212698459625, + "learning_rate": 5.965516710393796e-05, + "loss": 0.2719, + "step": 37300 + }, + { + "epoch": 1.6584433480019558, + "grad_norm": 0.21214696764945984, + "learning_rate": 5.963488691824516e-05, + "loss": 0.2708, + "step": 37310 + }, + { + "epoch": 1.658887851713562, + "grad_norm": 0.15852706134319305, + "learning_rate": 5.96146050862137e-05, + "loss": 0.2722, + "step": 37320 + }, + { + "epoch": 1.6593323554251678, + "grad_norm": 0.17106792330741882, + "learning_rate": 5.959432161130919e-05, + "loss": 0.2709, + "step": 37330 + }, + { + "epoch": 1.6597768591367736, + "grad_norm": 0.14483758807182312, + "learning_rate": 5.9574036496997545e-05, + "loss": 0.271, + "step": 37340 + }, + { + "epoch": 1.6602213628483797, + "grad_norm": 0.19706648588180542, + "learning_rate": 5.955374974674488e-05, + "loss": 0.2758, + "step": 37350 + }, + { + "epoch": 1.6606658665599858, + "grad_norm": 0.1932980865240097, + "learning_rate": 5.9533461364017696e-05, + "loss": 0.2708, + "step": 37360 + }, + { + "epoch": 1.661110370271592, + "grad_norm": 0.16503892838954926, + "learning_rate": 5.9513171352282716e-05, + "loss": 0.2719, + "step": 37370 + }, + { + "epoch": 1.6615548739831978, + "grad_norm": 0.17955490946769714, + "learning_rate": 5.949287971500692e-05, + "loss": 0.2751, + "step": 37380 + }, + { + "epoch": 1.6619993776948037, + "grad_norm": 0.18495552241802216, + "learning_rate": 5.947258645565762e-05, + "loss": 0.2684, + "step": 37390 + }, + { + "epoch": 1.6624438814064098, + "grad_norm": 0.22321337461471558, + "learning_rate": 5.945229157770237e-05, + "loss": 0.2744, + "step": 37400 + }, + { + "epoch": 1.6628883851180158, + "grad_norm": 0.1816638708114624, + "learning_rate": 5.9431995084609006e-05, + "loss": 0.2717, + "step": 37410 + }, + { + "epoch": 1.6633328888296217, + "grad_norm": 0.18745195865631104, + "learning_rate": 5.941169697984564e-05, + "loss": 0.271, + "step": 37420 + }, + { + "epoch": 1.6637773925412276, + "grad_norm": 0.15878330171108246, + "learning_rate": 5.9391397266880675e-05, + "loss": 0.2708, + "step": 37430 + }, + { + "epoch": 1.6642218962528337, + "grad_norm": 0.17512080073356628, + "learning_rate": 5.937109594918279e-05, + "loss": 0.27, + "step": 37440 + }, + { + "epoch": 1.6646663999644398, + "grad_norm": 0.20742443203926086, + "learning_rate": 5.9350793030220884e-05, + "loss": 0.2741, + "step": 37450 + }, + { + "epoch": 1.6651109036760459, + "grad_norm": 0.21391713619232178, + "learning_rate": 5.933048851346421e-05, + "loss": 0.271, + "step": 37460 + }, + { + "epoch": 1.6655554073876517, + "grad_norm": 0.19171030819416046, + "learning_rate": 5.931018240238224e-05, + "loss": 0.2754, + "step": 37470 + }, + { + "epoch": 1.6659999110992576, + "grad_norm": 0.2079736590385437, + "learning_rate": 5.928987470044471e-05, + "loss": 0.2712, + "step": 37480 + }, + { + "epoch": 1.6664444148108637, + "grad_norm": 0.16805075109004974, + "learning_rate": 5.9269565411121695e-05, + "loss": 0.2715, + "step": 37490 + }, + { + "epoch": 1.6668889185224698, + "grad_norm": 0.1772446185350418, + "learning_rate": 5.924925453788347e-05, + "loss": 0.2725, + "step": 37500 + }, + { + "epoch": 1.6673334222340757, + "grad_norm": 0.17678183317184448, + "learning_rate": 5.92289420842006e-05, + "loss": 0.2715, + "step": 37510 + }, + { + "epoch": 1.6677779259456815, + "grad_norm": 0.17005151510238647, + "learning_rate": 5.9208628053543945e-05, + "loss": 0.2708, + "step": 37520 + }, + { + "epoch": 1.6682224296572876, + "grad_norm": 0.2253073900938034, + "learning_rate": 5.918831244938462e-05, + "loss": 0.2725, + "step": 37530 + }, + { + "epoch": 1.6686669333688937, + "grad_norm": 0.18886059522628784, + "learning_rate": 5.9167995275194e-05, + "loss": 0.2704, + "step": 37540 + }, + { + "epoch": 1.6691114370804996, + "grad_norm": 0.20082859694957733, + "learning_rate": 5.914767653444373e-05, + "loss": 0.2755, + "step": 37550 + }, + { + "epoch": 1.6695559407921055, + "grad_norm": 0.17076003551483154, + "learning_rate": 5.912735623060572e-05, + "loss": 0.2709, + "step": 37560 + }, + { + "epoch": 1.6700004445037115, + "grad_norm": 0.1758895367383957, + "learning_rate": 5.910703436715217e-05, + "loss": 0.2735, + "step": 37570 + }, + { + "epoch": 1.6704449482153176, + "grad_norm": 0.1693122386932373, + "learning_rate": 5.908671094755552e-05, + "loss": 0.2726, + "step": 37580 + }, + { + "epoch": 1.6708894519269237, + "grad_norm": 0.17325125634670258, + "learning_rate": 5.906638597528851e-05, + "loss": 0.2681, + "step": 37590 + }, + { + "epoch": 1.6713339556385296, + "grad_norm": 0.1598874032497406, + "learning_rate": 5.9046059453824076e-05, + "loss": 0.269, + "step": 37600 + }, + { + "epoch": 1.6717784593501355, + "grad_norm": 0.15811869502067566, + "learning_rate": 5.9025731386635505e-05, + "loss": 0.2746, + "step": 37610 + }, + { + "epoch": 1.6722229630617416, + "grad_norm": 0.2065144181251526, + "learning_rate": 5.900540177719629e-05, + "loss": 0.2702, + "step": 37620 + }, + { + "epoch": 1.6726674667733477, + "grad_norm": 0.15652485191822052, + "learning_rate": 5.898507062898021e-05, + "loss": 0.2693, + "step": 37630 + }, + { + "epoch": 1.6731119704849535, + "grad_norm": 0.17757095396518707, + "learning_rate": 5.8964737945461316e-05, + "loss": 0.2752, + "step": 37640 + }, + { + "epoch": 1.6735564741965594, + "grad_norm": 0.1679610162973404, + "learning_rate": 5.8944403730113885e-05, + "loss": 0.2729, + "step": 37650 + }, + { + "epoch": 1.6740009779081655, + "grad_norm": 0.21665461361408234, + "learning_rate": 5.892406798641248e-05, + "loss": 0.2714, + "step": 37660 + }, + { + "epoch": 1.6744454816197716, + "grad_norm": 0.2038518786430359, + "learning_rate": 5.890373071783193e-05, + "loss": 0.2744, + "step": 37670 + }, + { + "epoch": 1.6748899853313777, + "grad_norm": 0.2038002759218216, + "learning_rate": 5.888339192784732e-05, + "loss": 0.2727, + "step": 37680 + }, + { + "epoch": 1.6753344890429835, + "grad_norm": 0.17463938891887665, + "learning_rate": 5.8863051619934003e-05, + "loss": 0.2718, + "step": 37690 + }, + { + "epoch": 1.6757789927545894, + "grad_norm": 0.16424426436424255, + "learning_rate": 5.8842709797567554e-05, + "loss": 0.2742, + "step": 37700 + }, + { + "epoch": 1.6762234964661955, + "grad_norm": 0.19593995809555054, + "learning_rate": 5.8822366464223855e-05, + "loss": 0.2716, + "step": 37710 + }, + { + "epoch": 1.6766680001778016, + "grad_norm": 0.2003490924835205, + "learning_rate": 5.880202162337901e-05, + "loss": 0.2721, + "step": 37720 + }, + { + "epoch": 1.6771125038894075, + "grad_norm": 0.1796686202287674, + "learning_rate": 5.8781675278509405e-05, + "loss": 0.2742, + "step": 37730 + }, + { + "epoch": 1.6775570076010133, + "grad_norm": 0.16420294344425201, + "learning_rate": 5.8761327433091696e-05, + "loss": 0.2726, + "step": 37740 + }, + { + "epoch": 1.6780015113126194, + "grad_norm": 0.1944759041070938, + "learning_rate": 5.874097809060275e-05, + "loss": 0.2716, + "step": 37750 + }, + { + "epoch": 1.6784460150242255, + "grad_norm": 0.1550225019454956, + "learning_rate": 5.87206272545197e-05, + "loss": 0.2714, + "step": 37760 + }, + { + "epoch": 1.6788905187358314, + "grad_norm": 0.14266985654830933, + "learning_rate": 5.8700274928319955e-05, + "loss": 0.2709, + "step": 37770 + }, + { + "epoch": 1.6793350224474375, + "grad_norm": 0.1892634928226471, + "learning_rate": 5.867992111548118e-05, + "loss": 0.2709, + "step": 37780 + }, + { + "epoch": 1.6797795261590434, + "grad_norm": 0.19170527160167694, + "learning_rate": 5.865956581948131e-05, + "loss": 0.2718, + "step": 37790 + }, + { + "epoch": 1.6802240298706494, + "grad_norm": 0.17632389068603516, + "learning_rate": 5.863920904379845e-05, + "loss": 0.2726, + "step": 37800 + }, + { + "epoch": 1.6806685335822555, + "grad_norm": 0.1758642941713333, + "learning_rate": 5.8618850791911064e-05, + "loss": 0.2752, + "step": 37810 + }, + { + "epoch": 1.6811130372938614, + "grad_norm": 0.18272735178470612, + "learning_rate": 5.859849106729779e-05, + "loss": 0.2747, + "step": 37820 + }, + { + "epoch": 1.6815575410054673, + "grad_norm": 0.16781732439994812, + "learning_rate": 5.857812987343758e-05, + "loss": 0.2733, + "step": 37830 + }, + { + "epoch": 1.6820020447170734, + "grad_norm": 0.18059222400188446, + "learning_rate": 5.855776721380957e-05, + "loss": 0.2716, + "step": 37840 + }, + { + "epoch": 1.6824465484286795, + "grad_norm": 0.18205392360687256, + "learning_rate": 5.8537403091893217e-05, + "loss": 0.2716, + "step": 37850 + }, + { + "epoch": 1.6828910521402853, + "grad_norm": 0.19454334676265717, + "learning_rate": 5.851703751116816e-05, + "loss": 0.27, + "step": 37860 + }, + { + "epoch": 1.6833355558518912, + "grad_norm": 0.19733692705631256, + "learning_rate": 5.8496670475114336e-05, + "loss": 0.2713, + "step": 37870 + }, + { + "epoch": 1.6837800595634973, + "grad_norm": 0.1863328069448471, + "learning_rate": 5.84763019872119e-05, + "loss": 0.272, + "step": 37880 + }, + { + "epoch": 1.6842245632751034, + "grad_norm": 0.18049342930316925, + "learning_rate": 5.845593205094131e-05, + "loss": 0.272, + "step": 37890 + }, + { + "epoch": 1.6846690669867095, + "grad_norm": 0.1919964700937271, + "learning_rate": 5.843556066978318e-05, + "loss": 0.2751, + "step": 37900 + }, + { + "epoch": 1.6851135706983154, + "grad_norm": 0.17439888417720795, + "learning_rate": 5.8415187847218455e-05, + "loss": 0.2715, + "step": 37910 + }, + { + "epoch": 1.6855580744099212, + "grad_norm": 0.16642531752586365, + "learning_rate": 5.839481358672827e-05, + "loss": 0.2708, + "step": 37920 + }, + { + "epoch": 1.6860025781215273, + "grad_norm": 0.16049028933048248, + "learning_rate": 5.837443789179407e-05, + "loss": 0.2729, + "step": 37930 + }, + { + "epoch": 1.6864470818331334, + "grad_norm": 0.17986537516117096, + "learning_rate": 5.8354060765897445e-05, + "loss": 0.2716, + "step": 37940 + }, + { + "epoch": 1.6868915855447393, + "grad_norm": 0.1827145367860794, + "learning_rate": 5.8333682212520334e-05, + "loss": 0.2727, + "step": 37950 + }, + { + "epoch": 1.6873360892563452, + "grad_norm": 0.18626070022583008, + "learning_rate": 5.831330223514486e-05, + "loss": 0.2758, + "step": 37960 + }, + { + "epoch": 1.6877805929679512, + "grad_norm": 0.19283327460289001, + "learning_rate": 5.8292920837253396e-05, + "loss": 0.2712, + "step": 37970 + }, + { + "epoch": 1.6882250966795573, + "grad_norm": 0.19249173998832703, + "learning_rate": 5.827253802232857e-05, + "loss": 0.2719, + "step": 37980 + }, + { + "epoch": 1.6886696003911634, + "grad_norm": 0.19505684077739716, + "learning_rate": 5.825215379385327e-05, + "loss": 0.2723, + "step": 37990 + }, + { + "epoch": 1.6891141041027693, + "grad_norm": 0.18178710341453552, + "learning_rate": 5.823176815531057e-05, + "loss": 0.2717, + "step": 38000 + }, + { + "epoch": 1.6895586078143752, + "grad_norm": 0.19725337624549866, + "learning_rate": 5.8211381110183826e-05, + "loss": 0.2714, + "step": 38010 + }, + { + "epoch": 1.6900031115259813, + "grad_norm": 0.1886989027261734, + "learning_rate": 5.8190992661956645e-05, + "loss": 0.2707, + "step": 38020 + }, + { + "epoch": 1.6904476152375874, + "grad_norm": 0.1885639876127243, + "learning_rate": 5.817060281411284e-05, + "loss": 0.2719, + "step": 38030 + }, + { + "epoch": 1.6908921189491932, + "grad_norm": 0.21608993411064148, + "learning_rate": 5.815021157013647e-05, + "loss": 0.2703, + "step": 38040 + }, + { + "epoch": 1.691336622660799, + "grad_norm": 0.20453009009361267, + "learning_rate": 5.8129818933511856e-05, + "loss": 0.2725, + "step": 38050 + }, + { + "epoch": 1.6917811263724052, + "grad_norm": 0.20932017266750336, + "learning_rate": 5.8109424907723544e-05, + "loss": 0.2722, + "step": 38060 + }, + { + "epoch": 1.6922256300840113, + "grad_norm": 0.1806226372718811, + "learning_rate": 5.80890294962563e-05, + "loss": 0.2719, + "step": 38070 + }, + { + "epoch": 1.6926701337956171, + "grad_norm": 0.16049712896347046, + "learning_rate": 5.806863270259515e-05, + "loss": 0.2718, + "step": 38080 + }, + { + "epoch": 1.693114637507223, + "grad_norm": 0.1885555237531662, + "learning_rate": 5.804823453022536e-05, + "loss": 0.2738, + "step": 38090 + }, + { + "epoch": 1.6935591412188291, + "grad_norm": 0.1805446594953537, + "learning_rate": 5.80278349826324e-05, + "loss": 0.272, + "step": 38100 + }, + { + "epoch": 1.6940036449304352, + "grad_norm": 0.1850568652153015, + "learning_rate": 5.8007434063302014e-05, + "loss": 0.2741, + "step": 38110 + }, + { + "epoch": 1.6944481486420413, + "grad_norm": 0.18533656001091003, + "learning_rate": 5.7987031775720136e-05, + "loss": 0.2735, + "step": 38120 + }, + { + "epoch": 1.6948926523536472, + "grad_norm": 0.16344155371189117, + "learning_rate": 5.7966628123372976e-05, + "loss": 0.2725, + "step": 38130 + }, + { + "epoch": 1.695337156065253, + "grad_norm": 0.1753750741481781, + "learning_rate": 5.7946223109746956e-05, + "loss": 0.2725, + "step": 38140 + }, + { + "epoch": 1.6957816597768591, + "grad_norm": 0.1727258265018463, + "learning_rate": 5.7925816738328754e-05, + "loss": 0.2729, + "step": 38150 + }, + { + "epoch": 1.6962261634884652, + "grad_norm": 0.16115306317806244, + "learning_rate": 5.790540901260521e-05, + "loss": 0.2724, + "step": 38160 + }, + { + "epoch": 1.696670667200071, + "grad_norm": 0.18335160613059998, + "learning_rate": 5.788499993606351e-05, + "loss": 0.2716, + "step": 38170 + }, + { + "epoch": 1.697115170911677, + "grad_norm": 0.16819845139980316, + "learning_rate": 5.786458951219096e-05, + "loss": 0.2741, + "step": 38180 + }, + { + "epoch": 1.697559674623283, + "grad_norm": 0.176747128367424, + "learning_rate": 5.784417774447517e-05, + "loss": 0.2712, + "step": 38190 + }, + { + "epoch": 1.6980041783348891, + "grad_norm": 0.2165624052286148, + "learning_rate": 5.782376463640393e-05, + "loss": 0.2727, + "step": 38200 + }, + { + "epoch": 1.6984486820464952, + "grad_norm": 0.1861310750246048, + "learning_rate": 5.780335019146531e-05, + "loss": 0.2734, + "step": 38210 + }, + { + "epoch": 1.698893185758101, + "grad_norm": 0.19108860194683075, + "learning_rate": 5.778293441314755e-05, + "loss": 0.2701, + "step": 38220 + }, + { + "epoch": 1.699337689469707, + "grad_norm": 0.1838565170764923, + "learning_rate": 5.776251730493917e-05, + "loss": 0.2726, + "step": 38230 + }, + { + "epoch": 1.699782193181313, + "grad_norm": 0.16640526056289673, + "learning_rate": 5.774209887032887e-05, + "loss": 0.2691, + "step": 38240 + }, + { + "epoch": 1.7002266968929192, + "grad_norm": 0.18789614737033844, + "learning_rate": 5.772167911280565e-05, + "loss": 0.2727, + "step": 38250 + }, + { + "epoch": 1.700671200604525, + "grad_norm": 0.16948258876800537, + "learning_rate": 5.770125803585864e-05, + "loss": 0.2753, + "step": 38260 + }, + { + "epoch": 1.701115704316131, + "grad_norm": 0.17323094606399536, + "learning_rate": 5.768083564297726e-05, + "loss": 0.2723, + "step": 38270 + }, + { + "epoch": 1.701560208027737, + "grad_norm": 0.18953458964824677, + "learning_rate": 5.766041193765114e-05, + "loss": 0.2707, + "step": 38280 + }, + { + "epoch": 1.702004711739343, + "grad_norm": 0.21781979501247406, + "learning_rate": 5.763998692337015e-05, + "loss": 0.2733, + "step": 38290 + }, + { + "epoch": 1.702449215450949, + "grad_norm": 0.14487427473068237, + "learning_rate": 5.761956060362433e-05, + "loss": 0.2707, + "step": 38300 + }, + { + "epoch": 1.702893719162555, + "grad_norm": 0.13407573103904724, + "learning_rate": 5.7599132981904e-05, + "loss": 0.2705, + "step": 38310 + }, + { + "epoch": 1.703338222874161, + "grad_norm": 0.17699284851551056, + "learning_rate": 5.75787040616997e-05, + "loss": 0.2683, + "step": 38320 + }, + { + "epoch": 1.703782726585767, + "grad_norm": 0.17241401970386505, + "learning_rate": 5.755827384650212e-05, + "loss": 0.2688, + "step": 38330 + }, + { + "epoch": 1.704227230297373, + "grad_norm": 0.19642874598503113, + "learning_rate": 5.753784233980228e-05, + "loss": 0.2715, + "step": 38340 + }, + { + "epoch": 1.704671734008979, + "grad_norm": 0.21424521505832672, + "learning_rate": 5.751740954509135e-05, + "loss": 0.2724, + "step": 38350 + }, + { + "epoch": 1.7051162377205848, + "grad_norm": 0.22586506605148315, + "learning_rate": 5.7496975465860715e-05, + "loss": 0.2697, + "step": 38360 + }, + { + "epoch": 1.705560741432191, + "grad_norm": 0.20113001763820648, + "learning_rate": 5.747654010560202e-05, + "loss": 0.2711, + "step": 38370 + }, + { + "epoch": 1.706005245143797, + "grad_norm": 0.2040012925863266, + "learning_rate": 5.7456103467807097e-05, + "loss": 0.2708, + "step": 38380 + }, + { + "epoch": 1.706449748855403, + "grad_norm": 0.186720073223114, + "learning_rate": 5.7435665555968046e-05, + "loss": 0.2701, + "step": 38390 + }, + { + "epoch": 1.7068942525670088, + "grad_norm": 0.1892739087343216, + "learning_rate": 5.74152263735771e-05, + "loss": 0.2704, + "step": 38400 + }, + { + "epoch": 1.7073387562786149, + "grad_norm": 0.1702302247285843, + "learning_rate": 5.739478592412677e-05, + "loss": 0.2722, + "step": 38410 + }, + { + "epoch": 1.707783259990221, + "grad_norm": 0.14816142618656158, + "learning_rate": 5.7374344211109766e-05, + "loss": 0.2738, + "step": 38420 + }, + { + "epoch": 1.708227763701827, + "grad_norm": 0.1559356302022934, + "learning_rate": 5.735390123801904e-05, + "loss": 0.2717, + "step": 38430 + }, + { + "epoch": 1.708672267413433, + "grad_norm": 0.17257265746593475, + "learning_rate": 5.7333457008347704e-05, + "loss": 0.2749, + "step": 38440 + }, + { + "epoch": 1.7091167711250388, + "grad_norm": 0.16692353785037994, + "learning_rate": 5.7313011525589156e-05, + "loss": 0.2736, + "step": 38450 + }, + { + "epoch": 1.7095612748366449, + "grad_norm": 0.14477072656154633, + "learning_rate": 5.729256479323694e-05, + "loss": 0.2699, + "step": 38460 + }, + { + "epoch": 1.710005778548251, + "grad_norm": 0.16830898821353912, + "learning_rate": 5.727211681478485e-05, + "loss": 0.2704, + "step": 38470 + }, + { + "epoch": 1.7104502822598568, + "grad_norm": 0.20513609051704407, + "learning_rate": 5.7251667593726886e-05, + "loss": 0.2712, + "step": 38480 + }, + { + "epoch": 1.7108947859714627, + "grad_norm": 0.17158550024032593, + "learning_rate": 5.723121713355728e-05, + "loss": 0.2737, + "step": 38490 + }, + { + "epoch": 1.7113392896830688, + "grad_norm": 0.19203855097293854, + "learning_rate": 5.721076543777044e-05, + "loss": 0.2759, + "step": 38500 + }, + { + "epoch": 1.711783793394675, + "grad_norm": 0.18123182654380798, + "learning_rate": 5.7190312509860986e-05, + "loss": 0.2712, + "step": 38510 + }, + { + "epoch": 1.712228297106281, + "grad_norm": 0.13495047390460968, + "learning_rate": 5.716985835332379e-05, + "loss": 0.272, + "step": 38520 + }, + { + "epoch": 1.7126728008178869, + "grad_norm": 0.15087643265724182, + "learning_rate": 5.714940297165389e-05, + "loss": 0.2722, + "step": 38530 + }, + { + "epoch": 1.7131173045294927, + "grad_norm": 0.1858915537595749, + "learning_rate": 5.712894636834656e-05, + "loss": 0.2711, + "step": 38540 + }, + { + "epoch": 1.7135618082410988, + "grad_norm": 0.19023078680038452, + "learning_rate": 5.7108488546897276e-05, + "loss": 0.2718, + "step": 38550 + }, + { + "epoch": 1.714006311952705, + "grad_norm": 0.22025197744369507, + "learning_rate": 5.708802951080172e-05, + "loss": 0.271, + "step": 38560 + }, + { + "epoch": 1.7144508156643108, + "grad_norm": 0.20291554927825928, + "learning_rate": 5.706756926355576e-05, + "loss": 0.272, + "step": 38570 + }, + { + "epoch": 1.7148953193759167, + "grad_norm": 0.1758091002702713, + "learning_rate": 5.704710780865554e-05, + "loss": 0.2719, + "step": 38580 + }, + { + "epoch": 1.7153398230875228, + "grad_norm": 0.198674276471138, + "learning_rate": 5.7026645149597325e-05, + "loss": 0.275, + "step": 38590 + }, + { + "epoch": 1.7157843267991288, + "grad_norm": 0.2047179788351059, + "learning_rate": 5.700618128987764e-05, + "loss": 0.2724, + "step": 38600 + }, + { + "epoch": 1.7162288305107347, + "grad_norm": 0.207895889878273, + "learning_rate": 5.698571623299317e-05, + "loss": 0.2714, + "step": 38610 + }, + { + "epoch": 1.7166733342223408, + "grad_norm": 0.19537939131259918, + "learning_rate": 5.696524998244086e-05, + "loss": 0.2755, + "step": 38620 + }, + { + "epoch": 1.7171178379339467, + "grad_norm": 0.2159481942653656, + "learning_rate": 5.6944782541717836e-05, + "loss": 0.2726, + "step": 38630 + }, + { + "epoch": 1.7175623416455528, + "grad_norm": 0.16745392978191376, + "learning_rate": 5.69243139143214e-05, + "loss": 0.2714, + "step": 38640 + }, + { + "epoch": 1.7180068453571589, + "grad_norm": 0.18168224394321442, + "learning_rate": 5.6903844103749125e-05, + "loss": 0.2752, + "step": 38650 + }, + { + "epoch": 1.7184513490687647, + "grad_norm": 0.17487666010856628, + "learning_rate": 5.688337311349869e-05, + "loss": 0.2725, + "step": 38660 + }, + { + "epoch": 1.7188958527803706, + "grad_norm": 0.16231350600719452, + "learning_rate": 5.6862900947068074e-05, + "loss": 0.2698, + "step": 38670 + }, + { + "epoch": 1.7193403564919767, + "grad_norm": 0.18138468265533447, + "learning_rate": 5.6842427607955374e-05, + "loss": 0.2685, + "step": 38680 + }, + { + "epoch": 1.7197848602035828, + "grad_norm": 0.19127337634563446, + "learning_rate": 5.682195309965893e-05, + "loss": 0.2685, + "step": 38690 + }, + { + "epoch": 1.7202293639151887, + "grad_norm": 0.1980864703655243, + "learning_rate": 5.6801477425677294e-05, + "loss": 0.2717, + "step": 38700 + }, + { + "epoch": 1.7206738676267945, + "grad_norm": 0.2157353162765503, + "learning_rate": 5.678100058950917e-05, + "loss": 0.2721, + "step": 38710 + }, + { + "epoch": 1.7211183713384006, + "grad_norm": 0.1785169243812561, + "learning_rate": 5.676052259465352e-05, + "loss": 0.2692, + "step": 38720 + }, + { + "epoch": 1.7215628750500067, + "grad_norm": 0.15501588582992554, + "learning_rate": 5.674004344460945e-05, + "loss": 0.2729, + "step": 38730 + }, + { + "epoch": 1.7220073787616128, + "grad_norm": 0.19992581009864807, + "learning_rate": 5.6719563142876295e-05, + "loss": 0.2735, + "step": 38740 + }, + { + "epoch": 1.7224518824732187, + "grad_norm": 0.1803990602493286, + "learning_rate": 5.669908169295359e-05, + "loss": 0.2721, + "step": 38750 + }, + { + "epoch": 1.7228963861848245, + "grad_norm": 0.18716314435005188, + "learning_rate": 5.667859909834105e-05, + "loss": 0.2734, + "step": 38760 + }, + { + "epoch": 1.7233408898964306, + "grad_norm": 0.20563332736492157, + "learning_rate": 5.6658115362538565e-05, + "loss": 0.2721, + "step": 38770 + }, + { + "epoch": 1.7237853936080367, + "grad_norm": 0.1950332671403885, + "learning_rate": 5.663763048904628e-05, + "loss": 0.2714, + "step": 38780 + }, + { + "epoch": 1.7242298973196426, + "grad_norm": 0.19455082714557648, + "learning_rate": 5.661714448136447e-05, + "loss": 0.2703, + "step": 38790 + }, + { + "epoch": 1.7246744010312485, + "grad_norm": 0.18267184495925903, + "learning_rate": 5.659665734299366e-05, + "loss": 0.2693, + "step": 38800 + }, + { + "epoch": 1.7251189047428546, + "grad_norm": 0.1934266984462738, + "learning_rate": 5.6576169077434516e-05, + "loss": 0.2729, + "step": 38810 + }, + { + "epoch": 1.7255634084544607, + "grad_norm": 0.18698258697986603, + "learning_rate": 5.6555679688187944e-05, + "loss": 0.2741, + "step": 38820 + }, + { + "epoch": 1.7260079121660667, + "grad_norm": 0.15763115882873535, + "learning_rate": 5.6535189178755e-05, + "loss": 0.2708, + "step": 38830 + }, + { + "epoch": 1.7264524158776726, + "grad_norm": 0.16823935508728027, + "learning_rate": 5.651469755263698e-05, + "loss": 0.2726, + "step": 38840 + }, + { + "epoch": 1.7268969195892785, + "grad_norm": 0.1585996299982071, + "learning_rate": 5.6494204813335316e-05, + "loss": 0.2693, + "step": 38850 + }, + { + "epoch": 1.7273414233008846, + "grad_norm": 0.20676244795322418, + "learning_rate": 5.647371096435168e-05, + "loss": 0.2711, + "step": 38860 + }, + { + "epoch": 1.7277859270124907, + "grad_norm": 0.20225386321544647, + "learning_rate": 5.645321600918788e-05, + "loss": 0.2731, + "step": 38870 + }, + { + "epoch": 1.7282304307240965, + "grad_norm": 0.2336128205060959, + "learning_rate": 5.643271995134597e-05, + "loss": 0.2718, + "step": 38880 + }, + { + "epoch": 1.7286749344357024, + "grad_norm": 0.1629551202058792, + "learning_rate": 5.641222279432814e-05, + "loss": 0.2675, + "step": 38890 + }, + { + "epoch": 1.7291194381473085, + "grad_norm": 0.17997440695762634, + "learning_rate": 5.6391724541636834e-05, + "loss": 0.2719, + "step": 38900 + }, + { + "epoch": 1.7295639418589146, + "grad_norm": 0.14870765805244446, + "learning_rate": 5.6371225196774605e-05, + "loss": 0.2706, + "step": 38910 + }, + { + "epoch": 1.7300084455705205, + "grad_norm": 0.17307324707508087, + "learning_rate": 5.635072476324423e-05, + "loss": 0.2704, + "step": 38920 + }, + { + "epoch": 1.7304529492821263, + "grad_norm": 0.14723430573940277, + "learning_rate": 5.63302232445487e-05, + "loss": 0.2705, + "step": 38930 + }, + { + "epoch": 1.7308974529937324, + "grad_norm": 0.19470547139644623, + "learning_rate": 5.6309720644191144e-05, + "loss": 0.2715, + "step": 38940 + }, + { + "epoch": 1.7313419567053385, + "grad_norm": 0.1934659779071808, + "learning_rate": 5.628921696567491e-05, + "loss": 0.2705, + "step": 38950 + }, + { + "epoch": 1.7317864604169446, + "grad_norm": 0.16463445127010345, + "learning_rate": 5.62687122125035e-05, + "loss": 0.2708, + "step": 38960 + }, + { + "epoch": 1.7322309641285505, + "grad_norm": 0.15710347890853882, + "learning_rate": 5.624820638818062e-05, + "loss": 0.2691, + "step": 38970 + }, + { + "epoch": 1.7326754678401564, + "grad_norm": 0.19121475517749786, + "learning_rate": 5.6227699496210164e-05, + "loss": 0.2722, + "step": 38980 + }, + { + "epoch": 1.7331199715517625, + "grad_norm": 0.190408393740654, + "learning_rate": 5.6207191540096195e-05, + "loss": 0.273, + "step": 38990 + }, + { + "epoch": 1.7335644752633685, + "grad_norm": 0.16993899643421173, + "learning_rate": 5.618668252334296e-05, + "loss": 0.2704, + "step": 39000 + }, + { + "epoch": 1.7340089789749744, + "grad_norm": 0.21140408515930176, + "learning_rate": 5.616617244945488e-05, + "loss": 0.2742, + "step": 39010 + }, + { + "epoch": 1.7344534826865803, + "grad_norm": 0.190747931599617, + "learning_rate": 5.614566132193656e-05, + "loss": 0.2702, + "step": 39020 + }, + { + "epoch": 1.7348979863981864, + "grad_norm": 0.16713155806064606, + "learning_rate": 5.612514914429282e-05, + "loss": 0.2715, + "step": 39030 + }, + { + "epoch": 1.7353424901097925, + "grad_norm": 0.1910816729068756, + "learning_rate": 5.610463592002863e-05, + "loss": 0.2727, + "step": 39040 + }, + { + "epoch": 1.7357869938213986, + "grad_norm": 0.19932414591312408, + "learning_rate": 5.608412165264909e-05, + "loss": 0.2744, + "step": 39050 + }, + { + "epoch": 1.7362314975330044, + "grad_norm": 0.18593189120292664, + "learning_rate": 5.606360634565959e-05, + "loss": 0.2697, + "step": 39060 + }, + { + "epoch": 1.7366760012446103, + "grad_norm": 0.15730856359004974, + "learning_rate": 5.604309000256559e-05, + "loss": 0.2697, + "step": 39070 + }, + { + "epoch": 1.7371205049562164, + "grad_norm": 0.1527421772480011, + "learning_rate": 5.6022572626872785e-05, + "loss": 0.269, + "step": 39080 + }, + { + "epoch": 1.7375650086678225, + "grad_norm": 0.19094279408454895, + "learning_rate": 5.600205422208704e-05, + "loss": 0.2731, + "step": 39090 + }, + { + "epoch": 1.7380095123794284, + "grad_norm": 0.21112872660160065, + "learning_rate": 5.59815347917144e-05, + "loss": 0.2685, + "step": 39100 + }, + { + "epoch": 1.7384540160910342, + "grad_norm": 0.17098139226436615, + "learning_rate": 5.596101433926103e-05, + "loss": 0.2722, + "step": 39110 + }, + { + "epoch": 1.7388985198026403, + "grad_norm": 0.1898183524608612, + "learning_rate": 5.5940492868233364e-05, + "loss": 0.2687, + "step": 39120 + }, + { + "epoch": 1.7393430235142464, + "grad_norm": 0.1413642317056656, + "learning_rate": 5.591997038213793e-05, + "loss": 0.2719, + "step": 39130 + }, + { + "epoch": 1.7397875272258523, + "grad_norm": 0.19374655187129974, + "learning_rate": 5.5899446884481475e-05, + "loss": 0.2734, + "step": 39140 + }, + { + "epoch": 1.7402320309374584, + "grad_norm": 0.20126710832118988, + "learning_rate": 5.5878922378770906e-05, + "loss": 0.2704, + "step": 39150 + }, + { + "epoch": 1.7406765346490642, + "grad_norm": 0.21838706731796265, + "learning_rate": 5.5858396868513285e-05, + "loss": 0.2692, + "step": 39160 + }, + { + "epoch": 1.7411210383606703, + "grad_norm": 0.18979395925998688, + "learning_rate": 5.583787035721586e-05, + "loss": 0.2699, + "step": 39170 + }, + { + "epoch": 1.7415655420722764, + "grad_norm": 0.16911214590072632, + "learning_rate": 5.581734284838606e-05, + "loss": 0.2716, + "step": 39180 + }, + { + "epoch": 1.7420100457838823, + "grad_norm": 0.1643724888563156, + "learning_rate": 5.579681434553147e-05, + "loss": 0.2693, + "step": 39190 + }, + { + "epoch": 1.7424545494954882, + "grad_norm": 0.15598180890083313, + "learning_rate": 5.5776284852159854e-05, + "loss": 0.2683, + "step": 39200 + }, + { + "epoch": 1.7428990532070943, + "grad_norm": 0.17632083594799042, + "learning_rate": 5.575575437177913e-05, + "loss": 0.269, + "step": 39210 + }, + { + "epoch": 1.7433435569187004, + "grad_norm": 0.17244230210781097, + "learning_rate": 5.573522290789742e-05, + "loss": 0.2718, + "step": 39220 + }, + { + "epoch": 1.7437880606303062, + "grad_norm": 0.15579168498516083, + "learning_rate": 5.571469046402298e-05, + "loss": 0.2687, + "step": 39230 + }, + { + "epoch": 1.744232564341912, + "grad_norm": 0.17983432114124298, + "learning_rate": 5.5694157043664205e-05, + "loss": 0.2717, + "step": 39240 + }, + { + "epoch": 1.7446770680535182, + "grad_norm": 0.1779630035161972, + "learning_rate": 5.567362265032975e-05, + "loss": 0.2731, + "step": 39250 + }, + { + "epoch": 1.7451215717651243, + "grad_norm": 0.16809479892253876, + "learning_rate": 5.565308728752836e-05, + "loss": 0.2686, + "step": 39260 + }, + { + "epoch": 1.7455660754767304, + "grad_norm": 0.16713567078113556, + "learning_rate": 5.5632550958768956e-05, + "loss": 0.2713, + "step": 39270 + }, + { + "epoch": 1.7460105791883362, + "grad_norm": 0.16761697828769684, + "learning_rate": 5.5612013667560636e-05, + "loss": 0.2697, + "step": 39280 + }, + { + "epoch": 1.7464550828999421, + "grad_norm": 0.19048544764518738, + "learning_rate": 5.5591475417412676e-05, + "loss": 0.2727, + "step": 39290 + }, + { + "epoch": 1.7468995866115482, + "grad_norm": 0.1751534789800644, + "learning_rate": 5.557093621183451e-05, + "loss": 0.2686, + "step": 39300 + }, + { + "epoch": 1.7473440903231543, + "grad_norm": 0.17052698135375977, + "learning_rate": 5.55503960543357e-05, + "loss": 0.2715, + "step": 39310 + }, + { + "epoch": 1.7477885940347602, + "grad_norm": 0.18976035714149475, + "learning_rate": 5.552985494842601e-05, + "loss": 0.2732, + "step": 39320 + }, + { + "epoch": 1.748233097746366, + "grad_norm": 0.16613368690013885, + "learning_rate": 5.550931289761534e-05, + "loss": 0.2717, + "step": 39330 + }, + { + "epoch": 1.7486776014579721, + "grad_norm": 0.1803520768880844, + "learning_rate": 5.548876990541378e-05, + "loss": 0.2734, + "step": 39340 + }, + { + "epoch": 1.7491221051695782, + "grad_norm": 0.2017059028148651, + "learning_rate": 5.5468225975331564e-05, + "loss": 0.2724, + "step": 39350 + }, + { + "epoch": 1.7495666088811843, + "grad_norm": 0.16560275852680206, + "learning_rate": 5.544768111087909e-05, + "loss": 0.2717, + "step": 39360 + }, + { + "epoch": 1.7500111125927902, + "grad_norm": 0.16023360192775726, + "learning_rate": 5.54271353155669e-05, + "loss": 0.2698, + "step": 39370 + }, + { + "epoch": 1.750455616304396, + "grad_norm": 0.19213709235191345, + "learning_rate": 5.5406588592905715e-05, + "loss": 0.2736, + "step": 39380 + }, + { + "epoch": 1.7509001200160021, + "grad_norm": 0.18782158195972443, + "learning_rate": 5.5386040946406416e-05, + "loss": 0.2713, + "step": 39390 + }, + { + "epoch": 1.7513446237276082, + "grad_norm": 0.17752328515052795, + "learning_rate": 5.536549237958004e-05, + "loss": 0.2701, + "step": 39400 + }, + { + "epoch": 1.751789127439214, + "grad_norm": 0.17920427024364471, + "learning_rate": 5.5344942895937744e-05, + "loss": 0.2717, + "step": 39410 + }, + { + "epoch": 1.75223363115082, + "grad_norm": 0.2157546877861023, + "learning_rate": 5.5324392498990904e-05, + "loss": 0.2677, + "step": 39420 + }, + { + "epoch": 1.752678134862426, + "grad_norm": 0.19310817122459412, + "learning_rate": 5.5303841192251e-05, + "loss": 0.2713, + "step": 39430 + }, + { + "epoch": 1.7531226385740322, + "grad_norm": 0.17266488075256348, + "learning_rate": 5.52832889792297e-05, + "loss": 0.2717, + "step": 39440 + }, + { + "epoch": 1.753567142285638, + "grad_norm": 0.20518921315670013, + "learning_rate": 5.526273586343881e-05, + "loss": 0.2703, + "step": 39450 + }, + { + "epoch": 1.7540116459972441, + "grad_norm": 0.16859492659568787, + "learning_rate": 5.5242181848390306e-05, + "loss": 0.2712, + "step": 39460 + }, + { + "epoch": 1.75445614970885, + "grad_norm": 0.16689351201057434, + "learning_rate": 5.5221626937596285e-05, + "loss": 0.2692, + "step": 39470 + }, + { + "epoch": 1.754900653420456, + "grad_norm": 0.17651519179344177, + "learning_rate": 5.520107113456903e-05, + "loss": 0.2714, + "step": 39480 + }, + { + "epoch": 1.7553451571320622, + "grad_norm": 0.20830774307250977, + "learning_rate": 5.5180514442820974e-05, + "loss": 0.2734, + "step": 39490 + }, + { + "epoch": 1.755789660843668, + "grad_norm": 0.16357840597629547, + "learning_rate": 5.515995686586469e-05, + "loss": 0.2716, + "step": 39500 + }, + { + "epoch": 1.756234164555274, + "grad_norm": 0.19953325390815735, + "learning_rate": 5.5139398407212916e-05, + "loss": 0.2735, + "step": 39510 + }, + { + "epoch": 1.75667866826688, + "grad_norm": 0.16763821244239807, + "learning_rate": 5.511883907037849e-05, + "loss": 0.2733, + "step": 39520 + }, + { + "epoch": 1.757123171978486, + "grad_norm": 0.17649808526039124, + "learning_rate": 5.509827885887449e-05, + "loss": 0.271, + "step": 39530 + }, + { + "epoch": 1.757567675690092, + "grad_norm": 0.19874997437000275, + "learning_rate": 5.507771777621406e-05, + "loss": 0.2686, + "step": 39540 + }, + { + "epoch": 1.7580121794016978, + "grad_norm": 0.17777135968208313, + "learning_rate": 5.505715582591052e-05, + "loss": 0.269, + "step": 39550 + }, + { + "epoch": 1.758456683113304, + "grad_norm": 0.1536773145198822, + "learning_rate": 5.50365930114774e-05, + "loss": 0.2747, + "step": 39560 + }, + { + "epoch": 1.75890118682491, + "grad_norm": 0.17021258175373077, + "learning_rate": 5.5016029336428255e-05, + "loss": 0.2736, + "step": 39570 + }, + { + "epoch": 1.7593456905365161, + "grad_norm": 0.18934953212738037, + "learning_rate": 5.49954648042769e-05, + "loss": 0.2713, + "step": 39580 + }, + { + "epoch": 1.759790194248122, + "grad_norm": 0.13532406091690063, + "learning_rate": 5.4974899418537226e-05, + "loss": 0.2718, + "step": 39590 + }, + { + "epoch": 1.7602346979597279, + "grad_norm": 0.17922239005565643, + "learning_rate": 5.4954333182723316e-05, + "loss": 0.2694, + "step": 39600 + }, + { + "epoch": 1.760679201671334, + "grad_norm": 0.19612471759319305, + "learning_rate": 5.493376610034937e-05, + "loss": 0.2727, + "step": 39610 + }, + { + "epoch": 1.76112370538294, + "grad_norm": 0.1797359734773636, + "learning_rate": 5.4913198174929735e-05, + "loss": 0.2702, + "step": 39620 + }, + { + "epoch": 1.761568209094546, + "grad_norm": 0.18709270656108856, + "learning_rate": 5.489262940997889e-05, + "loss": 0.2696, + "step": 39630 + }, + { + "epoch": 1.7620127128061518, + "grad_norm": 0.19595608115196228, + "learning_rate": 5.487205980901151e-05, + "loss": 0.2679, + "step": 39640 + }, + { + "epoch": 1.7624572165177579, + "grad_norm": 0.16219644248485565, + "learning_rate": 5.485148937554234e-05, + "loss": 0.2693, + "step": 39650 + }, + { + "epoch": 1.762901720229364, + "grad_norm": 0.15619884431362152, + "learning_rate": 5.483091811308635e-05, + "loss": 0.2712, + "step": 39660 + }, + { + "epoch": 1.76334622394097, + "grad_norm": 0.15050005912780762, + "learning_rate": 5.4810346025158564e-05, + "loss": 0.269, + "step": 39670 + }, + { + "epoch": 1.763790727652576, + "grad_norm": 0.17130392789840698, + "learning_rate": 5.478977311527421e-05, + "loss": 0.2686, + "step": 39680 + }, + { + "epoch": 1.7642352313641818, + "grad_norm": 0.17345842719078064, + "learning_rate": 5.476919938694863e-05, + "loss": 0.2686, + "step": 39690 + }, + { + "epoch": 1.764679735075788, + "grad_norm": 0.14661253988742828, + "learning_rate": 5.474862484369733e-05, + "loss": 0.2681, + "step": 39700 + }, + { + "epoch": 1.765124238787394, + "grad_norm": 0.16286474466323853, + "learning_rate": 5.472804948903589e-05, + "loss": 0.2698, + "step": 39710 + }, + { + "epoch": 1.7655687424989999, + "grad_norm": 0.18892575800418854, + "learning_rate": 5.470747332648013e-05, + "loss": 0.2711, + "step": 39720 + }, + { + "epoch": 1.7660132462106057, + "grad_norm": 0.17706914246082306, + "learning_rate": 5.468689635954591e-05, + "loss": 0.2715, + "step": 39730 + }, + { + "epoch": 1.7664577499222118, + "grad_norm": 0.16811485588550568, + "learning_rate": 5.46663185917493e-05, + "loss": 0.2725, + "step": 39740 + }, + { + "epoch": 1.766902253633818, + "grad_norm": 0.1686491221189499, + "learning_rate": 5.464574002660645e-05, + "loss": 0.2693, + "step": 39750 + }, + { + "epoch": 1.7673467573454238, + "grad_norm": 0.18984457850456238, + "learning_rate": 5.4625160667633724e-05, + "loss": 0.2697, + "step": 39760 + }, + { + "epoch": 1.7677912610570297, + "grad_norm": 0.16058231890201569, + "learning_rate": 5.4604580518347505e-05, + "loss": 0.2689, + "step": 39770 + }, + { + "epoch": 1.7682357647686358, + "grad_norm": 0.17109522223472595, + "learning_rate": 5.458399958226442e-05, + "loss": 0.2727, + "step": 39780 + }, + { + "epoch": 1.7686802684802418, + "grad_norm": 0.1546936184167862, + "learning_rate": 5.456341786290119e-05, + "loss": 0.2722, + "step": 39790 + }, + { + "epoch": 1.769124772191848, + "grad_norm": 0.17142757773399353, + "learning_rate": 5.454283536377465e-05, + "loss": 0.2694, + "step": 39800 + }, + { + "epoch": 1.7695692759034538, + "grad_norm": 0.17888154089450836, + "learning_rate": 5.452225208840179e-05, + "loss": 0.2726, + "step": 39810 + }, + { + "epoch": 1.7700137796150597, + "grad_norm": 0.172429621219635, + "learning_rate": 5.450166804029975e-05, + "loss": 0.2711, + "step": 39820 + }, + { + "epoch": 1.7704582833266658, + "grad_norm": 0.13389471173286438, + "learning_rate": 5.448108322298574e-05, + "loss": 0.2702, + "step": 39830 + }, + { + "epoch": 1.7709027870382719, + "grad_norm": 0.1545390635728836, + "learning_rate": 5.446049763997717e-05, + "loss": 0.2715, + "step": 39840 + }, + { + "epoch": 1.7713472907498777, + "grad_norm": 0.17835767567157745, + "learning_rate": 5.4439911294791546e-05, + "loss": 0.272, + "step": 39850 + }, + { + "epoch": 1.7717917944614836, + "grad_norm": 0.1790449321269989, + "learning_rate": 5.441932419094652e-05, + "loss": 0.2705, + "step": 39860 + }, + { + "epoch": 1.7722362981730897, + "grad_norm": 0.18930110335350037, + "learning_rate": 5.439873633195985e-05, + "loss": 0.2691, + "step": 39870 + }, + { + "epoch": 1.7726808018846958, + "grad_norm": 0.20019052922725677, + "learning_rate": 5.437814772134947e-05, + "loss": 0.2719, + "step": 39880 + }, + { + "epoch": 1.7731253055963019, + "grad_norm": 0.2107023447751999, + "learning_rate": 5.4357558362633366e-05, + "loss": 0.2707, + "step": 39890 + }, + { + "epoch": 1.7735698093079078, + "grad_norm": 0.17995303869247437, + "learning_rate": 5.4336968259329726e-05, + "loss": 0.2693, + "step": 39900 + }, + { + "epoch": 1.7740143130195136, + "grad_norm": 0.16530126333236694, + "learning_rate": 5.431637741495681e-05, + "loss": 0.2708, + "step": 39910 + }, + { + "epoch": 1.7744588167311197, + "grad_norm": 0.17797720432281494, + "learning_rate": 5.429578583303307e-05, + "loss": 0.2711, + "step": 39920 + }, + { + "epoch": 1.7749033204427258, + "grad_norm": 0.22498458623886108, + "learning_rate": 5.427519351707701e-05, + "loss": 0.2699, + "step": 39930 + }, + { + "epoch": 1.7753478241543317, + "grad_norm": 0.17660032212734222, + "learning_rate": 5.42546004706073e-05, + "loss": 0.2689, + "step": 39940 + }, + { + "epoch": 1.7757923278659375, + "grad_norm": 0.18163104355335236, + "learning_rate": 5.4234006697142735e-05, + "loss": 0.271, + "step": 39950 + }, + { + "epoch": 1.7762368315775436, + "grad_norm": 0.16206158697605133, + "learning_rate": 5.421341220020224e-05, + "loss": 0.2684, + "step": 39960 + }, + { + "epoch": 1.7766813352891497, + "grad_norm": 0.19787095487117767, + "learning_rate": 5.419281698330482e-05, + "loss": 0.2699, + "step": 39970 + }, + { + "epoch": 1.7771258390007556, + "grad_norm": 0.1968299150466919, + "learning_rate": 5.4172221049969665e-05, + "loss": 0.2702, + "step": 39980 + }, + { + "epoch": 1.7775703427123617, + "grad_norm": 0.16746027767658234, + "learning_rate": 5.415162440371604e-05, + "loss": 0.2723, + "step": 39990 + }, + { + "epoch": 1.7780148464239676, + "grad_norm": 0.19231268763542175, + "learning_rate": 5.413102704806334e-05, + "loss": 0.2679, + "step": 40000 + }, + { + "epoch": 1.7784593501355737, + "grad_norm": 0.17598193883895874, + "learning_rate": 5.41104289865311e-05, + "loss": 0.2683, + "step": 40010 + }, + { + "epoch": 1.7789038538471798, + "grad_norm": 0.16687165200710297, + "learning_rate": 5.408983022263898e-05, + "loss": 0.2695, + "step": 40020 + }, + { + "epoch": 1.7793483575587856, + "grad_norm": 0.20273229479789734, + "learning_rate": 5.406923075990673e-05, + "loss": 0.2724, + "step": 40030 + }, + { + "epoch": 1.7797928612703915, + "grad_norm": 0.21951834857463837, + "learning_rate": 5.404863060185423e-05, + "loss": 0.2685, + "step": 40040 + }, + { + "epoch": 1.7802373649819976, + "grad_norm": 0.16695064306259155, + "learning_rate": 5.40280297520015e-05, + "loss": 0.269, + "step": 40050 + }, + { + "epoch": 1.7806818686936037, + "grad_norm": 0.14864389598369598, + "learning_rate": 5.400742821386865e-05, + "loss": 0.2705, + "step": 40060 + }, + { + "epoch": 1.7811263724052095, + "grad_norm": 0.17526863515377045, + "learning_rate": 5.398682599097592e-05, + "loss": 0.2733, + "step": 40070 + }, + { + "epoch": 1.7815708761168154, + "grad_norm": 0.16877733170986176, + "learning_rate": 5.396622308684367e-05, + "loss": 0.2735, + "step": 40080 + }, + { + "epoch": 1.7820153798284215, + "grad_norm": 0.18618524074554443, + "learning_rate": 5.394561950499236e-05, + "loss": 0.2686, + "step": 40090 + }, + { + "epoch": 1.7824598835400276, + "grad_norm": 0.16283287107944489, + "learning_rate": 5.39250152489426e-05, + "loss": 0.2684, + "step": 40100 + }, + { + "epoch": 1.7829043872516337, + "grad_norm": 0.1780388355255127, + "learning_rate": 5.390441032221507e-05, + "loss": 0.2686, + "step": 40110 + }, + { + "epoch": 1.7833488909632396, + "grad_norm": 0.18114148080348969, + "learning_rate": 5.388380472833062e-05, + "loss": 0.2676, + "step": 40120 + }, + { + "epoch": 1.7837933946748454, + "grad_norm": 0.1732153594493866, + "learning_rate": 5.386319847081016e-05, + "loss": 0.2707, + "step": 40130 + }, + { + "epoch": 1.7842378983864515, + "grad_norm": 0.17620903253555298, + "learning_rate": 5.384259155317473e-05, + "loss": 0.2684, + "step": 40140 + }, + { + "epoch": 1.7846824020980576, + "grad_norm": 0.19489896297454834, + "learning_rate": 5.38219839789455e-05, + "loss": 0.2707, + "step": 40150 + }, + { + "epoch": 1.7851269058096635, + "grad_norm": 0.21913565695285797, + "learning_rate": 5.380137575164376e-05, + "loss": 0.2713, + "step": 40160 + }, + { + "epoch": 1.7855714095212694, + "grad_norm": 0.17836475372314453, + "learning_rate": 5.378076687479085e-05, + "loss": 0.2715, + "step": 40170 + }, + { + "epoch": 1.7860159132328755, + "grad_norm": 0.17556393146514893, + "learning_rate": 5.3760157351908305e-05, + "loss": 0.2693, + "step": 40180 + }, + { + "epoch": 1.7864604169444815, + "grad_norm": 0.17616809904575348, + "learning_rate": 5.373954718651768e-05, + "loss": 0.2687, + "step": 40190 + }, + { + "epoch": 1.7869049206560876, + "grad_norm": 0.1663760244846344, + "learning_rate": 5.371893638214074e-05, + "loss": 0.2686, + "step": 40200 + }, + { + "epoch": 1.7873494243676935, + "grad_norm": 0.17249497771263123, + "learning_rate": 5.369832494229927e-05, + "loss": 0.2705, + "step": 40210 + }, + { + "epoch": 1.7877939280792994, + "grad_norm": 0.17630325257778168, + "learning_rate": 5.367771287051524e-05, + "loss": 0.271, + "step": 40220 + }, + { + "epoch": 1.7882384317909055, + "grad_norm": 0.20277518033981323, + "learning_rate": 5.365710017031065e-05, + "loss": 0.2698, + "step": 40230 + }, + { + "epoch": 1.7886829355025116, + "grad_norm": 0.21024945378303528, + "learning_rate": 5.3636486845207654e-05, + "loss": 0.2697, + "step": 40240 + }, + { + "epoch": 1.7891274392141174, + "grad_norm": 0.2145407646894455, + "learning_rate": 5.3615872898728526e-05, + "loss": 0.2709, + "step": 40250 + }, + { + "epoch": 1.7895719429257233, + "grad_norm": 0.16325032711029053, + "learning_rate": 5.3595258334395614e-05, + "loss": 0.2705, + "step": 40260 + }, + { + "epoch": 1.7900164466373294, + "grad_norm": 0.19066694378852844, + "learning_rate": 5.3574643155731396e-05, + "loss": 0.2695, + "step": 40270 + }, + { + "epoch": 1.7904609503489355, + "grad_norm": 0.2012978196144104, + "learning_rate": 5.3554027366258405e-05, + "loss": 0.269, + "step": 40280 + }, + { + "epoch": 1.7909054540605414, + "grad_norm": 0.2045646756887436, + "learning_rate": 5.3533410969499355e-05, + "loss": 0.269, + "step": 40290 + }, + { + "epoch": 1.7913499577721474, + "grad_norm": 0.16724646091461182, + "learning_rate": 5.3512793968977e-05, + "loss": 0.2708, + "step": 40300 + }, + { + "epoch": 1.7917944614837533, + "grad_norm": 0.16231060028076172, + "learning_rate": 5.349217636821423e-05, + "loss": 0.2691, + "step": 40310 + }, + { + "epoch": 1.7922389651953594, + "grad_norm": 0.2000524252653122, + "learning_rate": 5.347155817073405e-05, + "loss": 0.2698, + "step": 40320 + }, + { + "epoch": 1.7926834689069655, + "grad_norm": 0.20569412410259247, + "learning_rate": 5.34509393800595e-05, + "loss": 0.2702, + "step": 40330 + }, + { + "epoch": 1.7931279726185714, + "grad_norm": 0.1712813675403595, + "learning_rate": 5.343031999971381e-05, + "loss": 0.2714, + "step": 40340 + }, + { + "epoch": 1.7935724763301772, + "grad_norm": 0.15291886031627655, + "learning_rate": 5.340970003322026e-05, + "loss": 0.2735, + "step": 40350 + }, + { + "epoch": 1.7940169800417833, + "grad_norm": 0.1875515878200531, + "learning_rate": 5.33890794841022e-05, + "loss": 0.2715, + "step": 40360 + }, + { + "epoch": 1.7944614837533894, + "grad_norm": 0.15845175087451935, + "learning_rate": 5.336845835588318e-05, + "loss": 0.2691, + "step": 40370 + }, + { + "epoch": 1.7949059874649953, + "grad_norm": 0.1955765336751938, + "learning_rate": 5.334783665208674e-05, + "loss": 0.2691, + "step": 40380 + }, + { + "epoch": 1.7953504911766012, + "grad_norm": 0.15770752727985382, + "learning_rate": 5.332721437623657e-05, + "loss": 0.2692, + "step": 40390 + }, + { + "epoch": 1.7957949948882073, + "grad_norm": 0.19124910235404968, + "learning_rate": 5.3306591531856464e-05, + "loss": 0.2693, + "step": 40400 + }, + { + "epoch": 1.7962394985998134, + "grad_norm": 0.1968972235918045, + "learning_rate": 5.3285968122470295e-05, + "loss": 0.2708, + "step": 40410 + }, + { + "epoch": 1.7966840023114194, + "grad_norm": 0.1816413551568985, + "learning_rate": 5.326534415160207e-05, + "loss": 0.2704, + "step": 40420 + }, + { + "epoch": 1.7971285060230253, + "grad_norm": 0.16220276057720184, + "learning_rate": 5.324471962277582e-05, + "loss": 0.2692, + "step": 40430 + }, + { + "epoch": 1.7975730097346312, + "grad_norm": 0.17925231158733368, + "learning_rate": 5.3224094539515746e-05, + "loss": 0.2729, + "step": 40440 + }, + { + "epoch": 1.7980175134462373, + "grad_norm": 0.18592865765094757, + "learning_rate": 5.3203468905346076e-05, + "loss": 0.2722, + "step": 40450 + }, + { + "epoch": 1.7984620171578434, + "grad_norm": 0.19275601208209991, + "learning_rate": 5.3182842723791195e-05, + "loss": 0.2689, + "step": 40460 + }, + { + "epoch": 1.7989065208694492, + "grad_norm": 0.18965895473957062, + "learning_rate": 5.316221599837554e-05, + "loss": 0.2699, + "step": 40470 + }, + { + "epoch": 1.7993510245810551, + "grad_norm": 0.16776303946971893, + "learning_rate": 5.314158873262366e-05, + "loss": 0.2697, + "step": 40480 + }, + { + "epoch": 1.7997955282926612, + "grad_norm": 0.16939552128314972, + "learning_rate": 5.312096093006018e-05, + "loss": 0.2674, + "step": 40490 + }, + { + "epoch": 1.8002400320042673, + "grad_norm": 0.17378072440624237, + "learning_rate": 5.3100332594209846e-05, + "loss": 0.2676, + "step": 40500 + }, + { + "epoch": 1.8006845357158734, + "grad_norm": 0.16682592034339905, + "learning_rate": 5.3079703728597454e-05, + "loss": 0.2715, + "step": 40510 + }, + { + "epoch": 1.8011290394274793, + "grad_norm": 0.1854587197303772, + "learning_rate": 5.305907433674794e-05, + "loss": 0.2714, + "step": 40520 + }, + { + "epoch": 1.8015735431390851, + "grad_norm": 0.20935547351837158, + "learning_rate": 5.3038444422186284e-05, + "loss": 0.2704, + "step": 40530 + }, + { + "epoch": 1.8020180468506912, + "grad_norm": 0.15976737439632416, + "learning_rate": 5.301781398843757e-05, + "loss": 0.2714, + "step": 40540 + }, + { + "epoch": 1.8024625505622973, + "grad_norm": 0.18487675487995148, + "learning_rate": 5.299718303902699e-05, + "loss": 0.2661, + "step": 40550 + }, + { + "epoch": 1.8029070542739032, + "grad_norm": 0.16286011040210724, + "learning_rate": 5.297655157747979e-05, + "loss": 0.2692, + "step": 40560 + }, + { + "epoch": 1.803351557985509, + "grad_norm": 0.17307260632514954, + "learning_rate": 5.295591960732136e-05, + "loss": 0.2679, + "step": 40570 + }, + { + "epoch": 1.8037960616971151, + "grad_norm": 0.16000205278396606, + "learning_rate": 5.293528713207708e-05, + "loss": 0.2707, + "step": 40580 + }, + { + "epoch": 1.8042405654087212, + "grad_norm": 0.18884144723415375, + "learning_rate": 5.291465415527253e-05, + "loss": 0.2703, + "step": 40590 + }, + { + "epoch": 1.8046850691203271, + "grad_norm": 0.17359547317028046, + "learning_rate": 5.2894020680433296e-05, + "loss": 0.2707, + "step": 40600 + }, + { + "epoch": 1.805129572831933, + "grad_norm": 0.1837819218635559, + "learning_rate": 5.287338671108507e-05, + "loss": 0.27, + "step": 40610 + }, + { + "epoch": 1.805574076543539, + "grad_norm": 0.2090587466955185, + "learning_rate": 5.285275225075367e-05, + "loss": 0.2688, + "step": 40620 + }, + { + "epoch": 1.8060185802551452, + "grad_norm": 0.191456139087677, + "learning_rate": 5.283211730296492e-05, + "loss": 0.269, + "step": 40630 + }, + { + "epoch": 1.8064630839667513, + "grad_norm": 0.1797323226928711, + "learning_rate": 5.281148187124477e-05, + "loss": 0.2677, + "step": 40640 + }, + { + "epoch": 1.8069075876783571, + "grad_norm": 0.1528487652540207, + "learning_rate": 5.279084595911927e-05, + "loss": 0.2709, + "step": 40650 + }, + { + "epoch": 1.807352091389963, + "grad_norm": 0.1958988457918167, + "learning_rate": 5.277020957011453e-05, + "loss": 0.2705, + "step": 40660 + }, + { + "epoch": 1.807796595101569, + "grad_norm": 0.16596807539463043, + "learning_rate": 5.274957270775673e-05, + "loss": 0.2715, + "step": 40670 + }, + { + "epoch": 1.8082410988131752, + "grad_norm": 0.16225668787956238, + "learning_rate": 5.2728935375572164e-05, + "loss": 0.2714, + "step": 40680 + }, + { + "epoch": 1.808685602524781, + "grad_norm": 0.18815040588378906, + "learning_rate": 5.2708297577087176e-05, + "loss": 0.2691, + "step": 40690 + }, + { + "epoch": 1.809130106236387, + "grad_norm": 0.19331692159175873, + "learning_rate": 5.2687659315828196e-05, + "loss": 0.268, + "step": 40700 + }, + { + "epoch": 1.809574609947993, + "grad_norm": 0.16059798002243042, + "learning_rate": 5.266702059532175e-05, + "loss": 0.2683, + "step": 40710 + }, + { + "epoch": 1.810019113659599, + "grad_norm": 0.19447696208953857, + "learning_rate": 5.264638141909444e-05, + "loss": 0.2701, + "step": 40720 + }, + { + "epoch": 1.8104636173712052, + "grad_norm": 0.21734869480133057, + "learning_rate": 5.2625741790672925e-05, + "loss": 0.2714, + "step": 40730 + }, + { + "epoch": 1.810908121082811, + "grad_norm": 0.1816960573196411, + "learning_rate": 5.2605101713583935e-05, + "loss": 0.2696, + "step": 40740 + }, + { + "epoch": 1.811352624794417, + "grad_norm": 0.19766093790531158, + "learning_rate": 5.2584461191354315e-05, + "loss": 0.2716, + "step": 40750 + }, + { + "epoch": 1.811797128506023, + "grad_norm": 0.1682954579591751, + "learning_rate": 5.2563820227510964e-05, + "loss": 0.271, + "step": 40760 + }, + { + "epoch": 1.8122416322176291, + "grad_norm": 0.1580345332622528, + "learning_rate": 5.2543178825580864e-05, + "loss": 0.2699, + "step": 40770 + }, + { + "epoch": 1.812686135929235, + "grad_norm": 0.18131987750530243, + "learning_rate": 5.2522536989091054e-05, + "loss": 0.269, + "step": 40780 + }, + { + "epoch": 1.8131306396408409, + "grad_norm": 0.1632091999053955, + "learning_rate": 5.2501894721568655e-05, + "loss": 0.27, + "step": 40790 + }, + { + "epoch": 1.813575143352447, + "grad_norm": 0.16469430923461914, + "learning_rate": 5.248125202654089e-05, + "loss": 0.2665, + "step": 40800 + }, + { + "epoch": 1.814019647064053, + "grad_norm": 0.1575835943222046, + "learning_rate": 5.246060890753501e-05, + "loss": 0.2692, + "step": 40810 + }, + { + "epoch": 1.814464150775659, + "grad_norm": 0.1508837342262268, + "learning_rate": 5.243996536807837e-05, + "loss": 0.268, + "step": 40820 + }, + { + "epoch": 1.814908654487265, + "grad_norm": 0.1751256287097931, + "learning_rate": 5.2419321411698384e-05, + "loss": 0.2678, + "step": 40830 + }, + { + "epoch": 1.8153531581988709, + "grad_norm": 0.1998724788427353, + "learning_rate": 5.239867704192253e-05, + "loss": 0.2682, + "step": 40840 + }, + { + "epoch": 1.815797661910477, + "grad_norm": 0.17317089438438416, + "learning_rate": 5.237803226227838e-05, + "loss": 0.2717, + "step": 40850 + }, + { + "epoch": 1.816242165622083, + "grad_norm": 0.21714520454406738, + "learning_rate": 5.235738707629354e-05, + "loss": 0.269, + "step": 40860 + }, + { + "epoch": 1.816686669333689, + "grad_norm": 0.18765048682689667, + "learning_rate": 5.233674148749575e-05, + "loss": 0.2699, + "step": 40870 + }, + { + "epoch": 1.8171311730452948, + "grad_norm": 0.18045499920845032, + "learning_rate": 5.231609549941272e-05, + "loss": 0.2679, + "step": 40880 + }, + { + "epoch": 1.817575676756901, + "grad_norm": 0.16748687624931335, + "learning_rate": 5.229544911557233e-05, + "loss": 0.268, + "step": 40890 + }, + { + "epoch": 1.818020180468507, + "grad_norm": 0.16534119844436646, + "learning_rate": 5.227480233950246e-05, + "loss": 0.2698, + "step": 40900 + }, + { + "epoch": 1.8184646841801129, + "grad_norm": 0.15863269567489624, + "learning_rate": 5.22541551747311e-05, + "loss": 0.2678, + "step": 40910 + }, + { + "epoch": 1.8189091878917187, + "grad_norm": 0.1759074479341507, + "learning_rate": 5.223350762478626e-05, + "loss": 0.2697, + "step": 40920 + }, + { + "epoch": 1.8193536916033248, + "grad_norm": 0.17752017080783844, + "learning_rate": 5.221285969319606e-05, + "loss": 0.271, + "step": 40930 + }, + { + "epoch": 1.819798195314931, + "grad_norm": 0.20194967091083527, + "learning_rate": 5.219221138348865e-05, + "loss": 0.2679, + "step": 40940 + }, + { + "epoch": 1.820242699026537, + "grad_norm": 0.18993887305259705, + "learning_rate": 5.217156269919228e-05, + "loss": 0.2693, + "step": 40950 + }, + { + "epoch": 1.8206872027381429, + "grad_norm": 0.18935109674930573, + "learning_rate": 5.215091364383523e-05, + "loss": 0.2705, + "step": 40960 + }, + { + "epoch": 1.8211317064497488, + "grad_norm": 0.20578642189502716, + "learning_rate": 5.213026422094588e-05, + "loss": 0.2685, + "step": 40970 + }, + { + "epoch": 1.8215762101613548, + "grad_norm": 0.19338436424732208, + "learning_rate": 5.210961443405262e-05, + "loss": 0.2677, + "step": 40980 + }, + { + "epoch": 1.822020713872961, + "grad_norm": 0.1706276386976242, + "learning_rate": 5.208896428668396e-05, + "loss": 0.2668, + "step": 40990 + }, + { + "epoch": 1.8224652175845668, + "grad_norm": 0.18378330767154694, + "learning_rate": 5.206831378236845e-05, + "loss": 0.2681, + "step": 41000 + }, + { + "epoch": 1.8229097212961727, + "grad_norm": 0.1506209522485733, + "learning_rate": 5.2047662924634666e-05, + "loss": 0.2675, + "step": 41010 + }, + { + "epoch": 1.8233542250077788, + "grad_norm": 0.13662089407444, + "learning_rate": 5.20270117170113e-05, + "loss": 0.2684, + "step": 41020 + }, + { + "epoch": 1.8237987287193849, + "grad_norm": 0.156500905752182, + "learning_rate": 5.200636016302707e-05, + "loss": 0.2668, + "step": 41030 + }, + { + "epoch": 1.824243232430991, + "grad_norm": 0.16269175708293915, + "learning_rate": 5.1985708266210754e-05, + "loss": 0.2653, + "step": 41040 + }, + { + "epoch": 1.8246877361425968, + "grad_norm": 0.1609772890806198, + "learning_rate": 5.1965056030091206e-05, + "loss": 0.2687, + "step": 41050 + }, + { + "epoch": 1.8251322398542027, + "grad_norm": 0.20527832210063934, + "learning_rate": 5.194440345819732e-05, + "loss": 0.2689, + "step": 41060 + }, + { + "epoch": 1.8255767435658088, + "grad_norm": 0.18378978967666626, + "learning_rate": 5.1923750554058084e-05, + "loss": 0.2683, + "step": 41070 + }, + { + "epoch": 1.8260212472774149, + "grad_norm": 0.1527072638273239, + "learning_rate": 5.1903097321202476e-05, + "loss": 0.2691, + "step": 41080 + }, + { + "epoch": 1.8264657509890208, + "grad_norm": 0.16095231473445892, + "learning_rate": 5.18824437631596e-05, + "loss": 0.2739, + "step": 41090 + }, + { + "epoch": 1.8269102547006266, + "grad_norm": 0.14078330993652344, + "learning_rate": 5.186178988345856e-05, + "loss": 0.2683, + "step": 41100 + }, + { + "epoch": 1.8273547584122327, + "grad_norm": 0.17818476259708405, + "learning_rate": 5.184113568562855e-05, + "loss": 0.2709, + "step": 41110 + }, + { + "epoch": 1.8277992621238388, + "grad_norm": 0.18324999511241913, + "learning_rate": 5.18204811731988e-05, + "loss": 0.2704, + "step": 41120 + }, + { + "epoch": 1.8282437658354447, + "grad_norm": 0.1699710339307785, + "learning_rate": 5.179982634969862e-05, + "loss": 0.2687, + "step": 41130 + }, + { + "epoch": 1.8286882695470508, + "grad_norm": 0.1684783548116684, + "learning_rate": 5.177917121865734e-05, + "loss": 0.2697, + "step": 41140 + }, + { + "epoch": 1.8291327732586566, + "grad_norm": 0.15368016064167023, + "learning_rate": 5.1758515783604346e-05, + "loss": 0.268, + "step": 41150 + }, + { + "epoch": 1.8295772769702627, + "grad_norm": 0.15880055725574493, + "learning_rate": 5.173786004806911e-05, + "loss": 0.2708, + "step": 41160 + }, + { + "epoch": 1.8300217806818688, + "grad_norm": 0.19706076383590698, + "learning_rate": 5.1717204015581135e-05, + "loss": 0.271, + "step": 41170 + }, + { + "epoch": 1.8304662843934747, + "grad_norm": 0.18534782528877258, + "learning_rate": 5.169654768966994e-05, + "loss": 0.272, + "step": 41180 + }, + { + "epoch": 1.8309107881050806, + "grad_norm": 0.20514258742332458, + "learning_rate": 5.1675891073865156e-05, + "loss": 0.2705, + "step": 41190 + }, + { + "epoch": 1.8313552918166867, + "grad_norm": 0.17659543454647064, + "learning_rate": 5.1655234171696424e-05, + "loss": 0.2675, + "step": 41200 + }, + { + "epoch": 1.8317997955282928, + "grad_norm": 0.1548663228750229, + "learning_rate": 5.163457698669343e-05, + "loss": 0.2731, + "step": 41210 + }, + { + "epoch": 1.8322442992398986, + "grad_norm": 0.16308818757534027, + "learning_rate": 5.1613919522385936e-05, + "loss": 0.2714, + "step": 41220 + }, + { + "epoch": 1.8326888029515045, + "grad_norm": 0.14263446629047394, + "learning_rate": 5.1593261782303746e-05, + "loss": 0.2695, + "step": 41230 + }, + { + "epoch": 1.8331333066631106, + "grad_norm": 0.15783600509166718, + "learning_rate": 5.157260376997669e-05, + "loss": 0.268, + "step": 41240 + }, + { + "epoch": 1.8335778103747167, + "grad_norm": 0.18757028877735138, + "learning_rate": 5.155194548893466e-05, + "loss": 0.2693, + "step": 41250 + }, + { + "epoch": 1.8340223140863228, + "grad_norm": 0.21349427103996277, + "learning_rate": 5.153128694270759e-05, + "loss": 0.2683, + "step": 41260 + }, + { + "epoch": 1.8344668177979286, + "grad_norm": 0.17480672895908356, + "learning_rate": 5.151062813482548e-05, + "loss": 0.2671, + "step": 41270 + }, + { + "epoch": 1.8349113215095345, + "grad_norm": 0.18456466495990753, + "learning_rate": 5.148996906881832e-05, + "loss": 0.268, + "step": 41280 + }, + { + "epoch": 1.8353558252211406, + "grad_norm": 0.16250966489315033, + "learning_rate": 5.1469309748216196e-05, + "loss": 0.269, + "step": 41290 + }, + { + "epoch": 1.8358003289327467, + "grad_norm": 0.17771214246749878, + "learning_rate": 5.144865017654923e-05, + "loss": 0.2685, + "step": 41300 + }, + { + "epoch": 1.8362448326443526, + "grad_norm": 0.16934415698051453, + "learning_rate": 5.1427990357347556e-05, + "loss": 0.2658, + "step": 41310 + }, + { + "epoch": 1.8366893363559584, + "grad_norm": 0.19988024234771729, + "learning_rate": 5.140733029414139e-05, + "loss": 0.2704, + "step": 41320 + }, + { + "epoch": 1.8371338400675645, + "grad_norm": 0.16514062881469727, + "learning_rate": 5.138666999046098e-05, + "loss": 0.271, + "step": 41330 + }, + { + "epoch": 1.8375783437791706, + "grad_norm": 0.14672760665416718, + "learning_rate": 5.136600944983658e-05, + "loss": 0.2684, + "step": 41340 + }, + { + "epoch": 1.8380228474907767, + "grad_norm": 0.1598864495754242, + "learning_rate": 5.134534867579853e-05, + "loss": 0.2691, + "step": 41350 + }, + { + "epoch": 1.8384673512023826, + "grad_norm": 0.19569627940654755, + "learning_rate": 5.1324687671877184e-05, + "loss": 0.268, + "step": 41360 + }, + { + "epoch": 1.8389118549139885, + "grad_norm": 0.19025956094264984, + "learning_rate": 5.130402644160296e-05, + "loss": 0.2684, + "step": 41370 + }, + { + "epoch": 1.8393563586255945, + "grad_norm": 0.16914252936840057, + "learning_rate": 5.128336498850628e-05, + "loss": 0.2677, + "step": 41380 + }, + { + "epoch": 1.8398008623372006, + "grad_norm": 0.18425898253917694, + "learning_rate": 5.126270331611761e-05, + "loss": 0.2675, + "step": 41390 + }, + { + "epoch": 1.8402453660488065, + "grad_norm": 0.1838991343975067, + "learning_rate": 5.124204142796748e-05, + "loss": 0.2689, + "step": 41400 + }, + { + "epoch": 1.8406898697604124, + "grad_norm": 0.17372022569179535, + "learning_rate": 5.122137932758644e-05, + "loss": 0.2679, + "step": 41410 + }, + { + "epoch": 1.8411343734720185, + "grad_norm": 0.14663545787334442, + "learning_rate": 5.1200717018505074e-05, + "loss": 0.2699, + "step": 41420 + }, + { + "epoch": 1.8415788771836246, + "grad_norm": 0.1573220044374466, + "learning_rate": 5.118005450425403e-05, + "loss": 0.2707, + "step": 41430 + }, + { + "epoch": 1.8420233808952304, + "grad_norm": 0.14019683003425598, + "learning_rate": 5.115939178836392e-05, + "loss": 0.2675, + "step": 41440 + }, + { + "epoch": 1.8424678846068363, + "grad_norm": 0.160489022731781, + "learning_rate": 5.113872887436547e-05, + "loss": 0.271, + "step": 41450 + }, + { + "epoch": 1.8429123883184424, + "grad_norm": 0.20540104806423187, + "learning_rate": 5.11180657657894e-05, + "loss": 0.2683, + "step": 41460 + }, + { + "epoch": 1.8433568920300485, + "grad_norm": 0.1845792680978775, + "learning_rate": 5.109740246616649e-05, + "loss": 0.2671, + "step": 41470 + }, + { + "epoch": 1.8438013957416546, + "grad_norm": 0.15941324830055237, + "learning_rate": 5.10767389790275e-05, + "loss": 0.2681, + "step": 41480 + }, + { + "epoch": 1.8442458994532605, + "grad_norm": 0.1597800850868225, + "learning_rate": 5.1056075307903265e-05, + "loss": 0.2715, + "step": 41490 + }, + { + "epoch": 1.8446904031648663, + "grad_norm": 0.1847238689661026, + "learning_rate": 5.1035411456324634e-05, + "loss": 0.2677, + "step": 41500 + }, + { + "epoch": 1.8451349068764724, + "grad_norm": 0.17517440021038055, + "learning_rate": 5.1014747427822504e-05, + "loss": 0.2708, + "step": 41510 + }, + { + "epoch": 1.8455794105880785, + "grad_norm": 0.1753060668706894, + "learning_rate": 5.0994083225927804e-05, + "loss": 0.2685, + "step": 41520 + }, + { + "epoch": 1.8460239142996844, + "grad_norm": 0.17510062456130981, + "learning_rate": 5.0973418854171475e-05, + "loss": 0.27, + "step": 41530 + }, + { + "epoch": 1.8464684180112902, + "grad_norm": 0.1965351402759552, + "learning_rate": 5.095275431608447e-05, + "loss": 0.2665, + "step": 41540 + }, + { + "epoch": 1.8469129217228963, + "grad_norm": 0.1828535497188568, + "learning_rate": 5.093208961519782e-05, + "loss": 0.2696, + "step": 41550 + }, + { + "epoch": 1.8473574254345024, + "grad_norm": 0.16297025978565216, + "learning_rate": 5.091142475504255e-05, + "loss": 0.2675, + "step": 41560 + }, + { + "epoch": 1.8478019291461085, + "grad_norm": 0.2034635990858078, + "learning_rate": 5.089075973914971e-05, + "loss": 0.2716, + "step": 41570 + }, + { + "epoch": 1.8482464328577144, + "grad_norm": 0.21986502408981323, + "learning_rate": 5.08700945710504e-05, + "loss": 0.2686, + "step": 41580 + }, + { + "epoch": 1.8486909365693203, + "grad_norm": 0.17981696128845215, + "learning_rate": 5.0849429254275714e-05, + "loss": 0.2691, + "step": 41590 + }, + { + "epoch": 1.8491354402809264, + "grad_norm": 0.16959746181964874, + "learning_rate": 5.0828763792356804e-05, + "loss": 0.2671, + "step": 41600 + }, + { + "epoch": 1.8495799439925324, + "grad_norm": 0.1590961068868637, + "learning_rate": 5.080809818882483e-05, + "loss": 0.2695, + "step": 41610 + }, + { + "epoch": 1.8500244477041383, + "grad_norm": 0.1709538996219635, + "learning_rate": 5.078743244721097e-05, + "loss": 0.2701, + "step": 41620 + }, + { + "epoch": 1.8504689514157442, + "grad_norm": 0.15788322687149048, + "learning_rate": 5.0766766571046455e-05, + "loss": 0.2686, + "step": 41630 + }, + { + "epoch": 1.8509134551273503, + "grad_norm": 0.1852489560842514, + "learning_rate": 5.07461005638625e-05, + "loss": 0.2709, + "step": 41640 + }, + { + "epoch": 1.8513579588389564, + "grad_norm": 0.17345015704631805, + "learning_rate": 5.072543442919037e-05, + "loss": 0.2694, + "step": 41650 + }, + { + "epoch": 1.8518024625505622, + "grad_norm": 0.17311282455921173, + "learning_rate": 5.070476817056132e-05, + "loss": 0.2696, + "step": 41660 + }, + { + "epoch": 1.8522469662621683, + "grad_norm": 0.17826738953590393, + "learning_rate": 5.068410179150668e-05, + "loss": 0.2673, + "step": 41670 + }, + { + "epoch": 1.8526914699737742, + "grad_norm": 0.14363375306129456, + "learning_rate": 5.066343529555775e-05, + "loss": 0.269, + "step": 41680 + }, + { + "epoch": 1.8531359736853803, + "grad_norm": 0.1820768266916275, + "learning_rate": 5.064276868624587e-05, + "loss": 0.2687, + "step": 41690 + }, + { + "epoch": 1.8535804773969864, + "grad_norm": 0.17046837508678436, + "learning_rate": 5.0622101967102396e-05, + "loss": 0.2685, + "step": 41700 + }, + { + "epoch": 1.8540249811085923, + "grad_norm": 0.18023712933063507, + "learning_rate": 5.0601435141658714e-05, + "loss": 0.2667, + "step": 41710 + }, + { + "epoch": 1.8544694848201981, + "grad_norm": 0.16819073259830475, + "learning_rate": 5.058076821344622e-05, + "loss": 0.2678, + "step": 41720 + }, + { + "epoch": 1.8549139885318042, + "grad_norm": 0.19737380743026733, + "learning_rate": 5.056010118599632e-05, + "loss": 0.2691, + "step": 41730 + }, + { + "epoch": 1.8553584922434103, + "grad_norm": 0.15406909584999084, + "learning_rate": 5.053943406284044e-05, + "loss": 0.2685, + "step": 41740 + }, + { + "epoch": 1.8558029959550162, + "grad_norm": 0.18010462820529938, + "learning_rate": 5.051876684751006e-05, + "loss": 0.2682, + "step": 41750 + }, + { + "epoch": 1.856247499666622, + "grad_norm": 0.1806551218032837, + "learning_rate": 5.0498099543536584e-05, + "loss": 0.2706, + "step": 41760 + }, + { + "epoch": 1.8566920033782282, + "grad_norm": 0.15465225279331207, + "learning_rate": 5.047743215445152e-05, + "loss": 0.2663, + "step": 41770 + }, + { + "epoch": 1.8571365070898342, + "grad_norm": 0.1696760654449463, + "learning_rate": 5.045676468378637e-05, + "loss": 0.2692, + "step": 41780 + }, + { + "epoch": 1.8575810108014403, + "grad_norm": 0.18754571676254272, + "learning_rate": 5.0436097135072626e-05, + "loss": 0.2704, + "step": 41790 + }, + { + "epoch": 1.8580255145130462, + "grad_norm": 0.20593594014644623, + "learning_rate": 5.041542951184181e-05, + "loss": 0.2675, + "step": 41800 + }, + { + "epoch": 1.858470018224652, + "grad_norm": 0.18006989359855652, + "learning_rate": 5.039476181762545e-05, + "loss": 0.27, + "step": 41810 + }, + { + "epoch": 1.8589145219362582, + "grad_norm": 0.1785564422607422, + "learning_rate": 5.037409405595508e-05, + "loss": 0.2689, + "step": 41820 + }, + { + "epoch": 1.8593590256478643, + "grad_norm": 0.19221314787864685, + "learning_rate": 5.035342623036229e-05, + "loss": 0.2689, + "step": 41830 + }, + { + "epoch": 1.8598035293594701, + "grad_norm": 0.18709354102611542, + "learning_rate": 5.033275834437862e-05, + "loss": 0.2714, + "step": 41840 + }, + { + "epoch": 1.860248033071076, + "grad_norm": 0.20733048021793365, + "learning_rate": 5.031209040153564e-05, + "loss": 0.2665, + "step": 41850 + }, + { + "epoch": 1.860692536782682, + "grad_norm": 0.17655102908611298, + "learning_rate": 5.0291422405364955e-05, + "loss": 0.2664, + "step": 41860 + }, + { + "epoch": 1.8611370404942882, + "grad_norm": 0.15775494277477264, + "learning_rate": 5.0270754359398133e-05, + "loss": 0.2709, + "step": 41870 + }, + { + "epoch": 1.8615815442058943, + "grad_norm": 0.1900552362203598, + "learning_rate": 5.025008626716682e-05, + "loss": 0.2706, + "step": 41880 + }, + { + "epoch": 1.8620260479175001, + "grad_norm": 0.13947147130966187, + "learning_rate": 5.0229418132202585e-05, + "loss": 0.2691, + "step": 41890 + }, + { + "epoch": 1.862470551629106, + "grad_norm": 0.16437223553657532, + "learning_rate": 5.020874995803707e-05, + "loss": 0.2716, + "step": 41900 + }, + { + "epoch": 1.8629150553407121, + "grad_norm": 0.16293495893478394, + "learning_rate": 5.01880817482019e-05, + "loss": 0.2694, + "step": 41910 + }, + { + "epoch": 1.8633595590523182, + "grad_norm": 0.16374270617961884, + "learning_rate": 5.01674135062287e-05, + "loss": 0.2695, + "step": 41920 + }, + { + "epoch": 1.863804062763924, + "grad_norm": 0.1880612075328827, + "learning_rate": 5.0146745235649115e-05, + "loss": 0.2707, + "step": 41930 + }, + { + "epoch": 1.86424856647553, + "grad_norm": 0.18206718564033508, + "learning_rate": 5.0126076939994795e-05, + "loss": 0.2684, + "step": 41940 + }, + { + "epoch": 1.864693070187136, + "grad_norm": 0.16541598737239838, + "learning_rate": 5.010540862279736e-05, + "loss": 0.2683, + "step": 41950 + }, + { + "epoch": 1.8651375738987421, + "grad_norm": 0.15704454481601715, + "learning_rate": 5.008474028758846e-05, + "loss": 0.2673, + "step": 41960 + }, + { + "epoch": 1.865582077610348, + "grad_norm": 0.1732553392648697, + "learning_rate": 5.0064071937899784e-05, + "loss": 0.2694, + "step": 41970 + }, + { + "epoch": 1.866026581321954, + "grad_norm": 0.1561509072780609, + "learning_rate": 5.004340357726296e-05, + "loss": 0.2704, + "step": 41980 + }, + { + "epoch": 1.86647108503356, + "grad_norm": 0.15797972679138184, + "learning_rate": 5.002273520920965e-05, + "loss": 0.2647, + "step": 41990 + }, + { + "epoch": 1.866915588745166, + "grad_norm": 0.1455693393945694, + "learning_rate": 5.000206683727151e-05, + "loss": 0.2667, + "step": 42000 + }, + { + "epoch": 1.8673600924567721, + "grad_norm": 0.15300500392913818, + "learning_rate": 4.998139846498021e-05, + "loss": 0.2677, + "step": 42010 + }, + { + "epoch": 1.867804596168378, + "grad_norm": 0.1690310686826706, + "learning_rate": 4.9960730095867405e-05, + "loss": 0.2706, + "step": 42020 + }, + { + "epoch": 1.8682490998799839, + "grad_norm": 0.17974711954593658, + "learning_rate": 4.9940061733464755e-05, + "loss": 0.2702, + "step": 42030 + }, + { + "epoch": 1.86869360359159, + "grad_norm": 0.2023264467716217, + "learning_rate": 4.991939338130392e-05, + "loss": 0.271, + "step": 42040 + }, + { + "epoch": 1.869138107303196, + "grad_norm": 0.1880364716053009, + "learning_rate": 4.989872504291653e-05, + "loss": 0.2668, + "step": 42050 + }, + { + "epoch": 1.869582611014802, + "grad_norm": 0.1697692573070526, + "learning_rate": 4.9878056721834273e-05, + "loss": 0.2665, + "step": 42060 + }, + { + "epoch": 1.8700271147264078, + "grad_norm": 0.1706308275461197, + "learning_rate": 4.98573884215888e-05, + "loss": 0.2694, + "step": 42070 + }, + { + "epoch": 1.870471618438014, + "grad_norm": 0.1765649914741516, + "learning_rate": 4.9836720145711715e-05, + "loss": 0.2698, + "step": 42080 + }, + { + "epoch": 1.87091612214962, + "grad_norm": 0.19290174543857574, + "learning_rate": 4.9816051897734725e-05, + "loss": 0.2677, + "step": 42090 + }, + { + "epoch": 1.871360625861226, + "grad_norm": 0.1848367154598236, + "learning_rate": 4.979538368118941e-05, + "loss": 0.268, + "step": 42100 + }, + { + "epoch": 1.871805129572832, + "grad_norm": 0.15484017133712769, + "learning_rate": 4.9774715499607446e-05, + "loss": 0.2664, + "step": 42110 + }, + { + "epoch": 1.8722496332844378, + "grad_norm": 0.15579764544963837, + "learning_rate": 4.9754047356520416e-05, + "loss": 0.269, + "step": 42120 + }, + { + "epoch": 1.872694136996044, + "grad_norm": 0.1733039766550064, + "learning_rate": 4.973337925545999e-05, + "loss": 0.2679, + "step": 42130 + }, + { + "epoch": 1.87313864070765, + "grad_norm": 0.15354834496974945, + "learning_rate": 4.9712711199957746e-05, + "loss": 0.2675, + "step": 42140 + }, + { + "epoch": 1.8735831444192559, + "grad_norm": 0.25291192531585693, + "learning_rate": 4.969204319354531e-05, + "loss": 0.2713, + "step": 42150 + }, + { + "epoch": 1.8740276481308618, + "grad_norm": 0.17581447958946228, + "learning_rate": 4.9671375239754267e-05, + "loss": 0.2677, + "step": 42160 + }, + { + "epoch": 1.8744721518424678, + "grad_norm": 0.18519851565361023, + "learning_rate": 4.9650707342116195e-05, + "loss": 0.271, + "step": 42170 + }, + { + "epoch": 1.874916655554074, + "grad_norm": 0.18755747377872467, + "learning_rate": 4.96300395041627e-05, + "loss": 0.2709, + "step": 42180 + }, + { + "epoch": 1.87536115926568, + "grad_norm": 0.18724936246871948, + "learning_rate": 4.960937172942532e-05, + "loss": 0.2707, + "step": 42190 + }, + { + "epoch": 1.875805662977286, + "grad_norm": 0.21075987815856934, + "learning_rate": 4.958870402143565e-05, + "loss": 0.2697, + "step": 42200 + }, + { + "epoch": 1.8762501666888918, + "grad_norm": 0.17207559943199158, + "learning_rate": 4.9568036383725186e-05, + "loss": 0.2692, + "step": 42210 + }, + { + "epoch": 1.8766946704004979, + "grad_norm": 0.1493176966905594, + "learning_rate": 4.9547368819825516e-05, + "loss": 0.2657, + "step": 42220 + }, + { + "epoch": 1.877139174112104, + "grad_norm": 0.14918364584445953, + "learning_rate": 4.952670133326812e-05, + "loss": 0.2687, + "step": 42230 + }, + { + "epoch": 1.8775836778237098, + "grad_norm": 0.15746961534023285, + "learning_rate": 4.950603392758453e-05, + "loss": 0.2692, + "step": 42240 + }, + { + "epoch": 1.8780281815353157, + "grad_norm": 0.1839819848537445, + "learning_rate": 4.948536660630621e-05, + "loss": 0.2693, + "step": 42250 + }, + { + "epoch": 1.8784726852469218, + "grad_norm": 0.17825838923454285, + "learning_rate": 4.9464699372964685e-05, + "loss": 0.2675, + "step": 42260 + }, + { + "epoch": 1.8789171889585279, + "grad_norm": 0.16189177334308624, + "learning_rate": 4.9444032231091395e-05, + "loss": 0.2665, + "step": 42270 + }, + { + "epoch": 1.8793616926701338, + "grad_norm": 0.16237910091876984, + "learning_rate": 4.9423365184217776e-05, + "loss": 0.2714, + "step": 42280 + }, + { + "epoch": 1.8798061963817396, + "grad_norm": 0.18336281180381775, + "learning_rate": 4.940269823587529e-05, + "loss": 0.2689, + "step": 42290 + }, + { + "epoch": 1.8802507000933457, + "grad_norm": 0.17786960303783417, + "learning_rate": 4.938203138959533e-05, + "loss": 0.2677, + "step": 42300 + }, + { + "epoch": 1.8806952038049518, + "grad_norm": 0.18385642766952515, + "learning_rate": 4.9361364648909325e-05, + "loss": 0.2666, + "step": 42310 + }, + { + "epoch": 1.881139707516558, + "grad_norm": 0.16237401962280273, + "learning_rate": 4.9340698017348605e-05, + "loss": 0.2674, + "step": 42320 + }, + { + "epoch": 1.8815842112281638, + "grad_norm": 0.1914304941892624, + "learning_rate": 4.932003149844458e-05, + "loss": 0.2707, + "step": 42330 + }, + { + "epoch": 1.8820287149397696, + "grad_norm": 0.14047037065029144, + "learning_rate": 4.929936509572857e-05, + "loss": 0.2691, + "step": 42340 + }, + { + "epoch": 1.8824732186513757, + "grad_norm": 0.1794261485338211, + "learning_rate": 4.927869881273191e-05, + "loss": 0.2665, + "step": 42350 + }, + { + "epoch": 1.8829177223629818, + "grad_norm": 0.1652449667453766, + "learning_rate": 4.9258032652985894e-05, + "loss": 0.2675, + "step": 42360 + }, + { + "epoch": 1.8833622260745877, + "grad_norm": 0.14715929329395294, + "learning_rate": 4.9237366620021786e-05, + "loss": 0.2673, + "step": 42370 + }, + { + "epoch": 1.8838067297861936, + "grad_norm": 0.15877148509025574, + "learning_rate": 4.921670071737089e-05, + "loss": 0.2717, + "step": 42380 + }, + { + "epoch": 1.8842512334977997, + "grad_norm": 0.16689488291740417, + "learning_rate": 4.91960349485644e-05, + "loss": 0.2677, + "step": 42390 + }, + { + "epoch": 1.8846957372094058, + "grad_norm": 0.17398320138454437, + "learning_rate": 4.9175369317133566e-05, + "loss": 0.2665, + "step": 42400 + }, + { + "epoch": 1.8851402409210118, + "grad_norm": 0.14171820878982544, + "learning_rate": 4.915470382660953e-05, + "loss": 0.269, + "step": 42410 + }, + { + "epoch": 1.8855847446326177, + "grad_norm": 0.161507248878479, + "learning_rate": 4.9134038480523524e-05, + "loss": 0.2666, + "step": 42420 + }, + { + "epoch": 1.8860292483442236, + "grad_norm": 0.16316574811935425, + "learning_rate": 4.911337328240664e-05, + "loss": 0.2693, + "step": 42430 + }, + { + "epoch": 1.8864737520558297, + "grad_norm": 0.16098839044570923, + "learning_rate": 4.909270823579003e-05, + "loss": 0.2695, + "step": 42440 + }, + { + "epoch": 1.8869182557674358, + "grad_norm": 0.1635853350162506, + "learning_rate": 4.907204334420476e-05, + "loss": 0.2673, + "step": 42450 + }, + { + "epoch": 1.8873627594790416, + "grad_norm": 0.2067151665687561, + "learning_rate": 4.9051378611181884e-05, + "loss": 0.2675, + "step": 42460 + }, + { + "epoch": 1.8878072631906475, + "grad_norm": 0.17338800430297852, + "learning_rate": 4.9030714040252486e-05, + "loss": 0.2657, + "step": 42470 + }, + { + "epoch": 1.8882517669022536, + "grad_norm": 0.19933518767356873, + "learning_rate": 4.901004963494752e-05, + "loss": 0.2716, + "step": 42480 + }, + { + "epoch": 1.8886962706138597, + "grad_norm": 0.16183778643608093, + "learning_rate": 4.898938539879802e-05, + "loss": 0.2671, + "step": 42490 + }, + { + "epoch": 1.8891407743254656, + "grad_norm": 0.18069986999034882, + "learning_rate": 4.8968721335334904e-05, + "loss": 0.2684, + "step": 42500 + }, + { + "epoch": 1.8895852780370717, + "grad_norm": 0.1521652638912201, + "learning_rate": 4.894805744808912e-05, + "loss": 0.2668, + "step": 42510 + }, + { + "epoch": 1.8900297817486775, + "grad_norm": 0.1801007241010666, + "learning_rate": 4.8927393740591524e-05, + "loss": 0.2677, + "step": 42520 + }, + { + "epoch": 1.8904742854602836, + "grad_norm": 0.18591246008872986, + "learning_rate": 4.890673021637302e-05, + "loss": 0.2662, + "step": 42530 + }, + { + "epoch": 1.8909187891718897, + "grad_norm": 0.1877308487892151, + "learning_rate": 4.888606687896442e-05, + "loss": 0.2682, + "step": 42540 + }, + { + "epoch": 1.8913632928834956, + "grad_norm": 0.17111946642398834, + "learning_rate": 4.886540373189652e-05, + "loss": 0.2664, + "step": 42550 + }, + { + "epoch": 1.8918077965951015, + "grad_norm": 0.16158297657966614, + "learning_rate": 4.88447407787001e-05, + "loss": 0.2672, + "step": 42560 + }, + { + "epoch": 1.8922523003067075, + "grad_norm": 0.21404148638248444, + "learning_rate": 4.8824078022905864e-05, + "loss": 0.2668, + "step": 42570 + }, + { + "epoch": 1.8926968040183136, + "grad_norm": 0.17329774796962738, + "learning_rate": 4.880341546804455e-05, + "loss": 0.2669, + "step": 42580 + }, + { + "epoch": 1.8931413077299195, + "grad_norm": 0.15050868690013885, + "learning_rate": 4.878275311764679e-05, + "loss": 0.2671, + "step": 42590 + }, + { + "epoch": 1.8935858114415254, + "grad_norm": 0.18222616612911224, + "learning_rate": 4.8762090975243254e-05, + "loss": 0.2673, + "step": 42600 + }, + { + "epoch": 1.8940303151531315, + "grad_norm": 0.20553933084011078, + "learning_rate": 4.874142904436448e-05, + "loss": 0.2692, + "step": 42610 + }, + { + "epoch": 1.8944748188647376, + "grad_norm": 0.19723308086395264, + "learning_rate": 4.8720767328541094e-05, + "loss": 0.2686, + "step": 42620 + }, + { + "epoch": 1.8949193225763437, + "grad_norm": 0.15647748112678528, + "learning_rate": 4.870010583130358e-05, + "loss": 0.2672, + "step": 42630 + }, + { + "epoch": 1.8953638262879495, + "grad_norm": 0.17033588886260986, + "learning_rate": 4.867944455618241e-05, + "loss": 0.2675, + "step": 42640 + }, + { + "epoch": 1.8958083299995554, + "grad_norm": 0.18325451016426086, + "learning_rate": 4.865878350670807e-05, + "loss": 0.2693, + "step": 42650 + }, + { + "epoch": 1.8962528337111615, + "grad_norm": 0.15770381689071655, + "learning_rate": 4.8638122686410914e-05, + "loss": 0.2688, + "step": 42660 + }, + { + "epoch": 1.8966973374227676, + "grad_norm": 0.1730765998363495, + "learning_rate": 4.861746209882137e-05, + "loss": 0.2687, + "step": 42670 + }, + { + "epoch": 1.8971418411343735, + "grad_norm": 0.20821891725063324, + "learning_rate": 4.859680174746972e-05, + "loss": 0.2683, + "step": 42680 + }, + { + "epoch": 1.8975863448459793, + "grad_norm": 0.18986444175243378, + "learning_rate": 4.857614163588629e-05, + "loss": 0.2678, + "step": 42690 + }, + { + "epoch": 1.8980308485575854, + "grad_norm": 0.1797534078359604, + "learning_rate": 4.855548176760131e-05, + "loss": 0.2699, + "step": 42700 + }, + { + "epoch": 1.8984753522691915, + "grad_norm": 0.21393898129463196, + "learning_rate": 4.8534822146144986e-05, + "loss": 0.2695, + "step": 42710 + }, + { + "epoch": 1.8989198559807976, + "grad_norm": 0.16644059121608734, + "learning_rate": 4.851416277504749e-05, + "loss": 0.2701, + "step": 42720 + }, + { + "epoch": 1.8993643596924035, + "grad_norm": 0.2130965292453766, + "learning_rate": 4.8493503657838923e-05, + "loss": 0.2661, + "step": 42730 + }, + { + "epoch": 1.8998088634040093, + "grad_norm": 0.1655196100473404, + "learning_rate": 4.84728447980494e-05, + "loss": 0.2667, + "step": 42740 + }, + { + "epoch": 1.9002533671156154, + "grad_norm": 0.13913047313690186, + "learning_rate": 4.8452186199208914e-05, + "loss": 0.2686, + "step": 42750 + }, + { + "epoch": 1.9006978708272215, + "grad_norm": 0.14545272290706635, + "learning_rate": 4.843152786484749e-05, + "loss": 0.268, + "step": 42760 + }, + { + "epoch": 1.9011423745388274, + "grad_norm": 0.14947286248207092, + "learning_rate": 4.8410869798495035e-05, + "loss": 0.2677, + "step": 42770 + }, + { + "epoch": 1.9015868782504333, + "grad_norm": 0.1519133597612381, + "learning_rate": 4.8390212003681486e-05, + "loss": 0.2685, + "step": 42780 + }, + { + "epoch": 1.9020313819620394, + "grad_norm": 0.19632023572921753, + "learning_rate": 4.836955448393667e-05, + "loss": 0.2694, + "step": 42790 + }, + { + "epoch": 1.9024758856736455, + "grad_norm": 0.17237669229507446, + "learning_rate": 4.8348897242790406e-05, + "loss": 0.2684, + "step": 42800 + }, + { + "epoch": 1.9029203893852513, + "grad_norm": 0.15425142645835876, + "learning_rate": 4.832824028377243e-05, + "loss": 0.2686, + "step": 42810 + }, + { + "epoch": 1.9033648930968574, + "grad_norm": 0.17963336408138275, + "learning_rate": 4.830758361041249e-05, + "loss": 0.2676, + "step": 42820 + }, + { + "epoch": 1.9038093968084633, + "grad_norm": 0.20585545897483826, + "learning_rate": 4.828692722624022e-05, + "loss": 0.2678, + "step": 42830 + }, + { + "epoch": 1.9042539005200694, + "grad_norm": 0.16308259963989258, + "learning_rate": 4.826627113478522e-05, + "loss": 0.2679, + "step": 42840 + }, + { + "epoch": 1.9046984042316755, + "grad_norm": 0.16714277863502502, + "learning_rate": 4.824561533957708e-05, + "loss": 0.2676, + "step": 42850 + }, + { + "epoch": 1.9051429079432813, + "grad_norm": 0.16811715066432953, + "learning_rate": 4.822495984414527e-05, + "loss": 0.2686, + "step": 42860 + }, + { + "epoch": 1.9055874116548872, + "grad_norm": 0.14795762300491333, + "learning_rate": 4.8204304652019304e-05, + "loss": 0.2677, + "step": 42870 + }, + { + "epoch": 1.9060319153664933, + "grad_norm": 0.1767570823431015, + "learning_rate": 4.8183649766728537e-05, + "loss": 0.2704, + "step": 42880 + }, + { + "epoch": 1.9064764190780994, + "grad_norm": 0.1734824925661087, + "learning_rate": 4.816299519180237e-05, + "loss": 0.263, + "step": 42890 + }, + { + "epoch": 1.9069209227897053, + "grad_norm": 0.1591903418302536, + "learning_rate": 4.8142340930770066e-05, + "loss": 0.2673, + "step": 42900 + }, + { + "epoch": 1.9073654265013111, + "grad_norm": 0.17874839901924133, + "learning_rate": 4.81216869871609e-05, + "loss": 0.2638, + "step": 42910 + }, + { + "epoch": 1.9078099302129172, + "grad_norm": 0.21075953543186188, + "learning_rate": 4.8101033364504064e-05, + "loss": 0.2671, + "step": 42920 + }, + { + "epoch": 1.9082544339245233, + "grad_norm": 0.169281467795372, + "learning_rate": 4.808038006632866e-05, + "loss": 0.2659, + "step": 42930 + }, + { + "epoch": 1.9086989376361294, + "grad_norm": 0.1745496392250061, + "learning_rate": 4.805972709616383e-05, + "loss": 0.2687, + "step": 42940 + }, + { + "epoch": 1.9091434413477353, + "grad_norm": 0.18321764469146729, + "learning_rate": 4.8039074457538556e-05, + "loss": 0.2655, + "step": 42950 + }, + { + "epoch": 1.9095879450593412, + "grad_norm": 0.15406882762908936, + "learning_rate": 4.801842215398184e-05, + "loss": 0.2668, + "step": 42960 + }, + { + "epoch": 1.9100324487709472, + "grad_norm": 0.17014172673225403, + "learning_rate": 4.799777018902256e-05, + "loss": 0.2659, + "step": 42970 + }, + { + "epoch": 1.9104769524825533, + "grad_norm": 0.15372276306152344, + "learning_rate": 4.797711856618961e-05, + "loss": 0.269, + "step": 42980 + }, + { + "epoch": 1.9109214561941592, + "grad_norm": 0.16014721989631653, + "learning_rate": 4.7956467289011765e-05, + "loss": 0.2663, + "step": 42990 + }, + { + "epoch": 1.911365959905765, + "grad_norm": 0.1493186503648758, + "learning_rate": 4.793581636101778e-05, + "loss": 0.2686, + "step": 43000 + }, + { + "epoch": 1.9118104636173712, + "grad_norm": 0.16326089203357697, + "learning_rate": 4.7915165785736326e-05, + "loss": 0.2679, + "step": 43010 + }, + { + "epoch": 1.9122549673289773, + "grad_norm": 0.17414048314094543, + "learning_rate": 4.789451556669599e-05, + "loss": 0.2678, + "step": 43020 + }, + { + "epoch": 1.9126994710405834, + "grad_norm": 0.1858467161655426, + "learning_rate": 4.7873865707425384e-05, + "loss": 0.2668, + "step": 43030 + }, + { + "epoch": 1.9131439747521892, + "grad_norm": 0.20612090826034546, + "learning_rate": 4.7853216211452975e-05, + "loss": 0.2723, + "step": 43040 + }, + { + "epoch": 1.913588478463795, + "grad_norm": 0.17120467126369476, + "learning_rate": 4.78325670823072e-05, + "loss": 0.266, + "step": 43050 + }, + { + "epoch": 1.9140329821754012, + "grad_norm": 0.20417262613773346, + "learning_rate": 4.781191832351641e-05, + "loss": 0.2644, + "step": 43060 + }, + { + "epoch": 1.9144774858870073, + "grad_norm": 0.18853768706321716, + "learning_rate": 4.7791269938608965e-05, + "loss": 0.2648, + "step": 43070 + }, + { + "epoch": 1.9149219895986132, + "grad_norm": 0.16614322364330292, + "learning_rate": 4.777062193111305e-05, + "loss": 0.2671, + "step": 43080 + }, + { + "epoch": 1.915366493310219, + "grad_norm": 0.1629277914762497, + "learning_rate": 4.77499743045569e-05, + "loss": 0.2692, + "step": 43090 + }, + { + "epoch": 1.9158109970218251, + "grad_norm": 0.19883257150650024, + "learning_rate": 4.7729327062468596e-05, + "loss": 0.2661, + "step": 43100 + }, + { + "epoch": 1.9162555007334312, + "grad_norm": 0.15603519976139069, + "learning_rate": 4.770868020837619e-05, + "loss": 0.2682, + "step": 43110 + }, + { + "epoch": 1.916700004445037, + "grad_norm": 0.16674967110157013, + "learning_rate": 4.768803374580768e-05, + "loss": 0.2685, + "step": 43120 + }, + { + "epoch": 1.917144508156643, + "grad_norm": 0.17558921873569489, + "learning_rate": 4.766738767829094e-05, + "loss": 0.2697, + "step": 43130 + }, + { + "epoch": 1.917589011868249, + "grad_norm": 0.1855425238609314, + "learning_rate": 4.764674200935388e-05, + "loss": 0.2686, + "step": 43140 + }, + { + "epoch": 1.9180335155798551, + "grad_norm": 0.16429315507411957, + "learning_rate": 4.762609674252424e-05, + "loss": 0.2665, + "step": 43150 + }, + { + "epoch": 1.9184780192914612, + "grad_norm": 0.19376298785209656, + "learning_rate": 4.760545188132974e-05, + "loss": 0.2664, + "step": 43160 + }, + { + "epoch": 1.918922523003067, + "grad_norm": 0.15324905514717102, + "learning_rate": 4.758480742929801e-05, + "loss": 0.2706, + "step": 43170 + }, + { + "epoch": 1.919367026714673, + "grad_norm": 0.17844116687774658, + "learning_rate": 4.756416338995664e-05, + "loss": 0.266, + "step": 43180 + }, + { + "epoch": 1.919811530426279, + "grad_norm": 0.18272168934345245, + "learning_rate": 4.7543519766833145e-05, + "loss": 0.2661, + "step": 43190 + }, + { + "epoch": 1.9202560341378851, + "grad_norm": 0.1726585477590561, + "learning_rate": 4.752287656345492e-05, + "loss": 0.2689, + "step": 43200 + }, + { + "epoch": 1.920700537849491, + "grad_norm": 0.1934775710105896, + "learning_rate": 4.7502233783349355e-05, + "loss": 0.2691, + "step": 43210 + }, + { + "epoch": 1.9211450415610969, + "grad_norm": 0.20436939597129822, + "learning_rate": 4.7481591430043694e-05, + "loss": 0.2668, + "step": 43220 + }, + { + "epoch": 1.921589545272703, + "grad_norm": 0.16639918088912964, + "learning_rate": 4.746094950706521e-05, + "loss": 0.2685, + "step": 43230 + }, + { + "epoch": 1.922034048984309, + "grad_norm": 0.15962105989456177, + "learning_rate": 4.744030801794099e-05, + "loss": 0.2656, + "step": 43240 + }, + { + "epoch": 1.9224785526959152, + "grad_norm": 0.15894219279289246, + "learning_rate": 4.741966696619813e-05, + "loss": 0.2683, + "step": 43250 + }, + { + "epoch": 1.922923056407521, + "grad_norm": 0.21187688410282135, + "learning_rate": 4.739902635536361e-05, + "loss": 0.2683, + "step": 43260 + }, + { + "epoch": 1.923367560119127, + "grad_norm": 0.17784829437732697, + "learning_rate": 4.737838618896436e-05, + "loss": 0.2674, + "step": 43270 + }, + { + "epoch": 1.923812063830733, + "grad_norm": 0.19455471634864807, + "learning_rate": 4.7357746470527203e-05, + "loss": 0.2695, + "step": 43280 + }, + { + "epoch": 1.924256567542339, + "grad_norm": 0.19174623489379883, + "learning_rate": 4.73371072035789e-05, + "loss": 0.2699, + "step": 43290 + }, + { + "epoch": 1.924701071253945, + "grad_norm": 0.16154678165912628, + "learning_rate": 4.731646839164616e-05, + "loss": 0.267, + "step": 43300 + }, + { + "epoch": 1.9251455749655508, + "grad_norm": 0.17766566574573517, + "learning_rate": 4.729583003825556e-05, + "loss": 0.2677, + "step": 43310 + }, + { + "epoch": 1.925590078677157, + "grad_norm": 0.164158433675766, + "learning_rate": 4.727519214693367e-05, + "loss": 0.2677, + "step": 43320 + }, + { + "epoch": 1.926034582388763, + "grad_norm": 0.18326881527900696, + "learning_rate": 4.725455472120689e-05, + "loss": 0.2678, + "step": 43330 + }, + { + "epoch": 1.9264790861003689, + "grad_norm": 0.1772029995918274, + "learning_rate": 4.723391776460164e-05, + "loss": 0.2679, + "step": 43340 + }, + { + "epoch": 1.926923589811975, + "grad_norm": 0.1643243134021759, + "learning_rate": 4.7213281280644186e-05, + "loss": 0.2674, + "step": 43350 + }, + { + "epoch": 1.9273680935235808, + "grad_norm": 0.1466251015663147, + "learning_rate": 4.719264527286075e-05, + "loss": 0.2642, + "step": 43360 + }, + { + "epoch": 1.927812597235187, + "grad_norm": 0.1497475504875183, + "learning_rate": 4.717200974477744e-05, + "loss": 0.2677, + "step": 43370 + }, + { + "epoch": 1.928257100946793, + "grad_norm": 0.14198116958141327, + "learning_rate": 4.715137469992034e-05, + "loss": 0.2668, + "step": 43380 + }, + { + "epoch": 1.928701604658399, + "grad_norm": 0.15913712978363037, + "learning_rate": 4.713074014181539e-05, + "loss": 0.2684, + "step": 43390 + }, + { + "epoch": 1.9291461083700048, + "grad_norm": 0.19380132853984833, + "learning_rate": 4.711010607398846e-05, + "loss": 0.2674, + "step": 43400 + }, + { + "epoch": 1.9295906120816109, + "grad_norm": 0.16491565108299255, + "learning_rate": 4.708947249996538e-05, + "loss": 0.2646, + "step": 43410 + }, + { + "epoch": 1.930035115793217, + "grad_norm": 0.14545516669750214, + "learning_rate": 4.706883942327183e-05, + "loss": 0.2694, + "step": 43420 + }, + { + "epoch": 1.9304796195048228, + "grad_norm": 0.15470191836357117, + "learning_rate": 4.704820684743347e-05, + "loss": 0.2688, + "step": 43430 + }, + { + "epoch": 1.9309241232164287, + "grad_norm": 0.1572125107049942, + "learning_rate": 4.702757477597581e-05, + "loss": 0.268, + "step": 43440 + }, + { + "epoch": 1.9313686269280348, + "grad_norm": 0.1669055074453354, + "learning_rate": 4.700694321242435e-05, + "loss": 0.2726, + "step": 43450 + }, + { + "epoch": 1.9318131306396409, + "grad_norm": 0.18335632979869843, + "learning_rate": 4.698631216030441e-05, + "loss": 0.266, + "step": 43460 + }, + { + "epoch": 1.932257634351247, + "grad_norm": 0.17314282059669495, + "learning_rate": 4.6965681623141314e-05, + "loss": 0.2697, + "step": 43470 + }, + { + "epoch": 1.9327021380628528, + "grad_norm": 0.16119930148124695, + "learning_rate": 4.694505160446024e-05, + "loss": 0.269, + "step": 43480 + }, + { + "epoch": 1.9331466417744587, + "grad_norm": 0.19928516447544098, + "learning_rate": 4.692442210778628e-05, + "loss": 0.2693, + "step": 43490 + }, + { + "epoch": 1.9335911454860648, + "grad_norm": 0.20016545057296753, + "learning_rate": 4.690379313664448e-05, + "loss": 0.2686, + "step": 43500 + }, + { + "epoch": 1.934035649197671, + "grad_norm": 0.1839679628610611, + "learning_rate": 4.688316469455973e-05, + "loss": 0.2661, + "step": 43510 + }, + { + "epoch": 1.9344801529092768, + "grad_norm": 0.17355108261108398, + "learning_rate": 4.6862536785056914e-05, + "loss": 0.2698, + "step": 43520 + }, + { + "epoch": 1.9349246566208826, + "grad_norm": 0.1995788812637329, + "learning_rate": 4.684190941166073e-05, + "loss": 0.2684, + "step": 43530 + }, + { + "epoch": 1.9353691603324887, + "grad_norm": 0.1830091029405594, + "learning_rate": 4.682128257789587e-05, + "loss": 0.2692, + "step": 43540 + }, + { + "epoch": 1.9358136640440948, + "grad_norm": 0.17197033762931824, + "learning_rate": 4.680065628728687e-05, + "loss": 0.2691, + "step": 43550 + }, + { + "epoch": 1.936258167755701, + "grad_norm": 0.1939389705657959, + "learning_rate": 4.678003054335822e-05, + "loss": 0.2653, + "step": 43560 + }, + { + "epoch": 1.9367026714673068, + "grad_norm": 0.16476573050022125, + "learning_rate": 4.675940534963428e-05, + "loss": 0.2653, + "step": 43570 + }, + { + "epoch": 1.9371471751789127, + "grad_norm": 0.1915678083896637, + "learning_rate": 4.673878070963931e-05, + "loss": 0.2667, + "step": 43580 + }, + { + "epoch": 1.9375916788905188, + "grad_norm": 0.1631985753774643, + "learning_rate": 4.671815662689756e-05, + "loss": 0.2663, + "step": 43590 + }, + { + "epoch": 1.9380361826021248, + "grad_norm": 0.14390191435813904, + "learning_rate": 4.669753310493306e-05, + "loss": 0.2655, + "step": 43600 + }, + { + "epoch": 1.9384806863137307, + "grad_norm": 0.17811840772628784, + "learning_rate": 4.6676910147269845e-05, + "loss": 0.2695, + "step": 43610 + }, + { + "epoch": 1.9389251900253366, + "grad_norm": 0.19132032990455627, + "learning_rate": 4.665628775743177e-05, + "loss": 0.2671, + "step": 43620 + }, + { + "epoch": 1.9393696937369427, + "grad_norm": 0.1997167319059372, + "learning_rate": 4.66356659389427e-05, + "loss": 0.2675, + "step": 43630 + }, + { + "epoch": 1.9398141974485488, + "grad_norm": 0.16395708918571472, + "learning_rate": 4.661504469532629e-05, + "loss": 0.2645, + "step": 43640 + }, + { + "epoch": 1.9402587011601546, + "grad_norm": 0.1812075674533844, + "learning_rate": 4.659442403010617e-05, + "loss": 0.2685, + "step": 43650 + }, + { + "epoch": 1.9407032048717607, + "grad_norm": 0.17410948872566223, + "learning_rate": 4.6573803946805845e-05, + "loss": 0.2684, + "step": 43660 + }, + { + "epoch": 1.9411477085833666, + "grad_norm": 0.1776179075241089, + "learning_rate": 4.6553184448948695e-05, + "loss": 0.2649, + "step": 43670 + }, + { + "epoch": 1.9415922122949727, + "grad_norm": 0.18624331057071686, + "learning_rate": 4.653256554005807e-05, + "loss": 0.2676, + "step": 43680 + }, + { + "epoch": 1.9420367160065788, + "grad_norm": 0.18344372510910034, + "learning_rate": 4.6511947223657145e-05, + "loss": 0.2695, + "step": 43690 + }, + { + "epoch": 1.9424812197181847, + "grad_norm": 0.15081310272216797, + "learning_rate": 4.649132950326906e-05, + "loss": 0.2676, + "step": 43700 + }, + { + "epoch": 1.9429257234297905, + "grad_norm": 0.16453540325164795, + "learning_rate": 4.647071238241679e-05, + "loss": 0.2675, + "step": 43710 + }, + { + "epoch": 1.9433702271413966, + "grad_norm": 0.18497174978256226, + "learning_rate": 4.6450095864623264e-05, + "loss": 0.2678, + "step": 43720 + }, + { + "epoch": 1.9438147308530027, + "grad_norm": 0.14939726889133453, + "learning_rate": 4.642947995341123e-05, + "loss": 0.2682, + "step": 43730 + }, + { + "epoch": 1.9442592345646086, + "grad_norm": 0.17351068556308746, + "learning_rate": 4.640886465230345e-05, + "loss": 0.2707, + "step": 43740 + }, + { + "epoch": 1.9447037382762145, + "grad_norm": 0.16187416017055511, + "learning_rate": 4.6388249964822485e-05, + "loss": 0.2661, + "step": 43750 + }, + { + "epoch": 1.9451482419878205, + "grad_norm": 0.164433091878891, + "learning_rate": 4.63676358944908e-05, + "loss": 0.2672, + "step": 43760 + }, + { + "epoch": 1.9455927456994266, + "grad_norm": 0.1614636480808258, + "learning_rate": 4.634702244483081e-05, + "loss": 0.2678, + "step": 43770 + }, + { + "epoch": 1.9460372494110327, + "grad_norm": 0.18647772073745728, + "learning_rate": 4.632640961936475e-05, + "loss": 0.2664, + "step": 43780 + }, + { + "epoch": 1.9464817531226386, + "grad_norm": 0.1597995162010193, + "learning_rate": 4.6305797421614835e-05, + "loss": 0.2669, + "step": 43790 + }, + { + "epoch": 1.9469262568342445, + "grad_norm": 0.1668267697095871, + "learning_rate": 4.6285185855103105e-05, + "loss": 0.2654, + "step": 43800 + }, + { + "epoch": 1.9473707605458506, + "grad_norm": 0.14935511350631714, + "learning_rate": 4.626457492335151e-05, + "loss": 0.2637, + "step": 43810 + }, + { + "epoch": 1.9478152642574567, + "grad_norm": 0.1407187283039093, + "learning_rate": 4.624396462988188e-05, + "loss": 0.2697, + "step": 43820 + }, + { + "epoch": 1.9482597679690625, + "grad_norm": 0.18950840830802917, + "learning_rate": 4.6223354978215985e-05, + "loss": 0.2639, + "step": 43830 + }, + { + "epoch": 1.9487042716806684, + "grad_norm": 0.19441823661327362, + "learning_rate": 4.620274597187544e-05, + "loss": 0.2664, + "step": 43840 + }, + { + "epoch": 1.9491487753922745, + "grad_norm": 0.15625624358654022, + "learning_rate": 4.6182137614381736e-05, + "loss": 0.2656, + "step": 43850 + }, + { + "epoch": 1.9495932791038806, + "grad_norm": 0.19727785885334015, + "learning_rate": 4.616152990925631e-05, + "loss": 0.2682, + "step": 43860 + }, + { + "epoch": 1.9500377828154867, + "grad_norm": 0.1516989767551422, + "learning_rate": 4.614092286002041e-05, + "loss": 0.2647, + "step": 43870 + }, + { + "epoch": 1.9504822865270925, + "grad_norm": 0.15734682977199554, + "learning_rate": 4.612031647019528e-05, + "loss": 0.2636, + "step": 43880 + }, + { + "epoch": 1.9509267902386984, + "grad_norm": 0.17650678753852844, + "learning_rate": 4.609971074330193e-05, + "loss": 0.2676, + "step": 43890 + }, + { + "epoch": 1.9513712939503045, + "grad_norm": 0.15474265813827515, + "learning_rate": 4.607910568286136e-05, + "loss": 0.2681, + "step": 43900 + }, + { + "epoch": 1.9518157976619106, + "grad_norm": 0.15898777544498444, + "learning_rate": 4.605850129239438e-05, + "loss": 0.2654, + "step": 43910 + }, + { + "epoch": 1.9522603013735165, + "grad_norm": 0.15985621511936188, + "learning_rate": 4.603789757542174e-05, + "loss": 0.2679, + "step": 43920 + }, + { + "epoch": 1.9527048050851223, + "grad_norm": 0.1562805026769638, + "learning_rate": 4.601729453546404e-05, + "loss": 0.269, + "step": 43930 + }, + { + "epoch": 1.9531493087967284, + "grad_norm": 0.1547333300113678, + "learning_rate": 4.599669217604177e-05, + "loss": 0.2649, + "step": 43940 + }, + { + "epoch": 1.9535938125083345, + "grad_norm": 0.16045089066028595, + "learning_rate": 4.597609050067532e-05, + "loss": 0.2696, + "step": 43950 + }, + { + "epoch": 1.9540383162199404, + "grad_norm": 0.19849132001399994, + "learning_rate": 4.595548951288495e-05, + "loss": 0.2697, + "step": 43960 + }, + { + "epoch": 1.9544828199315463, + "grad_norm": 0.16817544400691986, + "learning_rate": 4.593488921619081e-05, + "loss": 0.2688, + "step": 43970 + }, + { + "epoch": 1.9549273236431524, + "grad_norm": 0.1750713735818863, + "learning_rate": 4.591428961411289e-05, + "loss": 0.2652, + "step": 43980 + }, + { + "epoch": 1.9553718273547585, + "grad_norm": 0.16412405669689178, + "learning_rate": 4.589369071017117e-05, + "loss": 0.2678, + "step": 43990 + }, + { + "epoch": 1.9558163310663645, + "grad_norm": 0.16067244112491608, + "learning_rate": 4.587309250788538e-05, + "loss": 0.2642, + "step": 44000 + }, + { + "epoch": 1.9562608347779704, + "grad_norm": 0.1562677025794983, + "learning_rate": 4.585249501077522e-05, + "loss": 0.265, + "step": 44010 + }, + { + "epoch": 1.9567053384895763, + "grad_norm": 0.1733907014131546, + "learning_rate": 4.583189822236021e-05, + "loss": 0.2696, + "step": 44020 + }, + { + "epoch": 1.9571498422011824, + "grad_norm": 0.18752314150333405, + "learning_rate": 4.5811302146159816e-05, + "loss": 0.2642, + "step": 44030 + }, + { + "epoch": 1.9575943459127885, + "grad_norm": 0.2046699821949005, + "learning_rate": 4.579070678569332e-05, + "loss": 0.2671, + "step": 44040 + }, + { + "epoch": 1.9580388496243943, + "grad_norm": 0.22363123297691345, + "learning_rate": 4.5770112144479904e-05, + "loss": 0.2661, + "step": 44050 + }, + { + "epoch": 1.9584833533360002, + "grad_norm": 0.17823828756809235, + "learning_rate": 4.5749518226038645e-05, + "loss": 0.2663, + "step": 44060 + }, + { + "epoch": 1.9589278570476063, + "grad_norm": 0.1853550523519516, + "learning_rate": 4.572892503388845e-05, + "loss": 0.2633, + "step": 44070 + }, + { + "epoch": 1.9593723607592124, + "grad_norm": 0.1823190450668335, + "learning_rate": 4.570833257154817e-05, + "loss": 0.2685, + "step": 44080 + }, + { + "epoch": 1.9598168644708185, + "grad_norm": 0.15361779928207397, + "learning_rate": 4.568774084253646e-05, + "loss": 0.2647, + "step": 44090 + }, + { + "epoch": 1.9602613681824244, + "grad_norm": 0.18439675867557526, + "learning_rate": 4.566714985037191e-05, + "loss": 0.2659, + "step": 44100 + }, + { + "epoch": 1.9607058718940302, + "grad_norm": 0.1690114289522171, + "learning_rate": 4.564655959857295e-05, + "loss": 0.2676, + "step": 44110 + }, + { + "epoch": 1.9611503756056363, + "grad_norm": 0.18338027596473694, + "learning_rate": 4.5625970090657893e-05, + "loss": 0.2675, + "step": 44120 + }, + { + "epoch": 1.9615948793172424, + "grad_norm": 0.20785681903362274, + "learning_rate": 4.560538133014491e-05, + "loss": 0.2702, + "step": 44130 + }, + { + "epoch": 1.9620393830288483, + "grad_norm": 0.1615620255470276, + "learning_rate": 4.5584793320552055e-05, + "loss": 0.2655, + "step": 44140 + }, + { + "epoch": 1.9624838867404542, + "grad_norm": 0.14832694828510284, + "learning_rate": 4.556420606539728e-05, + "loss": 0.2656, + "step": 44150 + }, + { + "epoch": 1.9629283904520602, + "grad_norm": 0.14714542031288147, + "learning_rate": 4.554361956819836e-05, + "loss": 0.2678, + "step": 44160 + }, + { + "epoch": 1.9633728941636663, + "grad_norm": 0.14864413440227509, + "learning_rate": 4.552303383247299e-05, + "loss": 0.2655, + "step": 44170 + }, + { + "epoch": 1.9638173978752722, + "grad_norm": 0.1412007361650467, + "learning_rate": 4.5502448861738664e-05, + "loss": 0.2672, + "step": 44180 + }, + { + "epoch": 1.9642619015868783, + "grad_norm": 0.14704906940460205, + "learning_rate": 4.548186465951284e-05, + "loss": 0.2683, + "step": 44190 + }, + { + "epoch": 1.9647064052984842, + "grad_norm": 0.15577298402786255, + "learning_rate": 4.546128122931277e-05, + "loss": 0.2637, + "step": 44200 + }, + { + "epoch": 1.9651509090100903, + "grad_norm": 0.15793699026107788, + "learning_rate": 4.544069857465561e-05, + "loss": 0.2652, + "step": 44210 + }, + { + "epoch": 1.9655954127216964, + "grad_norm": 0.1788339614868164, + "learning_rate": 4.542011669905837e-05, + "loss": 0.2673, + "step": 44220 + }, + { + "epoch": 1.9660399164333022, + "grad_norm": 0.16678552329540253, + "learning_rate": 4.53995356060379e-05, + "loss": 0.2658, + "step": 44230 + }, + { + "epoch": 1.966484420144908, + "grad_norm": 0.16403284668922424, + "learning_rate": 4.5378955299110994e-05, + "loss": 0.266, + "step": 44240 + }, + { + "epoch": 1.9669289238565142, + "grad_norm": 0.1530139148235321, + "learning_rate": 4.5358375781794233e-05, + "loss": 0.2663, + "step": 44250 + }, + { + "epoch": 1.9673734275681203, + "grad_norm": 0.14844468235969543, + "learning_rate": 4.533779705760411e-05, + "loss": 0.2688, + "step": 44260 + }, + { + "epoch": 1.9678179312797262, + "grad_norm": 0.16692882776260376, + "learning_rate": 4.5317219130056934e-05, + "loss": 0.264, + "step": 44270 + }, + { + "epoch": 1.968262434991332, + "grad_norm": 0.1964273750782013, + "learning_rate": 4.5296642002668964e-05, + "loss": 0.2704, + "step": 44280 + }, + { + "epoch": 1.9687069387029381, + "grad_norm": 0.19702424108982086, + "learning_rate": 4.5276065678956216e-05, + "loss": 0.2645, + "step": 44290 + }, + { + "epoch": 1.9691514424145442, + "grad_norm": 0.1820601224899292, + "learning_rate": 4.525549016243466e-05, + "loss": 0.2703, + "step": 44300 + }, + { + "epoch": 1.9695959461261503, + "grad_norm": 0.16948756575584412, + "learning_rate": 4.523491545662008e-05, + "loss": 0.2681, + "step": 44310 + }, + { + "epoch": 1.9700404498377562, + "grad_norm": 0.1382879912853241, + "learning_rate": 4.5214341565028104e-05, + "loss": 0.2662, + "step": 44320 + }, + { + "epoch": 1.970484953549362, + "grad_norm": 0.15289205312728882, + "learning_rate": 4.519376849117428e-05, + "loss": 0.2671, + "step": 44330 + }, + { + "epoch": 1.9709294572609681, + "grad_norm": 0.14684653282165527, + "learning_rate": 4.517319623857395e-05, + "loss": 0.2647, + "step": 44340 + }, + { + "epoch": 1.9713739609725742, + "grad_norm": 0.1845719814300537, + "learning_rate": 4.5152624810742394e-05, + "loss": 0.2651, + "step": 44350 + }, + { + "epoch": 1.97181846468418, + "grad_norm": 0.18812093138694763, + "learning_rate": 4.5132054211194664e-05, + "loss": 0.2679, + "step": 44360 + }, + { + "epoch": 1.972262968395786, + "grad_norm": 0.16211576759815216, + "learning_rate": 4.511148444344574e-05, + "loss": 0.2654, + "step": 44370 + }, + { + "epoch": 1.972707472107392, + "grad_norm": 0.1576479971408844, + "learning_rate": 4.509091551101041e-05, + "loss": 0.2646, + "step": 44380 + }, + { + "epoch": 1.9731519758189981, + "grad_norm": 0.1656247228384018, + "learning_rate": 4.507034741740338e-05, + "loss": 0.2684, + "step": 44390 + }, + { + "epoch": 1.9735964795306042, + "grad_norm": 0.17629757523536682, + "learning_rate": 4.5049780166139145e-05, + "loss": 0.2662, + "step": 44400 + }, + { + "epoch": 1.9740409832422101, + "grad_norm": 0.17195026576519012, + "learning_rate": 4.5029213760732075e-05, + "loss": 0.2666, + "step": 44410 + }, + { + "epoch": 1.974485486953816, + "grad_norm": 0.177549347281456, + "learning_rate": 4.5008648204696434e-05, + "loss": 0.2652, + "step": 44420 + }, + { + "epoch": 1.974929990665422, + "grad_norm": 0.16722214221954346, + "learning_rate": 4.4988083501546284e-05, + "loss": 0.2678, + "step": 44430 + }, + { + "epoch": 1.9753744943770282, + "grad_norm": 0.17046840488910675, + "learning_rate": 4.4967519654795606e-05, + "loss": 0.2677, + "step": 44440 + }, + { + "epoch": 1.975818998088634, + "grad_norm": 0.16333997249603271, + "learning_rate": 4.494695666795816e-05, + "loss": 0.2655, + "step": 44450 + }, + { + "epoch": 1.97626350180024, + "grad_norm": 0.19316770136356354, + "learning_rate": 4.4926394544547644e-05, + "loss": 0.2667, + "step": 44460 + }, + { + "epoch": 1.976708005511846, + "grad_norm": 0.19240038096904755, + "learning_rate": 4.490583328807752e-05, + "loss": 0.2652, + "step": 44470 + }, + { + "epoch": 1.977152509223452, + "grad_norm": 0.19501100480556488, + "learning_rate": 4.488527290206117e-05, + "loss": 0.2643, + "step": 44480 + }, + { + "epoch": 1.977597012935058, + "grad_norm": 0.17460572719573975, + "learning_rate": 4.48647133900118e-05, + "loss": 0.2652, + "step": 44490 + }, + { + "epoch": 1.9780415166466638, + "grad_norm": 0.2054593414068222, + "learning_rate": 4.484415475544244e-05, + "loss": 0.2663, + "step": 44500 + }, + { + "epoch": 1.97848602035827, + "grad_norm": 0.1603982001543045, + "learning_rate": 4.4823597001866035e-05, + "loss": 0.2639, + "step": 44510 + }, + { + "epoch": 1.978930524069876, + "grad_norm": 0.17379160225391388, + "learning_rate": 4.480304013279532e-05, + "loss": 0.268, + "step": 44520 + }, + { + "epoch": 1.979375027781482, + "grad_norm": 0.21426784992218018, + "learning_rate": 4.478248415174292e-05, + "loss": 0.2694, + "step": 44530 + }, + { + "epoch": 1.979819531493088, + "grad_norm": 0.191019207239151, + "learning_rate": 4.476192906222126e-05, + "loss": 0.2666, + "step": 44540 + }, + { + "epoch": 1.9802640352046939, + "grad_norm": 0.15579482913017273, + "learning_rate": 4.474137486774268e-05, + "loss": 0.2688, + "step": 44550 + }, + { + "epoch": 1.9807085389163, + "grad_norm": 0.18907786905765533, + "learning_rate": 4.4720821571819296e-05, + "loss": 0.2657, + "step": 44560 + }, + { + "epoch": 1.981153042627906, + "grad_norm": 0.16369682550430298, + "learning_rate": 4.470026917796314e-05, + "loss": 0.2648, + "step": 44570 + }, + { + "epoch": 1.981597546339512, + "grad_norm": 0.13942281901836395, + "learning_rate": 4.4679717689686005e-05, + "loss": 0.2677, + "step": 44580 + }, + { + "epoch": 1.9820420500511178, + "grad_norm": 0.15572358667850494, + "learning_rate": 4.465916711049963e-05, + "loss": 0.2676, + "step": 44590 + }, + { + "epoch": 1.9824865537627239, + "grad_norm": 0.1417306810617447, + "learning_rate": 4.4638617443915524e-05, + "loss": 0.2661, + "step": 44600 + }, + { + "epoch": 1.98293105747433, + "grad_norm": 0.17268121242523193, + "learning_rate": 4.4618068693445055e-05, + "loss": 0.2649, + "step": 44610 + }, + { + "epoch": 1.983375561185936, + "grad_norm": 0.17389561235904694, + "learning_rate": 4.459752086259946e-05, + "loss": 0.2665, + "step": 44620 + }, + { + "epoch": 1.983820064897542, + "grad_norm": 0.17936941981315613, + "learning_rate": 4.457697395488977e-05, + "loss": 0.2684, + "step": 44630 + }, + { + "epoch": 1.9842645686091478, + "grad_norm": 0.17397284507751465, + "learning_rate": 4.455642797382693e-05, + "loss": 0.2691, + "step": 44640 + }, + { + "epoch": 1.9847090723207539, + "grad_norm": 0.1734071969985962, + "learning_rate": 4.4535882922921643e-05, + "loss": 0.268, + "step": 44650 + }, + { + "epoch": 1.98515357603236, + "grad_norm": 0.19089089334011078, + "learning_rate": 4.451533880568455e-05, + "loss": 0.2681, + "step": 44660 + }, + { + "epoch": 1.9855980797439658, + "grad_norm": 0.18453116714954376, + "learning_rate": 4.449479562562603e-05, + "loss": 0.2665, + "step": 44670 + }, + { + "epoch": 1.9860425834555717, + "grad_norm": 0.15481066703796387, + "learning_rate": 4.447425338625639e-05, + "loss": 0.2683, + "step": 44680 + }, + { + "epoch": 1.9864870871671778, + "grad_norm": 0.16948996484279633, + "learning_rate": 4.4453712091085705e-05, + "loss": 0.2686, + "step": 44690 + }, + { + "epoch": 1.986931590878784, + "grad_norm": 0.16720153391361237, + "learning_rate": 4.443317174362392e-05, + "loss": 0.2673, + "step": 44700 + }, + { + "epoch": 1.9873760945903898, + "grad_norm": 0.17583666741847992, + "learning_rate": 4.441263234738085e-05, + "loss": 0.2651, + "step": 44710 + }, + { + "epoch": 1.9878205983019959, + "grad_norm": 0.1651710718870163, + "learning_rate": 4.4392093905866086e-05, + "loss": 0.265, + "step": 44720 + }, + { + "epoch": 1.9882651020136017, + "grad_norm": 0.1764715164899826, + "learning_rate": 4.437155642258911e-05, + "loss": 0.2673, + "step": 44730 + }, + { + "epoch": 1.9887096057252078, + "grad_norm": 0.1355331540107727, + "learning_rate": 4.4351019901059177e-05, + "loss": 0.2679, + "step": 44740 + }, + { + "epoch": 1.989154109436814, + "grad_norm": 0.15448887646198273, + "learning_rate": 4.433048434478547e-05, + "loss": 0.2686, + "step": 44750 + }, + { + "epoch": 1.9895986131484198, + "grad_norm": 0.1779009997844696, + "learning_rate": 4.4309949757276906e-05, + "loss": 0.2666, + "step": 44760 + }, + { + "epoch": 1.9900431168600257, + "grad_norm": 0.15341098606586456, + "learning_rate": 4.428941614204233e-05, + "loss": 0.2661, + "step": 44770 + }, + { + "epoch": 1.9904876205716318, + "grad_norm": 0.1612335592508316, + "learning_rate": 4.426888350259034e-05, + "loss": 0.2665, + "step": 44780 + }, + { + "epoch": 1.9909321242832378, + "grad_norm": 0.15383665263652802, + "learning_rate": 4.424835184242939e-05, + "loss": 0.2676, + "step": 44790 + }, + { + "epoch": 1.9913766279948437, + "grad_norm": 0.1753322035074234, + "learning_rate": 4.422782116506784e-05, + "loss": 0.2669, + "step": 44800 + }, + { + "epoch": 1.9918211317064496, + "grad_norm": 0.17448198795318604, + "learning_rate": 4.4207291474013754e-05, + "loss": 0.2664, + "step": 44810 + }, + { + "epoch": 1.9922656354180557, + "grad_norm": 0.16835422813892365, + "learning_rate": 4.418676277277514e-05, + "loss": 0.2685, + "step": 44820 + }, + { + "epoch": 1.9927101391296618, + "grad_norm": 0.16774410009384155, + "learning_rate": 4.416623506485976e-05, + "loss": 0.2667, + "step": 44830 + }, + { + "epoch": 1.9931546428412679, + "grad_norm": 0.18244807422161102, + "learning_rate": 4.414570835377527e-05, + "loss": 0.2665, + "step": 44840 + }, + { + "epoch": 1.9935991465528737, + "grad_norm": 0.15575507283210754, + "learning_rate": 4.412518264302909e-05, + "loss": 0.2655, + "step": 44850 + }, + { + "epoch": 1.9940436502644796, + "grad_norm": 0.15131549537181854, + "learning_rate": 4.4104657936128535e-05, + "loss": 0.2673, + "step": 44860 + }, + { + "epoch": 1.9944881539760857, + "grad_norm": 0.13865748047828674, + "learning_rate": 4.408413423658071e-05, + "loss": 0.272, + "step": 44870 + }, + { + "epoch": 1.9949326576876918, + "grad_norm": 0.16374395787715912, + "learning_rate": 4.406361154789253e-05, + "loss": 0.2659, + "step": 44880 + }, + { + "epoch": 1.9953771613992977, + "grad_norm": 0.14464972913265228, + "learning_rate": 4.4043089873570776e-05, + "loss": 0.2688, + "step": 44890 + }, + { + "epoch": 1.9958216651109035, + "grad_norm": 0.14353390038013458, + "learning_rate": 4.4022569217122025e-05, + "loss": 0.2673, + "step": 44900 + }, + { + "epoch": 1.9962661688225096, + "grad_norm": 0.16363446414470673, + "learning_rate": 4.400204958205274e-05, + "loss": 0.2673, + "step": 44910 + }, + { + "epoch": 1.9967106725341157, + "grad_norm": 0.15863041579723358, + "learning_rate": 4.3981530971869125e-05, + "loss": 0.269, + "step": 44920 + }, + { + "epoch": 1.9971551762457218, + "grad_norm": 0.16114193201065063, + "learning_rate": 4.396101339007727e-05, + "loss": 0.2691, + "step": 44930 + }, + { + "epoch": 1.9975996799573277, + "grad_norm": 0.19969308376312256, + "learning_rate": 4.394049684018304e-05, + "loss": 0.2695, + "step": 44940 + }, + { + "epoch": 1.9980441836689335, + "grad_norm": 0.1867971569299698, + "learning_rate": 4.3919981325692186e-05, + "loss": 0.2687, + "step": 44950 + }, + { + "epoch": 1.9984886873805396, + "grad_norm": 0.1657050997018814, + "learning_rate": 4.389946685011024e-05, + "loss": 0.2672, + "step": 44960 + }, + { + "epoch": 1.9989331910921457, + "grad_norm": 0.17078661918640137, + "learning_rate": 4.387895341694255e-05, + "loss": 0.2647, + "step": 44970 + }, + { + "epoch": 1.9993776948037516, + "grad_norm": 0.16048918664455414, + "learning_rate": 4.3858441029694324e-05, + "loss": 0.2664, + "step": 44980 + }, + { + "epoch": 1.9998221985153575, + "grad_norm": 0.18850351870059967, + "learning_rate": 4.3837929691870527e-05, + "loss": 0.2672, + "step": 44990 + }, + { + "epoch": 2.0002667022269636, + "grad_norm": 0.15392762422561646, + "learning_rate": 4.381741940697604e-05, + "loss": 0.2669, + "step": 45000 + }, + { + "epoch": 2.0007112059385697, + "grad_norm": 0.18306627869606018, + "learning_rate": 4.379691017851547e-05, + "loss": 0.2666, + "step": 45010 + }, + { + "epoch": 2.0011557096501758, + "grad_norm": 0.156044602394104, + "learning_rate": 4.3776402009993304e-05, + "loss": 0.2687, + "step": 45020 + }, + { + "epoch": 2.0016002133617814, + "grad_norm": 0.18824583292007446, + "learning_rate": 4.3755894904913794e-05, + "loss": 0.2676, + "step": 45030 + }, + { + "epoch": 2.0020447170733875, + "grad_norm": 0.15311959385871887, + "learning_rate": 4.373538886678109e-05, + "loss": 0.2677, + "step": 45040 + }, + { + "epoch": 2.0024892207849936, + "grad_norm": 0.16250695288181305, + "learning_rate": 4.371488389909909e-05, + "loss": 0.2635, + "step": 45050 + }, + { + "epoch": 2.0029337244965997, + "grad_norm": 0.15651914477348328, + "learning_rate": 4.3694380005371515e-05, + "loss": 0.2651, + "step": 45060 + }, + { + "epoch": 2.0033782282082053, + "grad_norm": 0.16628554463386536, + "learning_rate": 4.367387718910196e-05, + "loss": 0.2635, + "step": 45070 + }, + { + "epoch": 2.0038227319198114, + "grad_norm": 0.20221003890037537, + "learning_rate": 4.3653375453793764e-05, + "loss": 0.2657, + "step": 45080 + }, + { + "epoch": 2.0042672356314175, + "grad_norm": 0.20822086930274963, + "learning_rate": 4.3632874802950136e-05, + "loss": 0.2672, + "step": 45090 + }, + { + "epoch": 2.0047117393430236, + "grad_norm": 0.17351403832435608, + "learning_rate": 4.3612375240074034e-05, + "loss": 0.2679, + "step": 45100 + }, + { + "epoch": 2.0051562430546297, + "grad_norm": 0.17303778231143951, + "learning_rate": 4.3591876768668325e-05, + "loss": 0.2686, + "step": 45110 + }, + { + "epoch": 2.0056007467662353, + "grad_norm": 0.15208648145198822, + "learning_rate": 4.3571379392235605e-05, + "loss": 0.2659, + "step": 45120 + }, + { + "epoch": 2.0060452504778414, + "grad_norm": 0.16162878274917603, + "learning_rate": 4.3550883114278335e-05, + "loss": 0.2689, + "step": 45130 + }, + { + "epoch": 2.0064897541894475, + "grad_norm": 0.1612592190504074, + "learning_rate": 4.353038793829876e-05, + "loss": 0.2676, + "step": 45140 + }, + { + "epoch": 2.0069342579010536, + "grad_norm": 0.16110315918922424, + "learning_rate": 4.350989386779891e-05, + "loss": 0.2691, + "step": 45150 + }, + { + "epoch": 2.0073787616126593, + "grad_norm": 0.1480487436056137, + "learning_rate": 4.3489400906280724e-05, + "loss": 0.2667, + "step": 45160 + }, + { + "epoch": 2.0078232653242654, + "grad_norm": 0.2090287059545517, + "learning_rate": 4.3468909057245845e-05, + "loss": 0.2663, + "step": 45170 + }, + { + "epoch": 2.0082677690358715, + "grad_norm": 0.17867164313793182, + "learning_rate": 4.3448418324195794e-05, + "loss": 0.2665, + "step": 45180 + }, + { + "epoch": 2.0087122727474775, + "grad_norm": 0.200210303068161, + "learning_rate": 4.342792871063184e-05, + "loss": 0.2654, + "step": 45190 + }, + { + "epoch": 2.0091567764590836, + "grad_norm": 0.17101424932479858, + "learning_rate": 4.3407440220055145e-05, + "loss": 0.2657, + "step": 45200 + }, + { + "epoch": 2.0096012801706893, + "grad_norm": 0.17911428213119507, + "learning_rate": 4.33869528559666e-05, + "loss": 0.2703, + "step": 45210 + }, + { + "epoch": 2.0100457838822954, + "grad_norm": 0.18829630315303802, + "learning_rate": 4.336646662186696e-05, + "loss": 0.2656, + "step": 45220 + }, + { + "epoch": 2.0104902875939015, + "grad_norm": 0.19327601790428162, + "learning_rate": 4.334598152125672e-05, + "loss": 0.2678, + "step": 45230 + }, + { + "epoch": 2.0109347913055076, + "grad_norm": 0.19979393482208252, + "learning_rate": 4.3325497557636276e-05, + "loss": 0.2699, + "step": 45240 + }, + { + "epoch": 2.011379295017113, + "grad_norm": 0.18933379650115967, + "learning_rate": 4.330501473450574e-05, + "loss": 0.2668, + "step": 45250 + }, + { + "epoch": 2.0118237987287193, + "grad_norm": 0.15480200946331024, + "learning_rate": 4.328453305536507e-05, + "loss": 0.2679, + "step": 45260 + }, + { + "epoch": 2.0122683024403254, + "grad_norm": 0.1734711229801178, + "learning_rate": 4.326405252371404e-05, + "loss": 0.2689, + "step": 45270 + }, + { + "epoch": 2.0127128061519315, + "grad_norm": 0.1681431233882904, + "learning_rate": 4.324357314305221e-05, + "loss": 0.2648, + "step": 45280 + }, + { + "epoch": 2.0131573098635376, + "grad_norm": 0.18298277258872986, + "learning_rate": 4.3223094916878945e-05, + "loss": 0.2678, + "step": 45290 + }, + { + "epoch": 2.0136018135751432, + "grad_norm": 0.17892642319202423, + "learning_rate": 4.320261784869338e-05, + "loss": 0.2659, + "step": 45300 + }, + { + "epoch": 2.0140463172867493, + "grad_norm": 0.1469869166612625, + "learning_rate": 4.318214194199455e-05, + "loss": 0.2644, + "step": 45310 + }, + { + "epoch": 2.0144908209983554, + "grad_norm": 0.1658293455839157, + "learning_rate": 4.316166720028118e-05, + "loss": 0.265, + "step": 45320 + }, + { + "epoch": 2.0149353247099615, + "grad_norm": 0.19186758995056152, + "learning_rate": 4.3141193627051864e-05, + "loss": 0.2699, + "step": 45330 + }, + { + "epoch": 2.015379828421567, + "grad_norm": 0.1567402482032776, + "learning_rate": 4.312072122580496e-05, + "loss": 0.2667, + "step": 45340 + }, + { + "epoch": 2.0158243321331732, + "grad_norm": 0.16448786854743958, + "learning_rate": 4.3100250000038646e-05, + "loss": 0.2654, + "step": 45350 + }, + { + "epoch": 2.0162688358447793, + "grad_norm": 0.1782263219356537, + "learning_rate": 4.307977995325091e-05, + "loss": 0.267, + "step": 45360 + }, + { + "epoch": 2.0167133395563854, + "grad_norm": 0.19928835332393646, + "learning_rate": 4.30593110889395e-05, + "loss": 0.2671, + "step": 45370 + }, + { + "epoch": 2.017157843267991, + "grad_norm": 0.14767859876155853, + "learning_rate": 4.3038843410602016e-05, + "loss": 0.268, + "step": 45380 + }, + { + "epoch": 2.017602346979597, + "grad_norm": 0.18008294701576233, + "learning_rate": 4.3018376921735774e-05, + "loss": 0.2665, + "step": 45390 + }, + { + "epoch": 2.0180468506912033, + "grad_norm": 0.16601188480854034, + "learning_rate": 4.299791162583799e-05, + "loss": 0.2663, + "step": 45400 + }, + { + "epoch": 2.0184913544028094, + "grad_norm": 0.1660025268793106, + "learning_rate": 4.29774475264056e-05, + "loss": 0.2657, + "step": 45410 + }, + { + "epoch": 2.0189358581144154, + "grad_norm": 0.15958473086357117, + "learning_rate": 4.2956984626935365e-05, + "loss": 0.2679, + "step": 45420 + }, + { + "epoch": 2.019380361826021, + "grad_norm": 0.1365078240633011, + "learning_rate": 4.293652293092383e-05, + "loss": 0.2661, + "step": 45430 + }, + { + "epoch": 2.019824865537627, + "grad_norm": 0.14725518226623535, + "learning_rate": 4.2916062441867324e-05, + "loss": 0.2654, + "step": 45440 + }, + { + "epoch": 2.0202693692492333, + "grad_norm": 0.16470514237880707, + "learning_rate": 4.289560316326201e-05, + "loss": 0.2685, + "step": 45450 + }, + { + "epoch": 2.0207138729608394, + "grad_norm": 0.18443237245082855, + "learning_rate": 4.28751450986038e-05, + "loss": 0.266, + "step": 45460 + }, + { + "epoch": 2.021158376672445, + "grad_norm": 0.15890949964523315, + "learning_rate": 4.2854688251388444e-05, + "loss": 0.265, + "step": 45470 + }, + { + "epoch": 2.021602880384051, + "grad_norm": 0.18526867032051086, + "learning_rate": 4.2834232625111425e-05, + "loss": 0.265, + "step": 45480 + }, + { + "epoch": 2.022047384095657, + "grad_norm": 0.17617002129554749, + "learning_rate": 4.2813778223268086e-05, + "loss": 0.2669, + "step": 45490 + }, + { + "epoch": 2.0224918878072633, + "grad_norm": 0.16448330879211426, + "learning_rate": 4.2793325049353477e-05, + "loss": 0.2681, + "step": 45500 + }, + { + "epoch": 2.0229363915188694, + "grad_norm": 0.1844799667596817, + "learning_rate": 4.2772873106862535e-05, + "loss": 0.265, + "step": 45510 + }, + { + "epoch": 2.023380895230475, + "grad_norm": 0.16026552021503448, + "learning_rate": 4.275242239928993e-05, + "loss": 0.267, + "step": 45520 + }, + { + "epoch": 2.023825398942081, + "grad_norm": 0.17548267543315887, + "learning_rate": 4.273197293013009e-05, + "loss": 0.2677, + "step": 45530 + }, + { + "epoch": 2.0242699026536872, + "grad_norm": 0.16327454149723053, + "learning_rate": 4.271152470287731e-05, + "loss": 0.2645, + "step": 45540 + }, + { + "epoch": 2.0247144063652933, + "grad_norm": 0.19432847201824188, + "learning_rate": 4.2691077721025594e-05, + "loss": 0.2656, + "step": 45550 + }, + { + "epoch": 2.025158910076899, + "grad_norm": 0.1732344925403595, + "learning_rate": 4.267063198806883e-05, + "loss": 0.2666, + "step": 45560 + }, + { + "epoch": 2.025603413788505, + "grad_norm": 0.16143886744976044, + "learning_rate": 4.2650187507500574e-05, + "loss": 0.2654, + "step": 45570 + }, + { + "epoch": 2.026047917500111, + "grad_norm": 0.1358281970024109, + "learning_rate": 4.2629744282814275e-05, + "loss": 0.2653, + "step": 45580 + }, + { + "epoch": 2.0264924212117172, + "grad_norm": 0.16263259947299957, + "learning_rate": 4.2609302317503074e-05, + "loss": 0.2622, + "step": 45590 + }, + { + "epoch": 2.026936924923323, + "grad_norm": 0.1388048529624939, + "learning_rate": 4.258886161505999e-05, + "loss": 0.2651, + "step": 45600 + }, + { + "epoch": 2.027381428634929, + "grad_norm": 0.1469038426876068, + "learning_rate": 4.2568422178977775e-05, + "loss": 0.2666, + "step": 45610 + }, + { + "epoch": 2.027825932346535, + "grad_norm": 0.15145158767700195, + "learning_rate": 4.254798401274894e-05, + "loss": 0.2638, + "step": 45620 + }, + { + "epoch": 2.028270436058141, + "grad_norm": 0.16353383660316467, + "learning_rate": 4.252754711986583e-05, + "loss": 0.266, + "step": 45630 + }, + { + "epoch": 2.0287149397697473, + "grad_norm": 0.14802642166614532, + "learning_rate": 4.250711150382052e-05, + "loss": 0.2645, + "step": 45640 + }, + { + "epoch": 2.029159443481353, + "grad_norm": 0.18174807727336884, + "learning_rate": 4.248667716810495e-05, + "loss": 0.2616, + "step": 45650 + }, + { + "epoch": 2.029603947192959, + "grad_norm": 0.15355128049850464, + "learning_rate": 4.246624411621074e-05, + "loss": 0.2664, + "step": 45660 + }, + { + "epoch": 2.030048450904565, + "grad_norm": 0.23153455555438995, + "learning_rate": 4.244581235162938e-05, + "loss": 0.2654, + "step": 45670 + }, + { + "epoch": 2.030492954616171, + "grad_norm": 0.18321126699447632, + "learning_rate": 4.2425381877852075e-05, + "loss": 0.2635, + "step": 45680 + }, + { + "epoch": 2.030937458327777, + "grad_norm": 0.16975469887256622, + "learning_rate": 4.2404952698369856e-05, + "loss": 0.2642, + "step": 45690 + }, + { + "epoch": 2.031381962039383, + "grad_norm": 0.1711466759443283, + "learning_rate": 4.238452481667349e-05, + "loss": 0.2676, + "step": 45700 + }, + { + "epoch": 2.031826465750989, + "grad_norm": 0.164314404129982, + "learning_rate": 4.2364098236253526e-05, + "loss": 0.2663, + "step": 45710 + }, + { + "epoch": 2.032270969462595, + "grad_norm": 0.14651192724704742, + "learning_rate": 4.2343672960600356e-05, + "loss": 0.2659, + "step": 45720 + }, + { + "epoch": 2.032715473174201, + "grad_norm": 0.18347886204719543, + "learning_rate": 4.232324899320406e-05, + "loss": 0.2649, + "step": 45730 + }, + { + "epoch": 2.033159976885807, + "grad_norm": 0.16446270048618317, + "learning_rate": 4.230282633755457e-05, + "loss": 0.2682, + "step": 45740 + }, + { + "epoch": 2.033604480597413, + "grad_norm": 0.15212956070899963, + "learning_rate": 4.2282404997141515e-05, + "loss": 0.2654, + "step": 45750 + }, + { + "epoch": 2.034048984309019, + "grad_norm": 0.18162880837917328, + "learning_rate": 4.2261984975454397e-05, + "loss": 0.2641, + "step": 45760 + }, + { + "epoch": 2.034493488020625, + "grad_norm": 0.17986367642879486, + "learning_rate": 4.224156627598239e-05, + "loss": 0.2652, + "step": 45770 + }, + { + "epoch": 2.0349379917322308, + "grad_norm": 0.1557452380657196, + "learning_rate": 4.222114890221453e-05, + "loss": 0.2636, + "step": 45780 + }, + { + "epoch": 2.035382495443837, + "grad_norm": 0.15664970874786377, + "learning_rate": 4.2200732857639546e-05, + "loss": 0.265, + "step": 45790 + }, + { + "epoch": 2.035826999155443, + "grad_norm": 0.16459771990776062, + "learning_rate": 4.2180318145746035e-05, + "loss": 0.2666, + "step": 45800 + }, + { + "epoch": 2.036271502867049, + "grad_norm": 0.15573598444461823, + "learning_rate": 4.215990477002227e-05, + "loss": 0.2674, + "step": 45810 + }, + { + "epoch": 2.036716006578655, + "grad_norm": 0.1529129594564438, + "learning_rate": 4.2139492733956356e-05, + "loss": 0.2677, + "step": 45820 + }, + { + "epoch": 2.037160510290261, + "grad_norm": 0.16375622153282166, + "learning_rate": 4.211908204103615e-05, + "loss": 0.2656, + "step": 45830 + }, + { + "epoch": 2.037605014001867, + "grad_norm": 0.16896677017211914, + "learning_rate": 4.2098672694749265e-05, + "loss": 0.2624, + "step": 45840 + }, + { + "epoch": 2.038049517713473, + "grad_norm": 0.15068762004375458, + "learning_rate": 4.2078264698583133e-05, + "loss": 0.2659, + "step": 45850 + }, + { + "epoch": 2.038494021425079, + "grad_norm": 0.17776159942150116, + "learning_rate": 4.205785805602488e-05, + "loss": 0.2667, + "step": 45860 + }, + { + "epoch": 2.0389385251366847, + "grad_norm": 0.1547495275735855, + "learning_rate": 4.203745277056149e-05, + "loss": 0.2654, + "step": 45870 + }, + { + "epoch": 2.039383028848291, + "grad_norm": 0.15774881839752197, + "learning_rate": 4.201704884567964e-05, + "loss": 0.2629, + "step": 45880 + }, + { + "epoch": 2.039827532559897, + "grad_norm": 0.14407290518283844, + "learning_rate": 4.1996646284865816e-05, + "loss": 0.2665, + "step": 45890 + }, + { + "epoch": 2.040272036271503, + "grad_norm": 0.16829398274421692, + "learning_rate": 4.197624509160625e-05, + "loss": 0.2677, + "step": 45900 + }, + { + "epoch": 2.0407165399831086, + "grad_norm": 0.16115862131118774, + "learning_rate": 4.195584526938692e-05, + "loss": 0.2639, + "step": 45910 + }, + { + "epoch": 2.0411610436947147, + "grad_norm": 0.14440900087356567, + "learning_rate": 4.193544682169365e-05, + "loss": 0.2659, + "step": 45920 + }, + { + "epoch": 2.041605547406321, + "grad_norm": 0.18402744829654694, + "learning_rate": 4.1915049752011946e-05, + "loss": 0.2673, + "step": 45930 + }, + { + "epoch": 2.042050051117927, + "grad_norm": 0.14599910378456116, + "learning_rate": 4.189465406382712e-05, + "loss": 0.2652, + "step": 45940 + }, + { + "epoch": 2.042494554829533, + "grad_norm": 0.15505078434944153, + "learning_rate": 4.187425976062422e-05, + "loss": 0.2683, + "step": 45950 + }, + { + "epoch": 2.0429390585411387, + "grad_norm": 0.1537613719701767, + "learning_rate": 4.18538668458881e-05, + "loss": 0.2642, + "step": 45960 + }, + { + "epoch": 2.0433835622527448, + "grad_norm": 0.16223734617233276, + "learning_rate": 4.183347532310333e-05, + "loss": 0.2636, + "step": 45970 + }, + { + "epoch": 2.043828065964351, + "grad_norm": 0.18749304115772247, + "learning_rate": 4.181308519575429e-05, + "loss": 0.2654, + "step": 45980 + }, + { + "epoch": 2.044272569675957, + "grad_norm": 0.16900236904621124, + "learning_rate": 4.179269646732507e-05, + "loss": 0.2671, + "step": 45990 + }, + { + "epoch": 2.0447170733875626, + "grad_norm": 0.1656070500612259, + "learning_rate": 4.177230914129954e-05, + "loss": 0.2626, + "step": 46000 + }, + { + "epoch": 2.0451615770991687, + "grad_norm": 0.16478605568408966, + "learning_rate": 4.175192322116136e-05, + "loss": 0.2647, + "step": 46010 + }, + { + "epoch": 2.0456060808107748, + "grad_norm": 0.19815105199813843, + "learning_rate": 4.173153871039391e-05, + "loss": 0.2661, + "step": 46020 + }, + { + "epoch": 2.046050584522381, + "grad_norm": 0.17302079498767853, + "learning_rate": 4.171115561248036e-05, + "loss": 0.2643, + "step": 46030 + }, + { + "epoch": 2.046495088233987, + "grad_norm": 0.14167572557926178, + "learning_rate": 4.16907739309036e-05, + "loss": 0.265, + "step": 46040 + }, + { + "epoch": 2.0469395919455926, + "grad_norm": 0.16752567887306213, + "learning_rate": 4.167039366914633e-05, + "loss": 0.2669, + "step": 46050 + }, + { + "epoch": 2.0473840956571987, + "grad_norm": 0.1557178497314453, + "learning_rate": 4.165001483069096e-05, + "loss": 0.2682, + "step": 46060 + }, + { + "epoch": 2.047828599368805, + "grad_norm": 0.15558621287345886, + "learning_rate": 4.16296374190197e-05, + "loss": 0.2671, + "step": 46070 + }, + { + "epoch": 2.048273103080411, + "grad_norm": 0.16188035905361176, + "learning_rate": 4.1609261437614464e-05, + "loss": 0.268, + "step": 46080 + }, + { + "epoch": 2.0487176067920165, + "grad_norm": 0.1663292944431305, + "learning_rate": 4.158888688995696e-05, + "loss": 0.2658, + "step": 46090 + }, + { + "epoch": 2.0491621105036226, + "grad_norm": 0.16263167560100555, + "learning_rate": 4.1568513779528645e-05, + "loss": 0.265, + "step": 46100 + }, + { + "epoch": 2.0496066142152287, + "grad_norm": 0.1525607407093048, + "learning_rate": 4.1548142109810704e-05, + "loss": 0.2644, + "step": 46110 + }, + { + "epoch": 2.050051117926835, + "grad_norm": 0.13832224905490875, + "learning_rate": 4.152777188428414e-05, + "loss": 0.2663, + "step": 46120 + }, + { + "epoch": 2.050495621638441, + "grad_norm": 0.14418840408325195, + "learning_rate": 4.1507403106429646e-05, + "loss": 0.2649, + "step": 46130 + }, + { + "epoch": 2.0509401253500466, + "grad_norm": 0.18551665544509888, + "learning_rate": 4.148703577972768e-05, + "loss": 0.2664, + "step": 46140 + }, + { + "epoch": 2.0513846290616526, + "grad_norm": 0.17617808282375336, + "learning_rate": 4.146666990765846e-05, + "loss": 0.2629, + "step": 46150 + }, + { + "epoch": 2.0518291327732587, + "grad_norm": 0.21603479981422424, + "learning_rate": 4.1446305493701986e-05, + "loss": 0.2645, + "step": 46160 + }, + { + "epoch": 2.052273636484865, + "grad_norm": 0.16587033867835999, + "learning_rate": 4.142594254133796e-05, + "loss": 0.2682, + "step": 46170 + }, + { + "epoch": 2.0527181401964705, + "grad_norm": 0.14648671448230743, + "learning_rate": 4.1405581054045835e-05, + "loss": 0.2655, + "step": 46180 + }, + { + "epoch": 2.0531626439080766, + "grad_norm": 0.1431286334991455, + "learning_rate": 4.1385221035304864e-05, + "loss": 0.2651, + "step": 46190 + }, + { + "epoch": 2.0536071476196827, + "grad_norm": 0.14534887671470642, + "learning_rate": 4.136486248859398e-05, + "loss": 0.2653, + "step": 46200 + }, + { + "epoch": 2.0540516513312888, + "grad_norm": 0.21880526840686798, + "learning_rate": 4.1344505417391955e-05, + "loss": 0.2693, + "step": 46210 + }, + { + "epoch": 2.0544961550428944, + "grad_norm": 0.1661769598722458, + "learning_rate": 4.132414982517721e-05, + "loss": 0.2631, + "step": 46220 + }, + { + "epoch": 2.0549406587545005, + "grad_norm": 0.156210258603096, + "learning_rate": 4.130379571542798e-05, + "loss": 0.268, + "step": 46230 + }, + { + "epoch": 2.0553851624661066, + "grad_norm": 0.16406996548175812, + "learning_rate": 4.12834430916222e-05, + "loss": 0.2658, + "step": 46240 + }, + { + "epoch": 2.0558296661777127, + "grad_norm": 0.1617971658706665, + "learning_rate": 4.126309195723763e-05, + "loss": 0.2658, + "step": 46250 + }, + { + "epoch": 2.0562741698893188, + "grad_norm": 0.1622219681739807, + "learning_rate": 4.124274231575168e-05, + "loss": 0.2638, + "step": 46260 + }, + { + "epoch": 2.0567186736009244, + "grad_norm": 0.6275656223297119, + "learning_rate": 4.122239417064154e-05, + "loss": 0.2672, + "step": 46270 + }, + { + "epoch": 2.0571631773125305, + "grad_norm": 0.18491493165493011, + "learning_rate": 4.1202047525384184e-05, + "loss": 0.2667, + "step": 46280 + }, + { + "epoch": 2.0576076810241366, + "grad_norm": 0.1461198627948761, + "learning_rate": 4.118170238345627e-05, + "loss": 0.2677, + "step": 46290 + }, + { + "epoch": 2.0580521847357427, + "grad_norm": 0.1583092361688614, + "learning_rate": 4.1161358748334256e-05, + "loss": 0.2639, + "step": 46300 + }, + { + "epoch": 2.0584966884473483, + "grad_norm": 0.1520121693611145, + "learning_rate": 4.1141016623494266e-05, + "loss": 0.2649, + "step": 46310 + }, + { + "epoch": 2.0589411921589544, + "grad_norm": 0.17922967672348022, + "learning_rate": 4.112067601241227e-05, + "loss": 0.2645, + "step": 46320 + }, + { + "epoch": 2.0593856958705605, + "grad_norm": 0.17473389208316803, + "learning_rate": 4.110033691856387e-05, + "loss": 0.2684, + "step": 46330 + }, + { + "epoch": 2.0598301995821666, + "grad_norm": 0.15682613849639893, + "learning_rate": 4.107999934542451e-05, + "loss": 0.267, + "step": 46340 + }, + { + "epoch": 2.0602747032937727, + "grad_norm": 0.16216708719730377, + "learning_rate": 4.105966329646928e-05, + "loss": 0.2642, + "step": 46350 + }, + { + "epoch": 2.0607192070053784, + "grad_norm": 0.15043999254703522, + "learning_rate": 4.103932877517308e-05, + "loss": 0.2652, + "step": 46360 + }, + { + "epoch": 2.0611637107169845, + "grad_norm": 0.15624293684959412, + "learning_rate": 4.101899578501052e-05, + "loss": 0.2652, + "step": 46370 + }, + { + "epoch": 2.0616082144285905, + "grad_norm": 0.1489008665084839, + "learning_rate": 4.099866432945595e-05, + "loss": 0.2659, + "step": 46380 + }, + { + "epoch": 2.0620527181401966, + "grad_norm": 0.1486898511648178, + "learning_rate": 4.097833441198346e-05, + "loss": 0.2693, + "step": 46390 + }, + { + "epoch": 2.0624972218518023, + "grad_norm": 0.16491056978702545, + "learning_rate": 4.095800603606685e-05, + "loss": 0.2646, + "step": 46400 + }, + { + "epoch": 2.0629417255634084, + "grad_norm": 0.14610818028450012, + "learning_rate": 4.093767920517975e-05, + "loss": 0.2639, + "step": 46410 + }, + { + "epoch": 2.0633862292750145, + "grad_norm": 0.14319582283496857, + "learning_rate": 4.091735392279539e-05, + "loss": 0.2636, + "step": 46420 + }, + { + "epoch": 2.0638307329866206, + "grad_norm": 0.1514127254486084, + "learning_rate": 4.089703019238685e-05, + "loss": 0.266, + "step": 46430 + }, + { + "epoch": 2.0642752366982267, + "grad_norm": 0.1663346141576767, + "learning_rate": 4.0876708017426866e-05, + "loss": 0.2661, + "step": 46440 + }, + { + "epoch": 2.0647197404098323, + "grad_norm": 0.16367818415164948, + "learning_rate": 4.085638740138798e-05, + "loss": 0.2633, + "step": 46450 + }, + { + "epoch": 2.0651642441214384, + "grad_norm": 0.1857093870639801, + "learning_rate": 4.08360683477424e-05, + "loss": 0.2623, + "step": 46460 + }, + { + "epoch": 2.0656087478330445, + "grad_norm": 0.17880679666996002, + "learning_rate": 4.0815750859962085e-05, + "loss": 0.2629, + "step": 46470 + }, + { + "epoch": 2.0660532515446506, + "grad_norm": 0.20959797501564026, + "learning_rate": 4.079543494151879e-05, + "loss": 0.2639, + "step": 46480 + }, + { + "epoch": 2.0664977552562562, + "grad_norm": 0.2014445662498474, + "learning_rate": 4.07751205958839e-05, + "loss": 0.2666, + "step": 46490 + }, + { + "epoch": 2.0669422589678623, + "grad_norm": 0.17430844902992249, + "learning_rate": 4.0754807826528615e-05, + "loss": 0.2661, + "step": 46500 + }, + { + "epoch": 2.0673867626794684, + "grad_norm": 0.15386445820331573, + "learning_rate": 4.073449663692379e-05, + "loss": 0.2659, + "step": 46510 + }, + { + "epoch": 2.0678312663910745, + "grad_norm": 0.16181066632270813, + "learning_rate": 4.07141870305401e-05, + "loss": 0.267, + "step": 46520 + }, + { + "epoch": 2.06827577010268, + "grad_norm": 0.1507546305656433, + "learning_rate": 4.0693879010847866e-05, + "loss": 0.2635, + "step": 46530 + }, + { + "epoch": 2.0687202738142862, + "grad_norm": 0.15347006916999817, + "learning_rate": 4.06735725813172e-05, + "loss": 0.2647, + "step": 46540 + }, + { + "epoch": 2.0691647775258923, + "grad_norm": 0.16564403474330902, + "learning_rate": 4.0653267745417903e-05, + "loss": 0.2669, + "step": 46550 + }, + { + "epoch": 2.0696092812374984, + "grad_norm": 0.17610645294189453, + "learning_rate": 4.063296450661949e-05, + "loss": 0.2673, + "step": 46560 + }, + { + "epoch": 2.0700537849491045, + "grad_norm": 0.1503361314535141, + "learning_rate": 4.061266286839128e-05, + "loss": 0.2636, + "step": 46570 + }, + { + "epoch": 2.07049828866071, + "grad_norm": 0.15469901263713837, + "learning_rate": 4.0592362834202225e-05, + "loss": 0.2658, + "step": 46580 + }, + { + "epoch": 2.0709427923723163, + "grad_norm": 0.16376689076423645, + "learning_rate": 4.057206440752107e-05, + "loss": 0.2624, + "step": 46590 + }, + { + "epoch": 2.0713872960839224, + "grad_norm": 0.21526885032653809, + "learning_rate": 4.0551767591816245e-05, + "loss": 0.2662, + "step": 46600 + }, + { + "epoch": 2.0718317997955285, + "grad_norm": 0.1595943719148636, + "learning_rate": 4.0531472390555935e-05, + "loss": 0.2664, + "step": 46610 + }, + { + "epoch": 2.072276303507134, + "grad_norm": 0.15257123112678528, + "learning_rate": 4.051117880720802e-05, + "loss": 0.2632, + "step": 46620 + }, + { + "epoch": 2.07272080721874, + "grad_norm": 0.17105649411678314, + "learning_rate": 4.049088684524015e-05, + "loss": 0.2659, + "step": 46630 + }, + { + "epoch": 2.0731653109303463, + "grad_norm": 0.14586974680423737, + "learning_rate": 4.0470596508119636e-05, + "loss": 0.267, + "step": 46640 + }, + { + "epoch": 2.0736098146419524, + "grad_norm": 0.16687628626823425, + "learning_rate": 4.0450307799313524e-05, + "loss": 0.2678, + "step": 46650 + }, + { + "epoch": 2.074054318353558, + "grad_norm": 0.15534605085849762, + "learning_rate": 4.0430020722288656e-05, + "loss": 0.2646, + "step": 46660 + }, + { + "epoch": 2.074498822065164, + "grad_norm": 0.16309738159179688, + "learning_rate": 4.040973528051148e-05, + "loss": 0.2685, + "step": 46670 + }, + { + "epoch": 2.07494332577677, + "grad_norm": 0.19097016751766205, + "learning_rate": 4.038945147744827e-05, + "loss": 0.2656, + "step": 46680 + }, + { + "epoch": 2.0753878294883763, + "grad_norm": 0.15758240222930908, + "learning_rate": 4.0369169316564945e-05, + "loss": 0.2645, + "step": 46690 + }, + { + "epoch": 2.0758323331999824, + "grad_norm": 0.17536814510822296, + "learning_rate": 4.034888880132718e-05, + "loss": 0.265, + "step": 46700 + }, + { + "epoch": 2.076276836911588, + "grad_norm": 0.17396719753742218, + "learning_rate": 4.032860993520035e-05, + "loss": 0.2671, + "step": 46710 + }, + { + "epoch": 2.076721340623194, + "grad_norm": 0.18595777451992035, + "learning_rate": 4.030833272164959e-05, + "loss": 0.265, + "step": 46720 + }, + { + "epoch": 2.0771658443348002, + "grad_norm": 0.1479623168706894, + "learning_rate": 4.0288057164139705e-05, + "loss": 0.264, + "step": 46730 + }, + { + "epoch": 2.0776103480464063, + "grad_norm": 0.1464693546295166, + "learning_rate": 4.0267783266135205e-05, + "loss": 0.2656, + "step": 46740 + }, + { + "epoch": 2.078054851758012, + "grad_norm": 0.1949462592601776, + "learning_rate": 4.024751103110039e-05, + "loss": 0.2653, + "step": 46750 + }, + { + "epoch": 2.078499355469618, + "grad_norm": 0.18058529496192932, + "learning_rate": 4.0227240462499176e-05, + "loss": 0.2641, + "step": 46760 + }, + { + "epoch": 2.078943859181224, + "grad_norm": 0.1714845895767212, + "learning_rate": 4.020697156379531e-05, + "loss": 0.2639, + "step": 46770 + }, + { + "epoch": 2.0793883628928302, + "grad_norm": 0.19153384864330292, + "learning_rate": 4.018670433845215e-05, + "loss": 0.2667, + "step": 46780 + }, + { + "epoch": 2.0798328666044363, + "grad_norm": 0.15332907438278198, + "learning_rate": 4.016643878993284e-05, + "loss": 0.2639, + "step": 46790 + }, + { + "epoch": 2.080277370316042, + "grad_norm": 0.19153249263763428, + "learning_rate": 4.014617492170017e-05, + "loss": 0.2676, + "step": 46800 + }, + { + "epoch": 2.080721874027648, + "grad_norm": 0.1624211221933365, + "learning_rate": 4.0125912737216726e-05, + "loss": 0.2662, + "step": 46810 + }, + { + "epoch": 2.081166377739254, + "grad_norm": 0.17501723766326904, + "learning_rate": 4.0105652239944735e-05, + "loss": 0.2626, + "step": 46820 + }, + { + "epoch": 2.0816108814508603, + "grad_norm": 0.19444817304611206, + "learning_rate": 4.0085393433346144e-05, + "loss": 0.2678, + "step": 46830 + }, + { + "epoch": 2.082055385162466, + "grad_norm": 0.2000740021467209, + "learning_rate": 4.006513632088268e-05, + "loss": 0.2674, + "step": 46840 + }, + { + "epoch": 2.082499888874072, + "grad_norm": 0.14862699806690216, + "learning_rate": 4.004488090601567e-05, + "loss": 0.2664, + "step": 46850 + }, + { + "epoch": 2.082944392585678, + "grad_norm": 0.1553385853767395, + "learning_rate": 4.002462719220626e-05, + "loss": 0.2666, + "step": 46860 + }, + { + "epoch": 2.083388896297284, + "grad_norm": 0.16293206810951233, + "learning_rate": 4.000437518291522e-05, + "loss": 0.2656, + "step": 46870 + }, + { + "epoch": 2.0838334000088903, + "grad_norm": 0.13430730998516083, + "learning_rate": 3.9984124881603094e-05, + "loss": 0.2629, + "step": 46880 + }, + { + "epoch": 2.084277903720496, + "grad_norm": 0.19106239080429077, + "learning_rate": 3.9963876291730086e-05, + "loss": 0.2614, + "step": 46890 + }, + { + "epoch": 2.084722407432102, + "grad_norm": 0.14771275222301483, + "learning_rate": 3.994362941675614e-05, + "loss": 0.2651, + "step": 46900 + }, + { + "epoch": 2.085166911143708, + "grad_norm": 0.1496736705303192, + "learning_rate": 3.992338426014088e-05, + "loss": 0.2641, + "step": 46910 + }, + { + "epoch": 2.085611414855314, + "grad_norm": 0.155574232339859, + "learning_rate": 3.9903140825343636e-05, + "loss": 0.2688, + "step": 46920 + }, + { + "epoch": 2.08605591856692, + "grad_norm": 0.1755906045436859, + "learning_rate": 3.98828991158235e-05, + "loss": 0.2663, + "step": 46930 + }, + { + "epoch": 2.086500422278526, + "grad_norm": 0.1518246978521347, + "learning_rate": 3.9862659135039185e-05, + "loss": 0.2677, + "step": 46940 + }, + { + "epoch": 2.086944925990132, + "grad_norm": 0.16935135424137115, + "learning_rate": 3.984242088644918e-05, + "loss": 0.2665, + "step": 46950 + }, + { + "epoch": 2.087389429701738, + "grad_norm": 0.152016282081604, + "learning_rate": 3.9822184373511615e-05, + "loss": 0.2605, + "step": 46960 + }, + { + "epoch": 2.087833933413344, + "grad_norm": 0.1532069593667984, + "learning_rate": 3.98019495996844e-05, + "loss": 0.2659, + "step": 46970 + }, + { + "epoch": 2.08827843712495, + "grad_norm": 0.15032213926315308, + "learning_rate": 3.978171656842507e-05, + "loss": 0.2621, + "step": 46980 + }, + { + "epoch": 2.088722940836556, + "grad_norm": 0.1677623987197876, + "learning_rate": 3.976148528319091e-05, + "loss": 0.2647, + "step": 46990 + }, + { + "epoch": 2.089167444548162, + "grad_norm": 0.18564875423908234, + "learning_rate": 3.974125574743888e-05, + "loss": 0.2636, + "step": 47000 + }, + { + "epoch": 2.089611948259768, + "grad_norm": 0.1595609188079834, + "learning_rate": 3.9721027964625686e-05, + "loss": 0.2655, + "step": 47010 + }, + { + "epoch": 2.090056451971374, + "grad_norm": 0.14873330295085907, + "learning_rate": 3.9700801938207676e-05, + "loss": 0.2648, + "step": 47020 + }, + { + "epoch": 2.09050095568298, + "grad_norm": 0.13465029001235962, + "learning_rate": 3.9680577671640916e-05, + "loss": 0.2618, + "step": 47030 + }, + { + "epoch": 2.090945459394586, + "grad_norm": 0.14046040177345276, + "learning_rate": 3.966035516838121e-05, + "loss": 0.2632, + "step": 47040 + }, + { + "epoch": 2.091389963106192, + "grad_norm": 0.153749480843544, + "learning_rate": 3.9640134431884014e-05, + "loss": 0.2666, + "step": 47050 + }, + { + "epoch": 2.0918344668177977, + "grad_norm": 0.18619424104690552, + "learning_rate": 3.961991546560451e-05, + "loss": 0.2648, + "step": 47060 + }, + { + "epoch": 2.092278970529404, + "grad_norm": 0.12911680340766907, + "learning_rate": 3.959969827299753e-05, + "loss": 0.2629, + "step": 47070 + }, + { + "epoch": 2.09272347424101, + "grad_norm": 0.1522546261548996, + "learning_rate": 3.9579482857517684e-05, + "loss": 0.2652, + "step": 47080 + }, + { + "epoch": 2.093167977952616, + "grad_norm": 0.19931839406490326, + "learning_rate": 3.955926922261921e-05, + "loss": 0.2676, + "step": 47090 + }, + { + "epoch": 2.093612481664222, + "grad_norm": 0.16128432750701904, + "learning_rate": 3.9539057371756084e-05, + "loss": 0.2647, + "step": 47100 + }, + { + "epoch": 2.0940569853758277, + "grad_norm": 0.17113769054412842, + "learning_rate": 3.951884730838195e-05, + "loss": 0.2662, + "step": 47110 + }, + { + "epoch": 2.094501489087434, + "grad_norm": 0.173188716173172, + "learning_rate": 3.949863903595012e-05, + "loss": 0.2631, + "step": 47120 + }, + { + "epoch": 2.09494599279904, + "grad_norm": 0.15453606843948364, + "learning_rate": 3.947843255791369e-05, + "loss": 0.267, + "step": 47130 + }, + { + "epoch": 2.095390496510646, + "grad_norm": 0.15710952877998352, + "learning_rate": 3.9458227877725364e-05, + "loss": 0.2641, + "step": 47140 + }, + { + "epoch": 2.0958350002222517, + "grad_norm": 0.1396241933107376, + "learning_rate": 3.943802499883758e-05, + "loss": 0.2649, + "step": 47150 + }, + { + "epoch": 2.0962795039338578, + "grad_norm": 0.17110776901245117, + "learning_rate": 3.9417823924702437e-05, + "loss": 0.2656, + "step": 47160 + }, + { + "epoch": 2.096724007645464, + "grad_norm": 0.15590320527553558, + "learning_rate": 3.939762465877178e-05, + "loss": 0.2647, + "step": 47170 + }, + { + "epoch": 2.09716851135707, + "grad_norm": 0.16802656650543213, + "learning_rate": 3.937742720449708e-05, + "loss": 0.2659, + "step": 47180 + }, + { + "epoch": 2.097613015068676, + "grad_norm": 0.18048036098480225, + "learning_rate": 3.9357231565329563e-05, + "loss": 0.262, + "step": 47190 + }, + { + "epoch": 2.0980575187802817, + "grad_norm": 0.19472695887088776, + "learning_rate": 3.933703774472008e-05, + "loss": 0.2639, + "step": 47200 + }, + { + "epoch": 2.0985020224918878, + "grad_norm": 0.14606744050979614, + "learning_rate": 3.93168457461192e-05, + "loss": 0.2658, + "step": 47210 + }, + { + "epoch": 2.098946526203494, + "grad_norm": 0.17312853038311005, + "learning_rate": 3.9296655572977216e-05, + "loss": 0.2655, + "step": 47220 + }, + { + "epoch": 2.0993910299151, + "grad_norm": 0.152302548289299, + "learning_rate": 3.927646722874404e-05, + "loss": 0.2669, + "step": 47230 + }, + { + "epoch": 2.0998355336267056, + "grad_norm": 0.1604105830192566, + "learning_rate": 3.925628071686934e-05, + "loss": 0.2654, + "step": 47240 + }, + { + "epoch": 2.1002800373383117, + "grad_norm": 0.15822277963161469, + "learning_rate": 3.9236096040802415e-05, + "loss": 0.2641, + "step": 47250 + }, + { + "epoch": 2.100724541049918, + "grad_norm": 0.17906838655471802, + "learning_rate": 3.9215913203992294e-05, + "loss": 0.2673, + "step": 47260 + }, + { + "epoch": 2.101169044761524, + "grad_norm": 0.14517703652381897, + "learning_rate": 3.9195732209887645e-05, + "loss": 0.266, + "step": 47270 + }, + { + "epoch": 2.1016135484731295, + "grad_norm": 0.1790814995765686, + "learning_rate": 3.9175553061936875e-05, + "loss": 0.265, + "step": 47280 + }, + { + "epoch": 2.1020580521847356, + "grad_norm": 0.18946287035942078, + "learning_rate": 3.9155375763588045e-05, + "loss": 0.2623, + "step": 47290 + }, + { + "epoch": 2.1025025558963417, + "grad_norm": 0.15993185341358185, + "learning_rate": 3.913520031828889e-05, + "loss": 0.2656, + "step": 47300 + }, + { + "epoch": 2.102947059607948, + "grad_norm": 0.15464052557945251, + "learning_rate": 3.911502672948685e-05, + "loss": 0.2663, + "step": 47310 + }, + { + "epoch": 2.103391563319554, + "grad_norm": 0.15302129089832306, + "learning_rate": 3.9094855000629014e-05, + "loss": 0.2634, + "step": 47320 + }, + { + "epoch": 2.1038360670311596, + "grad_norm": 0.16600356996059418, + "learning_rate": 3.907468513516223e-05, + "loss": 0.2652, + "step": 47330 + }, + { + "epoch": 2.1042805707427656, + "grad_norm": 0.19445227086544037, + "learning_rate": 3.905451713653294e-05, + "loss": 0.2642, + "step": 47340 + }, + { + "epoch": 2.1047250744543717, + "grad_norm": 0.18813979625701904, + "learning_rate": 3.903435100818731e-05, + "loss": 0.2626, + "step": 47350 + }, + { + "epoch": 2.105169578165978, + "grad_norm": 0.1575743705034256, + "learning_rate": 3.901418675357117e-05, + "loss": 0.2637, + "step": 47360 + }, + { + "epoch": 2.1056140818775835, + "grad_norm": 0.15129786729812622, + "learning_rate": 3.8994024376130075e-05, + "loss": 0.2641, + "step": 47370 + }, + { + "epoch": 2.1060585855891896, + "grad_norm": 0.15618230402469635, + "learning_rate": 3.8973863879309194e-05, + "loss": 0.2646, + "step": 47380 + }, + { + "epoch": 2.1065030893007957, + "grad_norm": 0.14689958095550537, + "learning_rate": 3.8953705266553394e-05, + "loss": 0.2641, + "step": 47390 + }, + { + "epoch": 2.1069475930124018, + "grad_norm": 0.15863680839538574, + "learning_rate": 3.893354854130727e-05, + "loss": 0.2629, + "step": 47400 + }, + { + "epoch": 2.107392096724008, + "grad_norm": 0.14216145873069763, + "learning_rate": 3.8913393707015006e-05, + "loss": 0.2627, + "step": 47410 + }, + { + "epoch": 2.1078366004356135, + "grad_norm": 0.13715706765651703, + "learning_rate": 3.889324076712056e-05, + "loss": 0.2627, + "step": 47420 + }, + { + "epoch": 2.1082811041472196, + "grad_norm": 0.16935235261917114, + "learning_rate": 3.8873089725067476e-05, + "loss": 0.2657, + "step": 47430 + }, + { + "epoch": 2.1087256078588257, + "grad_norm": 0.15897876024246216, + "learning_rate": 3.885294058429905e-05, + "loss": 0.2648, + "step": 47440 + }, + { + "epoch": 2.1091701115704318, + "grad_norm": 0.14790701866149902, + "learning_rate": 3.8832793348258206e-05, + "loss": 0.2636, + "step": 47450 + }, + { + "epoch": 2.1096146152820374, + "grad_norm": 0.18484292924404144, + "learning_rate": 3.881264802038756e-05, + "loss": 0.2626, + "step": 47460 + }, + { + "epoch": 2.1100591189936435, + "grad_norm": 0.16918249428272247, + "learning_rate": 3.879250460412939e-05, + "loss": 0.2673, + "step": 47470 + }, + { + "epoch": 2.1105036227052496, + "grad_norm": 0.18605175614356995, + "learning_rate": 3.8772363102925644e-05, + "loss": 0.2669, + "step": 47480 + }, + { + "epoch": 2.1109481264168557, + "grad_norm": 0.1683143675327301, + "learning_rate": 3.875222352021798e-05, + "loss": 0.2685, + "step": 47490 + }, + { + "epoch": 2.111392630128462, + "grad_norm": 0.17063304781913757, + "learning_rate": 3.8732085859447686e-05, + "loss": 0.2644, + "step": 47500 + }, + { + "epoch": 2.1118371338400674, + "grad_norm": 0.15776002407073975, + "learning_rate": 3.871195012405575e-05, + "loss": 0.2644, + "step": 47510 + }, + { + "epoch": 2.1122816375516735, + "grad_norm": 0.13233020901679993, + "learning_rate": 3.869181631748278e-05, + "loss": 0.2654, + "step": 47520 + }, + { + "epoch": 2.1127261412632796, + "grad_norm": 0.1794171780347824, + "learning_rate": 3.867168444316915e-05, + "loss": 0.2624, + "step": 47530 + }, + { + "epoch": 2.1131706449748857, + "grad_norm": 0.20344562828540802, + "learning_rate": 3.865155450455481e-05, + "loss": 0.2668, + "step": 47540 + }, + { + "epoch": 2.1136151486864914, + "grad_norm": 0.1623205989599228, + "learning_rate": 3.8631426505079426e-05, + "loss": 0.2625, + "step": 47550 + }, + { + "epoch": 2.1140596523980975, + "grad_norm": 0.14590710401535034, + "learning_rate": 3.8611300448182304e-05, + "loss": 0.2633, + "step": 47560 + }, + { + "epoch": 2.1145041561097035, + "grad_norm": 0.14744117856025696, + "learning_rate": 3.859117633730248e-05, + "loss": 0.2641, + "step": 47570 + }, + { + "epoch": 2.1149486598213096, + "grad_norm": 0.21162788569927216, + "learning_rate": 3.857105417587858e-05, + "loss": 0.2612, + "step": 47580 + }, + { + "epoch": 2.1153931635329153, + "grad_norm": 0.19582292437553406, + "learning_rate": 3.855093396734894e-05, + "loss": 0.2627, + "step": 47590 + }, + { + "epoch": 2.1158376672445214, + "grad_norm": 0.15186533331871033, + "learning_rate": 3.8530815715151545e-05, + "loss": 0.2667, + "step": 47600 + }, + { + "epoch": 2.1162821709561275, + "grad_norm": 0.15877455472946167, + "learning_rate": 3.851069942272405e-05, + "loss": 0.2653, + "step": 47610 + }, + { + "epoch": 2.1167266746677336, + "grad_norm": 0.15922120213508606, + "learning_rate": 3.849058509350382e-05, + "loss": 0.2671, + "step": 47620 + }, + { + "epoch": 2.1171711783793397, + "grad_norm": 0.15215162932872772, + "learning_rate": 3.8470472730927783e-05, + "loss": 0.2642, + "step": 47630 + }, + { + "epoch": 2.1176156820909453, + "grad_norm": 0.17457027733325958, + "learning_rate": 3.845036233843264e-05, + "loss": 0.2645, + "step": 47640 + }, + { + "epoch": 2.1180601858025514, + "grad_norm": 0.20023898780345917, + "learning_rate": 3.843025391945469e-05, + "loss": 0.264, + "step": 47650 + }, + { + "epoch": 2.1185046895141575, + "grad_norm": 0.1629534512758255, + "learning_rate": 3.841014747742992e-05, + "loss": 0.2661, + "step": 47660 + }, + { + "epoch": 2.1189491932257636, + "grad_norm": 0.14599911868572235, + "learning_rate": 3.839004301579397e-05, + "loss": 0.2652, + "step": 47670 + }, + { + "epoch": 2.1193936969373692, + "grad_norm": 0.15632973611354828, + "learning_rate": 3.8369940537982097e-05, + "loss": 0.2656, + "step": 47680 + }, + { + "epoch": 2.1198382006489753, + "grad_norm": 0.13385677337646484, + "learning_rate": 3.834984004742933e-05, + "loss": 0.2648, + "step": 47690 + }, + { + "epoch": 2.1202827043605814, + "grad_norm": 0.20377783477306366, + "learning_rate": 3.832974154757026e-05, + "loss": 0.2649, + "step": 47700 + }, + { + "epoch": 2.1207272080721875, + "grad_norm": 0.18450284004211426, + "learning_rate": 3.830964504183919e-05, + "loss": 0.2636, + "step": 47710 + }, + { + "epoch": 2.1211717117837936, + "grad_norm": 0.16998767852783203, + "learning_rate": 3.828955053367003e-05, + "loss": 0.2636, + "step": 47720 + }, + { + "epoch": 2.1216162154953992, + "grad_norm": 0.18767878413200378, + "learning_rate": 3.826945802649642e-05, + "loss": 0.2646, + "step": 47730 + }, + { + "epoch": 2.1220607192070053, + "grad_norm": 0.1616816222667694, + "learning_rate": 3.824936752375159e-05, + "loss": 0.2627, + "step": 47740 + }, + { + "epoch": 2.1225052229186114, + "grad_norm": 0.1850178837776184, + "learning_rate": 3.822927902886848e-05, + "loss": 0.2641, + "step": 47750 + }, + { + "epoch": 2.1229497266302175, + "grad_norm": 0.1707088053226471, + "learning_rate": 3.8209192545279653e-05, + "loss": 0.2619, + "step": 47760 + }, + { + "epoch": 2.123394230341823, + "grad_norm": 0.16409924626350403, + "learning_rate": 3.8189108076417326e-05, + "loss": 0.2631, + "step": 47770 + }, + { + "epoch": 2.1238387340534293, + "grad_norm": 0.1735495924949646, + "learning_rate": 3.816902562571342e-05, + "loss": 0.2633, + "step": 47780 + }, + { + "epoch": 2.1242832377650354, + "grad_norm": 0.15607795119285583, + "learning_rate": 3.814894519659944e-05, + "loss": 0.2647, + "step": 47790 + }, + { + "epoch": 2.1247277414766415, + "grad_norm": 0.17059722542762756, + "learning_rate": 3.812886679250661e-05, + "loss": 0.2642, + "step": 47800 + }, + { + "epoch": 2.1251722451882475, + "grad_norm": 0.1794801950454712, + "learning_rate": 3.810879041686575e-05, + "loss": 0.2636, + "step": 47810 + }, + { + "epoch": 2.125616748899853, + "grad_norm": 0.1443103551864624, + "learning_rate": 3.808871607310741e-05, + "loss": 0.2609, + "step": 47820 + }, + { + "epoch": 2.1260612526114593, + "grad_norm": 0.13209302723407745, + "learning_rate": 3.806864376466169e-05, + "loss": 0.2611, + "step": 47830 + }, + { + "epoch": 2.1265057563230654, + "grad_norm": 0.16370494663715363, + "learning_rate": 3.804857349495845e-05, + "loss": 0.2619, + "step": 47840 + }, + { + "epoch": 2.1269502600346715, + "grad_norm": 0.14297746121883392, + "learning_rate": 3.802850526742713e-05, + "loss": 0.2637, + "step": 47850 + }, + { + "epoch": 2.127394763746277, + "grad_norm": 0.1528172492980957, + "learning_rate": 3.800843908549683e-05, + "loss": 0.2658, + "step": 47860 + }, + { + "epoch": 2.127839267457883, + "grad_norm": 0.1383899748325348, + "learning_rate": 3.7988374952596325e-05, + "loss": 0.2638, + "step": 47870 + }, + { + "epoch": 2.1282837711694893, + "grad_norm": 0.14482510089874268, + "learning_rate": 3.7968312872154e-05, + "loss": 0.2627, + "step": 47880 + }, + { + "epoch": 2.1287282748810954, + "grad_norm": 0.15320391952991486, + "learning_rate": 3.7948252847597965e-05, + "loss": 0.2636, + "step": 47890 + }, + { + "epoch": 2.129172778592701, + "grad_norm": 0.16566793620586395, + "learning_rate": 3.7928194882355885e-05, + "loss": 0.2636, + "step": 47900 + }, + { + "epoch": 2.129617282304307, + "grad_norm": 0.16654300689697266, + "learning_rate": 3.790813897985515e-05, + "loss": 0.2638, + "step": 47910 + }, + { + "epoch": 2.1300617860159132, + "grad_norm": 0.16823145747184753, + "learning_rate": 3.7888085143522726e-05, + "loss": 0.2636, + "step": 47920 + }, + { + "epoch": 2.1305062897275193, + "grad_norm": 0.17344194650650024, + "learning_rate": 3.7868033376785314e-05, + "loss": 0.2668, + "step": 47930 + }, + { + "epoch": 2.1309507934391254, + "grad_norm": 0.16456587612628937, + "learning_rate": 3.784798368306919e-05, + "loss": 0.2593, + "step": 47940 + }, + { + "epoch": 2.131395297150731, + "grad_norm": 0.14663103222846985, + "learning_rate": 3.782793606580029e-05, + "loss": 0.264, + "step": 47950 + }, + { + "epoch": 2.131839800862337, + "grad_norm": 0.16851139068603516, + "learning_rate": 3.7807890528404205e-05, + "loss": 0.2628, + "step": 47960 + }, + { + "epoch": 2.1322843045739432, + "grad_norm": 0.1662140041589737, + "learning_rate": 3.778784707430616e-05, + "loss": 0.2633, + "step": 47970 + }, + { + "epoch": 2.1327288082855493, + "grad_norm": 0.1345011442899704, + "learning_rate": 3.776780570693107e-05, + "loss": 0.2623, + "step": 47980 + }, + { + "epoch": 2.133173311997155, + "grad_norm": 0.13780300319194794, + "learning_rate": 3.774776642970342e-05, + "loss": 0.2634, + "step": 47990 + }, + { + "epoch": 2.133617815708761, + "grad_norm": 0.1658661663532257, + "learning_rate": 3.77277292460474e-05, + "loss": 0.263, + "step": 48000 + }, + { + "epoch": 2.134062319420367, + "grad_norm": 0.17866824567317963, + "learning_rate": 3.770769415938678e-05, + "loss": 0.2656, + "step": 48010 + }, + { + "epoch": 2.1345068231319733, + "grad_norm": 0.14482146501541138, + "learning_rate": 3.768766117314506e-05, + "loss": 0.267, + "step": 48020 + }, + { + "epoch": 2.134951326843579, + "grad_norm": 0.1376609057188034, + "learning_rate": 3.76676302907453e-05, + "loss": 0.2638, + "step": 48030 + }, + { + "epoch": 2.135395830555185, + "grad_norm": 0.17323684692382812, + "learning_rate": 3.764760151561021e-05, + "loss": 0.2636, + "step": 48040 + }, + { + "epoch": 2.135840334266791, + "grad_norm": 0.1531582772731781, + "learning_rate": 3.76275748511622e-05, + "loss": 0.2645, + "step": 48050 + }, + { + "epoch": 2.136284837978397, + "grad_norm": 0.20811402797698975, + "learning_rate": 3.7607550300823255e-05, + "loss": 0.2598, + "step": 48060 + }, + { + "epoch": 2.1367293416900033, + "grad_norm": 0.14164532721042633, + "learning_rate": 3.7587527868015044e-05, + "loss": 0.262, + "step": 48070 + }, + { + "epoch": 2.137173845401609, + "grad_norm": 0.18437151610851288, + "learning_rate": 3.756750755615881e-05, + "loss": 0.2647, + "step": 48080 + }, + { + "epoch": 2.137618349113215, + "grad_norm": 0.158000186085701, + "learning_rate": 3.754748936867553e-05, + "loss": 0.2649, + "step": 48090 + }, + { + "epoch": 2.138062852824821, + "grad_norm": 0.16036193072795868, + "learning_rate": 3.752747330898573e-05, + "loss": 0.2624, + "step": 48100 + }, + { + "epoch": 2.138507356536427, + "grad_norm": 0.14859826862812042, + "learning_rate": 3.750745938050962e-05, + "loss": 0.2638, + "step": 48110 + }, + { + "epoch": 2.1389518602480333, + "grad_norm": 0.1738968789577484, + "learning_rate": 3.7487447586667025e-05, + "loss": 0.266, + "step": 48120 + }, + { + "epoch": 2.139396363959639, + "grad_norm": 0.15494616329669952, + "learning_rate": 3.7467437930877426e-05, + "loss": 0.2627, + "step": 48130 + }, + { + "epoch": 2.139840867671245, + "grad_norm": 0.15193352103233337, + "learning_rate": 3.744743041655992e-05, + "loss": 0.266, + "step": 48140 + }, + { + "epoch": 2.140285371382851, + "grad_norm": 0.15189985930919647, + "learning_rate": 3.742742504713324e-05, + "loss": 0.2638, + "step": 48150 + }, + { + "epoch": 2.1407298750944572, + "grad_norm": 0.141555517911911, + "learning_rate": 3.740742182601576e-05, + "loss": 0.2625, + "step": 48160 + }, + { + "epoch": 2.141174378806063, + "grad_norm": 0.17375794053077698, + "learning_rate": 3.7387420756625464e-05, + "loss": 0.2629, + "step": 48170 + }, + { + "epoch": 2.141618882517669, + "grad_norm": 0.1743123084306717, + "learning_rate": 3.736742184238002e-05, + "loss": 0.2655, + "step": 48180 + }, + { + "epoch": 2.142063386229275, + "grad_norm": 0.18152041733264923, + "learning_rate": 3.7347425086696684e-05, + "loss": 0.2629, + "step": 48190 + }, + { + "epoch": 2.142507889940881, + "grad_norm": 0.17923089861869812, + "learning_rate": 3.732743049299235e-05, + "loss": 0.2663, + "step": 48200 + }, + { + "epoch": 2.142952393652487, + "grad_norm": 0.14023129642009735, + "learning_rate": 3.730743806468354e-05, + "loss": 0.2635, + "step": 48210 + }, + { + "epoch": 2.143396897364093, + "grad_norm": 0.15174509584903717, + "learning_rate": 3.7287447805186436e-05, + "loss": 0.2678, + "step": 48220 + }, + { + "epoch": 2.143841401075699, + "grad_norm": 0.1716509461402893, + "learning_rate": 3.726745971791682e-05, + "loss": 0.2638, + "step": 48230 + }, + { + "epoch": 2.144285904787305, + "grad_norm": 0.16982102394104004, + "learning_rate": 3.724747380629008e-05, + "loss": 0.2645, + "step": 48240 + }, + { + "epoch": 2.144730408498911, + "grad_norm": 0.1437000334262848, + "learning_rate": 3.72274900737213e-05, + "loss": 0.26, + "step": 48250 + }, + { + "epoch": 2.145174912210517, + "grad_norm": 0.13228093087673187, + "learning_rate": 3.7207508523625123e-05, + "loss": 0.2645, + "step": 48260 + }, + { + "epoch": 2.145619415922123, + "grad_norm": 0.14154531061649323, + "learning_rate": 3.718752915941588e-05, + "loss": 0.265, + "step": 48270 + }, + { + "epoch": 2.146063919633729, + "grad_norm": 0.14853838086128235, + "learning_rate": 3.7167551984507464e-05, + "loss": 0.2622, + "step": 48280 + }, + { + "epoch": 2.146508423345335, + "grad_norm": 0.16127800941467285, + "learning_rate": 3.714757700231346e-05, + "loss": 0.2631, + "step": 48290 + }, + { + "epoch": 2.1469529270569407, + "grad_norm": 0.169232577085495, + "learning_rate": 3.712760421624703e-05, + "loss": 0.2635, + "step": 48300 + }, + { + "epoch": 2.147397430768547, + "grad_norm": 0.1637832522392273, + "learning_rate": 3.710763362972099e-05, + "loss": 0.2663, + "step": 48310 + }, + { + "epoch": 2.147841934480153, + "grad_norm": 0.16412340104579926, + "learning_rate": 3.708766524614774e-05, + "loss": 0.2646, + "step": 48320 + }, + { + "epoch": 2.148286438191759, + "grad_norm": 0.17191368341445923, + "learning_rate": 3.7067699068939335e-05, + "loss": 0.2662, + "step": 48330 + }, + { + "epoch": 2.1487309419033647, + "grad_norm": 0.16101674735546112, + "learning_rate": 3.704773510150748e-05, + "loss": 0.2633, + "step": 48340 + }, + { + "epoch": 2.1491754456149708, + "grad_norm": 0.17342357337474823, + "learning_rate": 3.702777334726344e-05, + "loss": 0.2659, + "step": 48350 + }, + { + "epoch": 2.149619949326577, + "grad_norm": 0.14663296937942505, + "learning_rate": 3.7007813809618164e-05, + "loss": 0.2639, + "step": 48360 + }, + { + "epoch": 2.150064453038183, + "grad_norm": 0.16806121170520782, + "learning_rate": 3.6987856491982145e-05, + "loss": 0.2625, + "step": 48370 + }, + { + "epoch": 2.150508956749789, + "grad_norm": 0.14940430223941803, + "learning_rate": 3.696790139776558e-05, + "loss": 0.2615, + "step": 48380 + }, + { + "epoch": 2.1509534604613947, + "grad_norm": 0.1755368709564209, + "learning_rate": 3.6947948530378235e-05, + "loss": 0.2645, + "step": 48390 + }, + { + "epoch": 2.1513979641730008, + "grad_norm": 0.1358535885810852, + "learning_rate": 3.6927997893229516e-05, + "loss": 0.2641, + "step": 48400 + }, + { + "epoch": 2.151842467884607, + "grad_norm": 0.15294116735458374, + "learning_rate": 3.690804948972845e-05, + "loss": 0.2599, + "step": 48410 + }, + { + "epoch": 2.152286971596213, + "grad_norm": 0.16001376509666443, + "learning_rate": 3.688810332328363e-05, + "loss": 0.265, + "step": 48420 + }, + { + "epoch": 2.152731475307819, + "grad_norm": 0.16576442122459412, + "learning_rate": 3.686815939730336e-05, + "loss": 0.2649, + "step": 48430 + }, + { + "epoch": 2.1531759790194247, + "grad_norm": 0.12912492454051971, + "learning_rate": 3.684821771519548e-05, + "loss": 0.2638, + "step": 48440 + }, + { + "epoch": 2.153620482731031, + "grad_norm": 0.18657298386096954, + "learning_rate": 3.68282782803675e-05, + "loss": 0.2666, + "step": 48450 + }, + { + "epoch": 2.154064986442637, + "grad_norm": 0.1792975664138794, + "learning_rate": 3.6808341096226504e-05, + "loss": 0.2644, + "step": 48460 + }, + { + "epoch": 2.154509490154243, + "grad_norm": 0.16440163552761078, + "learning_rate": 3.678840616617924e-05, + "loss": 0.2642, + "step": 48470 + }, + { + "epoch": 2.1549539938658486, + "grad_norm": 0.1403883546590805, + "learning_rate": 3.6768473493632e-05, + "loss": 0.2636, + "step": 48480 + }, + { + "epoch": 2.1553984975774547, + "grad_norm": 0.1776662915945053, + "learning_rate": 3.6748543081990783e-05, + "loss": 0.2614, + "step": 48490 + }, + { + "epoch": 2.155843001289061, + "grad_norm": 0.1573719084262848, + "learning_rate": 3.672861493466112e-05, + "loss": 0.2629, + "step": 48500 + }, + { + "epoch": 2.156287505000667, + "grad_norm": 0.15204112231731415, + "learning_rate": 3.670868905504818e-05, + "loss": 0.265, + "step": 48510 + }, + { + "epoch": 2.1567320087122726, + "grad_norm": 0.143487811088562, + "learning_rate": 3.6688765446556784e-05, + "loss": 0.2641, + "step": 48520 + }, + { + "epoch": 2.1571765124238786, + "grad_norm": 0.16096939146518707, + "learning_rate": 3.6668844112591276e-05, + "loss": 0.263, + "step": 48530 + }, + { + "epoch": 2.1576210161354847, + "grad_norm": 0.1440524160861969, + "learning_rate": 3.664892505655573e-05, + "loss": 0.2638, + "step": 48540 + }, + { + "epoch": 2.158065519847091, + "grad_norm": 0.16390107572078705, + "learning_rate": 3.662900828185373e-05, + "loss": 0.2661, + "step": 48550 + }, + { + "epoch": 2.158510023558697, + "grad_norm": 0.15740270912647247, + "learning_rate": 3.6609093791888516e-05, + "loss": 0.2634, + "step": 48560 + }, + { + "epoch": 2.1589545272703026, + "grad_norm": 0.1803213208913803, + "learning_rate": 3.658918159006292e-05, + "loss": 0.2647, + "step": 48570 + }, + { + "epoch": 2.1593990309819087, + "grad_norm": 0.19680705666542053, + "learning_rate": 3.656927167977942e-05, + "loss": 0.2645, + "step": 48580 + }, + { + "epoch": 2.1598435346935148, + "grad_norm": 0.18082886934280396, + "learning_rate": 3.654936406444006e-05, + "loss": 0.2663, + "step": 48590 + }, + { + "epoch": 2.160288038405121, + "grad_norm": 0.18221569061279297, + "learning_rate": 3.65294587474465e-05, + "loss": 0.2655, + "step": 48600 + }, + { + "epoch": 2.1607325421167265, + "grad_norm": 0.17567695677280426, + "learning_rate": 3.650955573220002e-05, + "loss": 0.2687, + "step": 48610 + }, + { + "epoch": 2.1611770458283326, + "grad_norm": 0.16874530911445618, + "learning_rate": 3.648965502210149e-05, + "loss": 0.2669, + "step": 48620 + }, + { + "epoch": 2.1616215495399387, + "grad_norm": 0.17469803988933563, + "learning_rate": 3.646975662055142e-05, + "loss": 0.2635, + "step": 48630 + }, + { + "epoch": 2.1620660532515448, + "grad_norm": 0.18253086507320404, + "learning_rate": 3.644986053094987e-05, + "loss": 0.2642, + "step": 48640 + }, + { + "epoch": 2.1625105569631504, + "grad_norm": 0.15308736264705658, + "learning_rate": 3.642996675669659e-05, + "loss": 0.2644, + "step": 48650 + }, + { + "epoch": 2.1629550606747565, + "grad_norm": 0.17127634584903717, + "learning_rate": 3.641007530119083e-05, + "loss": 0.2651, + "step": 48660 + }, + { + "epoch": 2.1633995643863626, + "grad_norm": 0.14292745292186737, + "learning_rate": 3.639018616783153e-05, + "loss": 0.2612, + "step": 48670 + }, + { + "epoch": 2.1638440680979687, + "grad_norm": 0.1578737050294876, + "learning_rate": 3.637029936001719e-05, + "loss": 0.2645, + "step": 48680 + }, + { + "epoch": 2.164288571809575, + "grad_norm": 0.1410466581583023, + "learning_rate": 3.6350414881145886e-05, + "loss": 0.2625, + "step": 48690 + }, + { + "epoch": 2.1647330755211804, + "grad_norm": 0.15098322927951813, + "learning_rate": 3.6330532734615386e-05, + "loss": 0.2662, + "step": 48700 + }, + { + "epoch": 2.1651775792327865, + "grad_norm": 0.15482720732688904, + "learning_rate": 3.6310652923822975e-05, + "loss": 0.2641, + "step": 48710 + }, + { + "epoch": 2.1656220829443926, + "grad_norm": 0.15642890334129333, + "learning_rate": 3.629077545216558e-05, + "loss": 0.2623, + "step": 48720 + }, + { + "epoch": 2.1660665866559987, + "grad_norm": 0.163213849067688, + "learning_rate": 3.627090032303969e-05, + "loss": 0.2634, + "step": 48730 + }, + { + "epoch": 2.1665110903676044, + "grad_norm": 0.18770958483219147, + "learning_rate": 3.625102753984146e-05, + "loss": 0.2638, + "step": 48740 + }, + { + "epoch": 2.1669555940792105, + "grad_norm": 0.17576850950717926, + "learning_rate": 3.623115710596659e-05, + "loss": 0.2635, + "step": 48750 + }, + { + "epoch": 2.1674000977908165, + "grad_norm": 0.18585307896137238, + "learning_rate": 3.6211289024810395e-05, + "loss": 0.2627, + "step": 48760 + }, + { + "epoch": 2.1678446015024226, + "grad_norm": 0.1545569747686386, + "learning_rate": 3.619142329976777e-05, + "loss": 0.2623, + "step": 48770 + }, + { + "epoch": 2.1682891052140287, + "grad_norm": 0.1705627143383026, + "learning_rate": 3.6171559934233247e-05, + "loss": 0.2655, + "step": 48780 + }, + { + "epoch": 2.1687336089256344, + "grad_norm": 0.1798170953989029, + "learning_rate": 3.615169893160093e-05, + "loss": 0.2622, + "step": 48790 + }, + { + "epoch": 2.1691781126372405, + "grad_norm": 0.16399525105953217, + "learning_rate": 3.61318402952645e-05, + "loss": 0.2656, + "step": 48800 + }, + { + "epoch": 2.1696226163488466, + "grad_norm": 0.15324898064136505, + "learning_rate": 3.6111984028617285e-05, + "loss": 0.2667, + "step": 48810 + }, + { + "epoch": 2.1700671200604527, + "grad_norm": 0.18313048779964447, + "learning_rate": 3.6092130135052134e-05, + "loss": 0.2658, + "step": 48820 + }, + { + "epoch": 2.1705116237720583, + "grad_norm": 0.16586357355117798, + "learning_rate": 3.6072278617961584e-05, + "loss": 0.2662, + "step": 48830 + }, + { + "epoch": 2.1709561274836644, + "grad_norm": 0.14981724321842194, + "learning_rate": 3.605242948073767e-05, + "loss": 0.2635, + "step": 48840 + }, + { + "epoch": 2.1714006311952705, + "grad_norm": 0.16723473370075226, + "learning_rate": 3.603258272677212e-05, + "loss": 0.2598, + "step": 48850 + }, + { + "epoch": 2.1718451349068766, + "grad_norm": 0.18680471181869507, + "learning_rate": 3.601273835945616e-05, + "loss": 0.2631, + "step": 48860 + }, + { + "epoch": 2.1722896386184827, + "grad_norm": 0.16540759801864624, + "learning_rate": 3.5992896382180664e-05, + "loss": 0.2662, + "step": 48870 + }, + { + "epoch": 2.1727341423300883, + "grad_norm": 0.16872243583202362, + "learning_rate": 3.597305679833609e-05, + "loss": 0.2642, + "step": 48880 + }, + { + "epoch": 2.1731786460416944, + "grad_norm": 0.14113228023052216, + "learning_rate": 3.595321961131245e-05, + "loss": 0.2674, + "step": 48890 + }, + { + "epoch": 2.1736231497533005, + "grad_norm": 0.1520460844039917, + "learning_rate": 3.593338482449942e-05, + "loss": 0.265, + "step": 48900 + }, + { + "epoch": 2.1740676534649066, + "grad_norm": 0.15487655997276306, + "learning_rate": 3.591355244128618e-05, + "loss": 0.2609, + "step": 48910 + }, + { + "epoch": 2.1745121571765123, + "grad_norm": 0.16109877824783325, + "learning_rate": 3.589372246506158e-05, + "loss": 0.2632, + "step": 48920 + }, + { + "epoch": 2.1749566608881183, + "grad_norm": 0.1326572448015213, + "learning_rate": 3.5873894899213984e-05, + "loss": 0.2632, + "step": 48930 + }, + { + "epoch": 2.1754011645997244, + "grad_norm": 0.1553516685962677, + "learning_rate": 3.5854069747131416e-05, + "loss": 0.2617, + "step": 48940 + }, + { + "epoch": 2.1758456683113305, + "grad_norm": 0.16066215932369232, + "learning_rate": 3.583424701220143e-05, + "loss": 0.264, + "step": 48950 + }, + { + "epoch": 2.176290172022936, + "grad_norm": 0.1529558151960373, + "learning_rate": 3.581442669781121e-05, + "loss": 0.264, + "step": 48960 + }, + { + "epoch": 2.1767346757345423, + "grad_norm": 0.1696518063545227, + "learning_rate": 3.579460880734749e-05, + "loss": 0.2618, + "step": 48970 + }, + { + "epoch": 2.1771791794461484, + "grad_norm": 0.17087432742118835, + "learning_rate": 3.577479334419657e-05, + "loss": 0.2628, + "step": 48980 + }, + { + "epoch": 2.1776236831577545, + "grad_norm": 0.13301269710063934, + "learning_rate": 3.575498031174444e-05, + "loss": 0.2641, + "step": 48990 + }, + { + "epoch": 2.1780681868693605, + "grad_norm": 0.1688026487827301, + "learning_rate": 3.573516971337657e-05, + "loss": 0.2635, + "step": 49000 + }, + { + "epoch": 2.178512690580966, + "grad_norm": 0.16002221405506134, + "learning_rate": 3.5715361552478046e-05, + "loss": 0.2621, + "step": 49010 + }, + { + "epoch": 2.1789571942925723, + "grad_norm": 0.17600449919700623, + "learning_rate": 3.5695555832433536e-05, + "loss": 0.2655, + "step": 49020 + }, + { + "epoch": 2.1794016980041784, + "grad_norm": 0.1520869880914688, + "learning_rate": 3.5675752556627325e-05, + "loss": 0.2644, + "step": 49030 + }, + { + "epoch": 2.1798462017157845, + "grad_norm": 0.13619594275951385, + "learning_rate": 3.565595172844322e-05, + "loss": 0.2621, + "step": 49040 + }, + { + "epoch": 2.18029070542739, + "grad_norm": 0.1710263043642044, + "learning_rate": 3.5636153351264666e-05, + "loss": 0.2654, + "step": 49050 + }, + { + "epoch": 2.180735209138996, + "grad_norm": 0.1570107489824295, + "learning_rate": 3.5616357428474655e-05, + "loss": 0.2623, + "step": 49060 + }, + { + "epoch": 2.1811797128506023, + "grad_norm": 0.1696060448884964, + "learning_rate": 3.559656396345575e-05, + "loss": 0.2633, + "step": 49070 + }, + { + "epoch": 2.1816242165622084, + "grad_norm": 0.17186161875724792, + "learning_rate": 3.5576772959590146e-05, + "loss": 0.2667, + "step": 49080 + }, + { + "epoch": 2.1820687202738145, + "grad_norm": 0.1548486202955246, + "learning_rate": 3.5556984420259545e-05, + "loss": 0.2657, + "step": 49090 + }, + { + "epoch": 2.18251322398542, + "grad_norm": 0.15779750049114227, + "learning_rate": 3.5537198348845305e-05, + "loss": 0.2634, + "step": 49100 + }, + { + "epoch": 2.1829577276970262, + "grad_norm": 0.13886629045009613, + "learning_rate": 3.551741474872831e-05, + "loss": 0.2604, + "step": 49110 + }, + { + "epoch": 2.1834022314086323, + "grad_norm": 0.15638500452041626, + "learning_rate": 3.549763362328903e-05, + "loss": 0.2656, + "step": 49120 + }, + { + "epoch": 2.1838467351202384, + "grad_norm": 0.14955075085163116, + "learning_rate": 3.5477854975907515e-05, + "loss": 0.2648, + "step": 49130 + }, + { + "epoch": 2.184291238831844, + "grad_norm": 0.16989152133464813, + "learning_rate": 3.5458078809963416e-05, + "loss": 0.2647, + "step": 49140 + }, + { + "epoch": 2.18473574254345, + "grad_norm": 0.16365857422351837, + "learning_rate": 3.543830512883594e-05, + "loss": 0.2662, + "step": 49150 + }, + { + "epoch": 2.1851802462550562, + "grad_norm": 0.15198005735874176, + "learning_rate": 3.5418533935903824e-05, + "loss": 0.2634, + "step": 49160 + }, + { + "epoch": 2.1856247499666623, + "grad_norm": 0.173753559589386, + "learning_rate": 3.539876523454547e-05, + "loss": 0.2624, + "step": 49170 + }, + { + "epoch": 2.1860692536782684, + "grad_norm": 0.15342655777931213, + "learning_rate": 3.537899902813878e-05, + "loss": 0.2632, + "step": 49180 + }, + { + "epoch": 2.186513757389874, + "grad_norm": 0.1654163897037506, + "learning_rate": 3.5359235320061293e-05, + "loss": 0.2644, + "step": 49190 + }, + { + "epoch": 2.18695826110148, + "grad_norm": 0.17047019302845, + "learning_rate": 3.533947411369003e-05, + "loss": 0.265, + "step": 49200 + }, + { + "epoch": 2.1874027648130863, + "grad_norm": 0.15864387154579163, + "learning_rate": 3.53197154124017e-05, + "loss": 0.2644, + "step": 49210 + }, + { + "epoch": 2.1878472685246924, + "grad_norm": 0.6436173319816589, + "learning_rate": 3.52999592195725e-05, + "loss": 0.267, + "step": 49220 + }, + { + "epoch": 2.188291772236298, + "grad_norm": 0.13625219464302063, + "learning_rate": 3.5280205538578224e-05, + "loss": 0.2637, + "step": 49230 + }, + { + "epoch": 2.188736275947904, + "grad_norm": 0.1750781536102295, + "learning_rate": 3.5260454372794236e-05, + "loss": 0.2633, + "step": 49240 + }, + { + "epoch": 2.18918077965951, + "grad_norm": 0.17977483570575714, + "learning_rate": 3.524070572559545e-05, + "loss": 0.2653, + "step": 49250 + }, + { + "epoch": 2.1896252833711163, + "grad_norm": 0.16275948286056519, + "learning_rate": 3.5220959600356395e-05, + "loss": 0.2639, + "step": 49260 + }, + { + "epoch": 2.190069787082722, + "grad_norm": 0.16746969521045685, + "learning_rate": 3.5201216000451145e-05, + "loss": 0.2639, + "step": 49270 + }, + { + "epoch": 2.190514290794328, + "grad_norm": 0.1520751565694809, + "learning_rate": 3.5181474929253335e-05, + "loss": 0.2635, + "step": 49280 + }, + { + "epoch": 2.190958794505934, + "grad_norm": 0.18008136749267578, + "learning_rate": 3.516173639013615e-05, + "loss": 0.2593, + "step": 49290 + }, + { + "epoch": 2.19140329821754, + "grad_norm": 0.1294299215078354, + "learning_rate": 3.5142000386472406e-05, + "loss": 0.2644, + "step": 49300 + }, + { + "epoch": 2.1918478019291463, + "grad_norm": 0.17900539934635162, + "learning_rate": 3.5122266921634427e-05, + "loss": 0.2637, + "step": 49310 + }, + { + "epoch": 2.192292305640752, + "grad_norm": 0.14963583648204803, + "learning_rate": 3.510253599899413e-05, + "loss": 0.2654, + "step": 49320 + }, + { + "epoch": 2.192736809352358, + "grad_norm": 0.18624910712242126, + "learning_rate": 3.5082807621922965e-05, + "loss": 0.2636, + "step": 49330 + }, + { + "epoch": 2.193181313063964, + "grad_norm": 0.17319877445697784, + "learning_rate": 3.506308179379201e-05, + "loss": 0.2611, + "step": 49340 + }, + { + "epoch": 2.1936258167755702, + "grad_norm": 0.16914017498493195, + "learning_rate": 3.5043358517971844e-05, + "loss": 0.2639, + "step": 49350 + }, + { + "epoch": 2.194070320487176, + "grad_norm": 0.17158548533916473, + "learning_rate": 3.502363779783264e-05, + "loss": 0.2629, + "step": 49360 + }, + { + "epoch": 2.194514824198782, + "grad_norm": 0.1550026684999466, + "learning_rate": 3.500391963674415e-05, + "loss": 0.265, + "step": 49370 + }, + { + "epoch": 2.194959327910388, + "grad_norm": 0.161256343126297, + "learning_rate": 3.4984204038075615e-05, + "loss": 0.2629, + "step": 49380 + }, + { + "epoch": 2.195403831621994, + "grad_norm": 0.1579238325357437, + "learning_rate": 3.496449100519595e-05, + "loss": 0.2623, + "step": 49390 + }, + { + "epoch": 2.1958483353336002, + "grad_norm": 0.14101757109165192, + "learning_rate": 3.494478054147354e-05, + "loss": 0.264, + "step": 49400 + }, + { + "epoch": 2.196292839045206, + "grad_norm": 0.1721751093864441, + "learning_rate": 3.4925072650276395e-05, + "loss": 0.2637, + "step": 49410 + }, + { + "epoch": 2.196737342756812, + "grad_norm": 0.15474040806293488, + "learning_rate": 3.4905367334972016e-05, + "loss": 0.2637, + "step": 49420 + }, + { + "epoch": 2.197181846468418, + "grad_norm": 0.16254118084907532, + "learning_rate": 3.488566459892752e-05, + "loss": 0.2648, + "step": 49430 + }, + { + "epoch": 2.197626350180024, + "grad_norm": 0.15956714749336243, + "learning_rate": 3.4865964445509585e-05, + "loss": 0.2603, + "step": 49440 + }, + { + "epoch": 2.19807085389163, + "grad_norm": 0.14931164681911469, + "learning_rate": 3.484626687808438e-05, + "loss": 0.2618, + "step": 49450 + }, + { + "epoch": 2.198515357603236, + "grad_norm": 0.14389725029468536, + "learning_rate": 3.4826571900017735e-05, + "loss": 0.2616, + "step": 49460 + }, + { + "epoch": 2.198959861314842, + "grad_norm": 0.1552031934261322, + "learning_rate": 3.480687951467495e-05, + "loss": 0.2648, + "step": 49470 + }, + { + "epoch": 2.199404365026448, + "grad_norm": 0.14235706627368927, + "learning_rate": 3.4787189725420925e-05, + "loss": 0.2623, + "step": 49480 + }, + { + "epoch": 2.199848868738054, + "grad_norm": 0.14717689156532288, + "learning_rate": 3.4767502535620086e-05, + "loss": 0.2628, + "step": 49490 + }, + { + "epoch": 2.20029337244966, + "grad_norm": 0.17410537600517273, + "learning_rate": 3.474781794863648e-05, + "loss": 0.2643, + "step": 49500 + }, + { + "epoch": 2.200737876161266, + "grad_norm": 0.13604748249053955, + "learning_rate": 3.472813596783363e-05, + "loss": 0.26, + "step": 49510 + }, + { + "epoch": 2.201182379872872, + "grad_norm": 0.1764792650938034, + "learning_rate": 3.470845659657466e-05, + "loss": 0.2622, + "step": 49520 + }, + { + "epoch": 2.201626883584478, + "grad_norm": 0.1737770289182663, + "learning_rate": 3.468877983822223e-05, + "loss": 0.2652, + "step": 49530 + }, + { + "epoch": 2.2020713872960838, + "grad_norm": 0.15993966162204742, + "learning_rate": 3.466910569613855e-05, + "loss": 0.2666, + "step": 49540 + }, + { + "epoch": 2.20251589100769, + "grad_norm": 0.15074725449085236, + "learning_rate": 3.464943417368542e-05, + "loss": 0.2599, + "step": 49550 + }, + { + "epoch": 2.202960394719296, + "grad_norm": 0.18257586658000946, + "learning_rate": 3.462976527422415e-05, + "loss": 0.2626, + "step": 49560 + }, + { + "epoch": 2.203404898430902, + "grad_norm": 0.12723088264465332, + "learning_rate": 3.461009900111562e-05, + "loss": 0.2652, + "step": 49570 + }, + { + "epoch": 2.2038494021425077, + "grad_norm": 0.15828000009059906, + "learning_rate": 3.459043535772023e-05, + "loss": 0.2625, + "step": 49580 + }, + { + "epoch": 2.2042939058541138, + "grad_norm": 0.15133707225322723, + "learning_rate": 3.4570774347398014e-05, + "loss": 0.2661, + "step": 49590 + }, + { + "epoch": 2.20473840956572, + "grad_norm": 0.18465185165405273, + "learning_rate": 3.4551115973508454e-05, + "loss": 0.264, + "step": 49600 + }, + { + "epoch": 2.205182913277326, + "grad_norm": 0.14860372245311737, + "learning_rate": 3.453146023941066e-05, + "loss": 0.2603, + "step": 49610 + }, + { + "epoch": 2.205627416988932, + "grad_norm": 0.13458436727523804, + "learning_rate": 3.451180714846325e-05, + "loss": 0.2656, + "step": 49620 + }, + { + "epoch": 2.2060719207005377, + "grad_norm": 0.16159887611865997, + "learning_rate": 3.449215670402438e-05, + "loss": 0.2614, + "step": 49630 + }, + { + "epoch": 2.206516424412144, + "grad_norm": 0.14222727715969086, + "learning_rate": 3.447250890945181e-05, + "loss": 0.2654, + "step": 49640 + }, + { + "epoch": 2.20696092812375, + "grad_norm": 0.13498292863368988, + "learning_rate": 3.4452863768102754e-05, + "loss": 0.2631, + "step": 49650 + }, + { + "epoch": 2.207405431835356, + "grad_norm": 0.16159941256046295, + "learning_rate": 3.443322128333409e-05, + "loss": 0.2634, + "step": 49660 + }, + { + "epoch": 2.2078499355469616, + "grad_norm": 0.15932483971118927, + "learning_rate": 3.441358145850215e-05, + "loss": 0.2636, + "step": 49670 + }, + { + "epoch": 2.2082944392585677, + "grad_norm": 0.1703672558069229, + "learning_rate": 3.439394429696286e-05, + "loss": 0.2632, + "step": 49680 + }, + { + "epoch": 2.208738942970174, + "grad_norm": 0.17087078094482422, + "learning_rate": 3.4374309802071644e-05, + "loss": 0.2617, + "step": 49690 + }, + { + "epoch": 2.20918344668178, + "grad_norm": 0.1717880219221115, + "learning_rate": 3.435467797718353e-05, + "loss": 0.2626, + "step": 49700 + }, + { + "epoch": 2.2096279503933856, + "grad_norm": 0.15656544268131256, + "learning_rate": 3.433504882565306e-05, + "loss": 0.2641, + "step": 49710 + }, + { + "epoch": 2.2100724541049916, + "grad_norm": 0.1612085998058319, + "learning_rate": 3.43154223508343e-05, + "loss": 0.2644, + "step": 49720 + }, + { + "epoch": 2.2105169578165977, + "grad_norm": 0.18765373528003693, + "learning_rate": 3.429579855608089e-05, + "loss": 0.2651, + "step": 49730 + }, + { + "epoch": 2.210961461528204, + "grad_norm": 0.1677933782339096, + "learning_rate": 3.427617744474597e-05, + "loss": 0.2632, + "step": 49740 + }, + { + "epoch": 2.21140596523981, + "grad_norm": 0.12616343796253204, + "learning_rate": 3.425655902018231e-05, + "loss": 0.2637, + "step": 49750 + }, + { + "epoch": 2.2118504689514156, + "grad_norm": 0.1846417486667633, + "learning_rate": 3.423694328574211e-05, + "loss": 0.2645, + "step": 49760 + }, + { + "epoch": 2.2122949726630217, + "grad_norm": 0.17087379097938538, + "learning_rate": 3.42173302447772e-05, + "loss": 0.2653, + "step": 49770 + }, + { + "epoch": 2.2127394763746278, + "grad_norm": 0.14701838791370392, + "learning_rate": 3.419771990063886e-05, + "loss": 0.2614, + "step": 49780 + }, + { + "epoch": 2.213183980086234, + "grad_norm": 0.17524513602256775, + "learning_rate": 3.417811225667803e-05, + "loss": 0.2651, + "step": 49790 + }, + { + "epoch": 2.21362848379784, + "grad_norm": 0.19166389107704163, + "learning_rate": 3.415850731624508e-05, + "loss": 0.2638, + "step": 49800 + }, + { + "epoch": 2.2140729875094456, + "grad_norm": 0.1620893031358719, + "learning_rate": 3.4138905082689945e-05, + "loss": 0.2652, + "step": 49810 + }, + { + "epoch": 2.2145174912210517, + "grad_norm": 0.16036994755268097, + "learning_rate": 3.4119305559362145e-05, + "loss": 0.2614, + "step": 49820 + }, + { + "epoch": 2.2149619949326578, + "grad_norm": 0.14728638529777527, + "learning_rate": 3.4099708749610684e-05, + "loss": 0.2635, + "step": 49830 + }, + { + "epoch": 2.215406498644264, + "grad_norm": 0.14152255654335022, + "learning_rate": 3.408011465678413e-05, + "loss": 0.2628, + "step": 49840 + }, + { + "epoch": 2.2158510023558695, + "grad_norm": 0.15467749536037445, + "learning_rate": 3.406052328423055e-05, + "loss": 0.2633, + "step": 49850 + }, + { + "epoch": 2.2162955060674756, + "grad_norm": 0.16392815113067627, + "learning_rate": 3.4040934635297615e-05, + "loss": 0.2638, + "step": 49860 + }, + { + "epoch": 2.2167400097790817, + "grad_norm": 0.14355020225048065, + "learning_rate": 3.4021348713332466e-05, + "loss": 0.2627, + "step": 49870 + }, + { + "epoch": 2.217184513490688, + "grad_norm": 0.15547782182693481, + "learning_rate": 3.4001765521681807e-05, + "loss": 0.2619, + "step": 49880 + }, + { + "epoch": 2.2176290172022934, + "grad_norm": 0.1706082820892334, + "learning_rate": 3.398218506369188e-05, + "loss": 0.2602, + "step": 49890 + }, + { + "epoch": 2.2180735209138995, + "grad_norm": 0.1487610936164856, + "learning_rate": 3.3962607342708404e-05, + "loss": 0.2626, + "step": 49900 + }, + { + "epoch": 2.2185180246255056, + "grad_norm": 0.14519551396369934, + "learning_rate": 3.394303236207673e-05, + "loss": 0.262, + "step": 49910 + }, + { + "epoch": 2.2189625283371117, + "grad_norm": 0.15198297798633575, + "learning_rate": 3.392346012514166e-05, + "loss": 0.2647, + "step": 49920 + }, + { + "epoch": 2.219407032048718, + "grad_norm": 0.1650969386100769, + "learning_rate": 3.390389063524757e-05, + "loss": 0.2584, + "step": 49930 + }, + { + "epoch": 2.2198515357603235, + "grad_norm": 0.17029888927936554, + "learning_rate": 3.3884323895738324e-05, + "loss": 0.2624, + "step": 49940 + }, + { + "epoch": 2.2202960394719296, + "grad_norm": 0.17458760738372803, + "learning_rate": 3.386475990995738e-05, + "loss": 0.26, + "step": 49950 + }, + { + "epoch": 2.2207405431835356, + "grad_norm": 0.17242760956287384, + "learning_rate": 3.384519868124765e-05, + "loss": 0.2647, + "step": 49960 + }, + { + "epoch": 2.2211850468951417, + "grad_norm": 0.15602459013462067, + "learning_rate": 3.3825640212951645e-05, + "loss": 0.2604, + "step": 49970 + }, + { + "epoch": 2.2216295506067474, + "grad_norm": 0.15907125174999237, + "learning_rate": 3.380608450841134e-05, + "loss": 0.263, + "step": 49980 + }, + { + "epoch": 2.2220740543183535, + "grad_norm": 0.14848533272743225, + "learning_rate": 3.3786531570968305e-05, + "loss": 0.2627, + "step": 49990 + }, + { + "epoch": 2.2225185580299596, + "grad_norm": 0.16686034202575684, + "learning_rate": 3.3766981403963584e-05, + "loss": 0.266, + "step": 50000 + }, + { + "epoch": 2.2229630617415657, + "grad_norm": 0.15366080403327942, + "learning_rate": 3.374743401073775e-05, + "loss": 0.2615, + "step": 50010 + }, + { + "epoch": 2.2234075654531713, + "grad_norm": 0.1893680989742279, + "learning_rate": 3.372788939463095e-05, + "loss": 0.263, + "step": 50020 + }, + { + "epoch": 2.2238520691647774, + "grad_norm": 0.14227691292762756, + "learning_rate": 3.370834755898281e-05, + "loss": 0.2616, + "step": 50030 + }, + { + "epoch": 2.2242965728763835, + "grad_norm": 0.15179386734962463, + "learning_rate": 3.3688808507132493e-05, + "loss": 0.2616, + "step": 50040 + }, + { + "epoch": 2.2247410765879896, + "grad_norm": 0.14526909589767456, + "learning_rate": 3.3669272242418685e-05, + "loss": 0.26, + "step": 50050 + }, + { + "epoch": 2.2251855802995957, + "grad_norm": 0.1415199637413025, + "learning_rate": 3.364973876817961e-05, + "loss": 0.2604, + "step": 50060 + }, + { + "epoch": 2.2256300840112013, + "grad_norm": 0.1662074327468872, + "learning_rate": 3.363020808775299e-05, + "loss": 0.2637, + "step": 50070 + }, + { + "epoch": 2.2260745877228074, + "grad_norm": 0.12527897953987122, + "learning_rate": 3.361068020447611e-05, + "loss": 0.2612, + "step": 50080 + }, + { + "epoch": 2.2265190914344135, + "grad_norm": 0.16575467586517334, + "learning_rate": 3.3591155121685724e-05, + "loss": 0.263, + "step": 50090 + }, + { + "epoch": 2.2269635951460196, + "grad_norm": 0.16556322574615479, + "learning_rate": 3.357163284271814e-05, + "loss": 0.263, + "step": 50100 + }, + { + "epoch": 2.2274080988576257, + "grad_norm": 0.1421976089477539, + "learning_rate": 3.355211337090919e-05, + "loss": 0.2643, + "step": 50110 + }, + { + "epoch": 2.2278526025692313, + "grad_norm": 0.13717156648635864, + "learning_rate": 3.353259670959421e-05, + "loss": 0.2603, + "step": 50120 + }, + { + "epoch": 2.2282971062808374, + "grad_norm": 0.14448270201683044, + "learning_rate": 3.351308286210808e-05, + "loss": 0.2606, + "step": 50130 + }, + { + "epoch": 2.2287416099924435, + "grad_norm": 0.139449343085289, + "learning_rate": 3.3493571831785156e-05, + "loss": 0.2633, + "step": 50140 + }, + { + "epoch": 2.2291861137040496, + "grad_norm": 0.16359247267246246, + "learning_rate": 3.347406362195936e-05, + "loss": 0.264, + "step": 50150 + }, + { + "epoch": 2.2296306174156553, + "grad_norm": 0.14808517694473267, + "learning_rate": 3.345455823596411e-05, + "loss": 0.264, + "step": 50160 + }, + { + "epoch": 2.2300751211272614, + "grad_norm": 0.13222603499889374, + "learning_rate": 3.3435055677132346e-05, + "loss": 0.2613, + "step": 50170 + }, + { + "epoch": 2.2305196248388675, + "grad_norm": 0.14974956214427948, + "learning_rate": 3.3415555948796505e-05, + "loss": 0.2639, + "step": 50180 + }, + { + "epoch": 2.2309641285504735, + "grad_norm": 0.172428160905838, + "learning_rate": 3.3396059054288556e-05, + "loss": 0.2607, + "step": 50190 + }, + { + "epoch": 2.231408632262079, + "grad_norm": 0.16570620238780975, + "learning_rate": 3.3376564996940015e-05, + "loss": 0.2644, + "step": 50200 + }, + { + "epoch": 2.2318531359736853, + "grad_norm": 0.1825970560312271, + "learning_rate": 3.3357073780081836e-05, + "loss": 0.2638, + "step": 50210 + }, + { + "epoch": 2.2322976396852914, + "grad_norm": 0.18985609710216522, + "learning_rate": 3.333758540704459e-05, + "loss": 0.2639, + "step": 50220 + }, + { + "epoch": 2.2327421433968975, + "grad_norm": 0.1583869308233261, + "learning_rate": 3.331809988115827e-05, + "loss": 0.2604, + "step": 50230 + }, + { + "epoch": 2.2331866471085036, + "grad_norm": 0.14917346835136414, + "learning_rate": 3.329861720575244e-05, + "loss": 0.2601, + "step": 50240 + }, + { + "epoch": 2.233631150820109, + "grad_norm": 0.1416877657175064, + "learning_rate": 3.3279137384156126e-05, + "loss": 0.2617, + "step": 50250 + }, + { + "epoch": 2.2340756545317153, + "grad_norm": 0.14494526386260986, + "learning_rate": 3.3259660419697934e-05, + "loss": 0.263, + "step": 50260 + }, + { + "epoch": 2.2345201582433214, + "grad_norm": 0.18271858990192413, + "learning_rate": 3.3240186315705926e-05, + "loss": 0.2637, + "step": 50270 + }, + { + "epoch": 2.2349646619549275, + "grad_norm": 0.16994144022464752, + "learning_rate": 3.322071507550769e-05, + "loss": 0.2622, + "step": 50280 + }, + { + "epoch": 2.235409165666533, + "grad_norm": 0.21519321203231812, + "learning_rate": 3.320124670243033e-05, + "loss": 0.2652, + "step": 50290 + }, + { + "epoch": 2.2358536693781392, + "grad_norm": 0.16418716311454773, + "learning_rate": 3.318178119980045e-05, + "loss": 0.2613, + "step": 50300 + }, + { + "epoch": 2.2362981730897453, + "grad_norm": 0.1367458999156952, + "learning_rate": 3.316231857094421e-05, + "loss": 0.2608, + "step": 50310 + }, + { + "epoch": 2.2367426768013514, + "grad_norm": 0.1376379281282425, + "learning_rate": 3.31428588191872e-05, + "loss": 0.2626, + "step": 50320 + }, + { + "epoch": 2.237187180512957, + "grad_norm": 0.14127574861049652, + "learning_rate": 3.312340194785458e-05, + "loss": 0.2609, + "step": 50330 + }, + { + "epoch": 2.237631684224563, + "grad_norm": 0.14917217195034027, + "learning_rate": 3.3103947960270975e-05, + "loss": 0.2613, + "step": 50340 + }, + { + "epoch": 2.2380761879361692, + "grad_norm": 0.16073234379291534, + "learning_rate": 3.308449685976058e-05, + "loss": 0.2625, + "step": 50350 + }, + { + "epoch": 2.2385206916477753, + "grad_norm": 0.16008393466472626, + "learning_rate": 3.3065048649647024e-05, + "loss": 0.2644, + "step": 50360 + }, + { + "epoch": 2.2389651953593814, + "grad_norm": 0.153523787856102, + "learning_rate": 3.304560333325348e-05, + "loss": 0.2642, + "step": 50370 + }, + { + "epoch": 2.239409699070987, + "grad_norm": 0.15616478025913239, + "learning_rate": 3.3026160913902624e-05, + "loss": 0.2627, + "step": 50380 + }, + { + "epoch": 2.239854202782593, + "grad_norm": 0.17544874548912048, + "learning_rate": 3.300672139491662e-05, + "loss": 0.2627, + "step": 50390 + }, + { + "epoch": 2.2402987064941993, + "grad_norm": 0.16394196450710297, + "learning_rate": 3.298728477961717e-05, + "loss": 0.2631, + "step": 50400 + }, + { + "epoch": 2.2407432102058054, + "grad_norm": 0.13389413058757782, + "learning_rate": 3.296785107132544e-05, + "loss": 0.2606, + "step": 50410 + }, + { + "epoch": 2.241187713917411, + "grad_norm": 0.1654493808746338, + "learning_rate": 3.2948420273362156e-05, + "loss": 0.2622, + "step": 50420 + }, + { + "epoch": 2.241632217629017, + "grad_norm": 0.1577543318271637, + "learning_rate": 3.292899238904747e-05, + "loss": 0.2639, + "step": 50430 + }, + { + "epoch": 2.242076721340623, + "grad_norm": 0.1318909227848053, + "learning_rate": 3.29095674217011e-05, + "loss": 0.2603, + "step": 50440 + }, + { + "epoch": 2.2425212250522293, + "grad_norm": 0.2313169538974762, + "learning_rate": 3.289014537464224e-05, + "loss": 0.2625, + "step": 50450 + }, + { + "epoch": 2.2429657287638354, + "grad_norm": 0.14694710075855255, + "learning_rate": 3.287072625118955e-05, + "loss": 0.2626, + "step": 50460 + }, + { + "epoch": 2.243410232475441, + "grad_norm": 0.13913962244987488, + "learning_rate": 3.285131005466129e-05, + "loss": 0.2585, + "step": 50470 + }, + { + "epoch": 2.243854736187047, + "grad_norm": 0.1651725023984909, + "learning_rate": 3.2831896788375105e-05, + "loss": 0.2628, + "step": 50480 + }, + { + "epoch": 2.244299239898653, + "grad_norm": 0.155464768409729, + "learning_rate": 3.281248645564822e-05, + "loss": 0.2614, + "step": 50490 + }, + { + "epoch": 2.2447437436102593, + "grad_norm": 0.17873680591583252, + "learning_rate": 3.2793079059797306e-05, + "loss": 0.2604, + "step": 50500 + }, + { + "epoch": 2.245188247321865, + "grad_norm": 0.13089855015277863, + "learning_rate": 3.277367460413859e-05, + "loss": 0.2621, + "step": 50510 + }, + { + "epoch": 2.245632751033471, + "grad_norm": 0.1626618504524231, + "learning_rate": 3.275427309198773e-05, + "loss": 0.2585, + "step": 50520 + }, + { + "epoch": 2.246077254745077, + "grad_norm": 0.13488446176052094, + "learning_rate": 3.273487452665993e-05, + "loss": 0.2639, + "step": 50530 + }, + { + "epoch": 2.2465217584566832, + "grad_norm": 0.1378413736820221, + "learning_rate": 3.271547891146986e-05, + "loss": 0.2636, + "step": 50540 + }, + { + "epoch": 2.2469662621682893, + "grad_norm": 0.13790328800678253, + "learning_rate": 3.269608624973173e-05, + "loss": 0.2586, + "step": 50550 + }, + { + "epoch": 2.247410765879895, + "grad_norm": 0.1626182198524475, + "learning_rate": 3.26766965447592e-05, + "loss": 0.2609, + "step": 50560 + }, + { + "epoch": 2.247855269591501, + "grad_norm": 0.16445548832416534, + "learning_rate": 3.2657309799865424e-05, + "loss": 0.2642, + "step": 50570 + }, + { + "epoch": 2.248299773303107, + "grad_norm": 0.1557942032814026, + "learning_rate": 3.2637926018363084e-05, + "loss": 0.2629, + "step": 50580 + }, + { + "epoch": 2.2487442770147132, + "grad_norm": 0.19623830914497375, + "learning_rate": 3.261854520356432e-05, + "loss": 0.2618, + "step": 50590 + }, + { + "epoch": 2.249188780726319, + "grad_norm": 0.15036088228225708, + "learning_rate": 3.2599167358780816e-05, + "loss": 0.2613, + "step": 50600 + }, + { + "epoch": 2.249633284437925, + "grad_norm": 0.1517370194196701, + "learning_rate": 3.2579792487323676e-05, + "loss": 0.2613, + "step": 50610 + }, + { + "epoch": 2.250077788149531, + "grad_norm": 0.15119722485542297, + "learning_rate": 3.256042059250358e-05, + "loss": 0.2632, + "step": 50620 + }, + { + "epoch": 2.250522291861137, + "grad_norm": 0.1529923677444458, + "learning_rate": 3.254105167763062e-05, + "loss": 0.2634, + "step": 50630 + }, + { + "epoch": 2.250966795572743, + "grad_norm": 0.23029085993766785, + "learning_rate": 3.252168574601443e-05, + "loss": 0.2632, + "step": 50640 + }, + { + "epoch": 2.251411299284349, + "grad_norm": 0.18478645384311676, + "learning_rate": 3.2502322800964115e-05, + "loss": 0.2637, + "step": 50650 + }, + { + "epoch": 2.251855802995955, + "grad_norm": 0.1561727076768875, + "learning_rate": 3.248296284578825e-05, + "loss": 0.2637, + "step": 50660 + }, + { + "epoch": 2.252300306707561, + "grad_norm": 0.16732394695281982, + "learning_rate": 3.246360588379497e-05, + "loss": 0.2622, + "step": 50670 + }, + { + "epoch": 2.252744810419167, + "grad_norm": 0.1337866634130478, + "learning_rate": 3.24442519182918e-05, + "loss": 0.2612, + "step": 50680 + }, + { + "epoch": 2.253189314130773, + "grad_norm": 0.17193689942359924, + "learning_rate": 3.2424900952585845e-05, + "loss": 0.263, + "step": 50690 + }, + { + "epoch": 2.253633817842379, + "grad_norm": 0.1407705694437027, + "learning_rate": 3.240555298998362e-05, + "loss": 0.2622, + "step": 50700 + }, + { + "epoch": 2.254078321553985, + "grad_norm": 0.15199410915374756, + "learning_rate": 3.2386208033791194e-05, + "loss": 0.2608, + "step": 50710 + }, + { + "epoch": 2.254522825265591, + "grad_norm": 0.13580016791820526, + "learning_rate": 3.2366866087314064e-05, + "loss": 0.2628, + "step": 50720 + }, + { + "epoch": 2.254967328977197, + "grad_norm": 0.12823836505413055, + "learning_rate": 3.234752715385727e-05, + "loss": 0.2626, + "step": 50730 + }, + { + "epoch": 2.255411832688803, + "grad_norm": 0.14807887375354767, + "learning_rate": 3.232819123672529e-05, + "loss": 0.2652, + "step": 50740 + }, + { + "epoch": 2.255856336400409, + "grad_norm": 0.18572671711444855, + "learning_rate": 3.2308858339222076e-05, + "loss": 0.2658, + "step": 50750 + }, + { + "epoch": 2.256300840112015, + "grad_norm": 0.17317648231983185, + "learning_rate": 3.2289528464651144e-05, + "loss": 0.2593, + "step": 50760 + }, + { + "epoch": 2.2567453438236207, + "grad_norm": 0.13822491466999054, + "learning_rate": 3.22702016163154e-05, + "loss": 0.2615, + "step": 50770 + }, + { + "epoch": 2.257189847535227, + "grad_norm": 0.13925740122795105, + "learning_rate": 3.2250877797517295e-05, + "loss": 0.2629, + "step": 50780 + }, + { + "epoch": 2.257634351246833, + "grad_norm": 0.13734810054302216, + "learning_rate": 3.223155701155872e-05, + "loss": 0.2618, + "step": 50790 + }, + { + "epoch": 2.258078854958439, + "grad_norm": 0.145889550447464, + "learning_rate": 3.2212239261741105e-05, + "loss": 0.2596, + "step": 50800 + }, + { + "epoch": 2.258523358670045, + "grad_norm": 0.15657521784305573, + "learning_rate": 3.219292455136528e-05, + "loss": 0.2596, + "step": 50810 + }, + { + "epoch": 2.2589678623816507, + "grad_norm": 0.1582016944885254, + "learning_rate": 3.217361288373165e-05, + "loss": 0.261, + "step": 50820 + }, + { + "epoch": 2.259412366093257, + "grad_norm": 0.16315673291683197, + "learning_rate": 3.215430426214002e-05, + "loss": 0.263, + "step": 50830 + }, + { + "epoch": 2.259856869804863, + "grad_norm": 0.16632674634456635, + "learning_rate": 3.2134998689889705e-05, + "loss": 0.2633, + "step": 50840 + }, + { + "epoch": 2.260301373516469, + "grad_norm": 0.1564113199710846, + "learning_rate": 3.211569617027952e-05, + "loss": 0.2656, + "step": 50850 + }, + { + "epoch": 2.260745877228075, + "grad_norm": 0.14844021201133728, + "learning_rate": 3.209639670660769e-05, + "loss": 0.263, + "step": 50860 + }, + { + "epoch": 2.2611903809396807, + "grad_norm": 0.16505934298038483, + "learning_rate": 3.207710030217202e-05, + "loss": 0.2616, + "step": 50870 + }, + { + "epoch": 2.261634884651287, + "grad_norm": 0.14357319474220276, + "learning_rate": 3.205780696026972e-05, + "loss": 0.2623, + "step": 50880 + }, + { + "epoch": 2.262079388362893, + "grad_norm": 0.1497640162706375, + "learning_rate": 3.203851668419749e-05, + "loss": 0.2628, + "step": 50890 + }, + { + "epoch": 2.262523892074499, + "grad_norm": 0.13695427775382996, + "learning_rate": 3.201922947725149e-05, + "loss": 0.2629, + "step": 50900 + }, + { + "epoch": 2.2629683957861046, + "grad_norm": 0.1823071539402008, + "learning_rate": 3.199994534272742e-05, + "loss": 0.2622, + "step": 50910 + }, + { + "epoch": 2.2634128994977107, + "grad_norm": 0.14995963871479034, + "learning_rate": 3.1980664283920394e-05, + "loss": 0.26, + "step": 50920 + }, + { + "epoch": 2.263857403209317, + "grad_norm": 0.16297921538352966, + "learning_rate": 3.196138630412499e-05, + "loss": 0.2653, + "step": 50930 + }, + { + "epoch": 2.264301906920923, + "grad_norm": 0.1603098064661026, + "learning_rate": 3.194211140663532e-05, + "loss": 0.2618, + "step": 50940 + }, + { + "epoch": 2.2647464106325286, + "grad_norm": 0.1605639010667801, + "learning_rate": 3.1922839594744914e-05, + "loss": 0.2634, + "step": 50950 + }, + { + "epoch": 2.2651909143441347, + "grad_norm": 0.16644108295440674, + "learning_rate": 3.190357087174683e-05, + "loss": 0.2615, + "step": 50960 + }, + { + "epoch": 2.2656354180557408, + "grad_norm": 0.13704563677310944, + "learning_rate": 3.188430524093353e-05, + "loss": 0.2604, + "step": 50970 + }, + { + "epoch": 2.266079921767347, + "grad_norm": 0.19929461181163788, + "learning_rate": 3.1865042705597014e-05, + "loss": 0.2629, + "step": 50980 + }, + { + "epoch": 2.266524425478953, + "grad_norm": 0.1726844310760498, + "learning_rate": 3.184578326902868e-05, + "loss": 0.2654, + "step": 50990 + }, + { + "epoch": 2.2669689291905586, + "grad_norm": 0.15695275366306305, + "learning_rate": 3.182652693451949e-05, + "loss": 0.2634, + "step": 51000 + }, + { + "epoch": 2.2674134329021647, + "grad_norm": 0.17284932732582092, + "learning_rate": 3.180727370535981e-05, + "loss": 0.2584, + "step": 51010 + }, + { + "epoch": 2.2678579366137708, + "grad_norm": 0.14414629340171814, + "learning_rate": 3.178802358483945e-05, + "loss": 0.263, + "step": 51020 + }, + { + "epoch": 2.268302440325377, + "grad_norm": 0.17139683663845062, + "learning_rate": 3.17687765762478e-05, + "loss": 0.2628, + "step": 51030 + }, + { + "epoch": 2.2687469440369825, + "grad_norm": 0.1543828397989273, + "learning_rate": 3.174953268287357e-05, + "loss": 0.2627, + "step": 51040 + }, + { + "epoch": 2.2691914477485886, + "grad_norm": 0.2140233963727951, + "learning_rate": 3.173029190800508e-05, + "loss": 0.263, + "step": 51050 + }, + { + "epoch": 2.2696359514601947, + "grad_norm": 0.13899436593055725, + "learning_rate": 3.171105425493e-05, + "loss": 0.2606, + "step": 51060 + }, + { + "epoch": 2.270080455171801, + "grad_norm": 0.16093982756137848, + "learning_rate": 3.169181972693557e-05, + "loss": 0.266, + "step": 51070 + }, + { + "epoch": 2.2705249588834064, + "grad_norm": 0.16864007711410522, + "learning_rate": 3.1672588327308404e-05, + "loss": 0.2628, + "step": 51080 + }, + { + "epoch": 2.2709694625950125, + "grad_norm": 0.14266347885131836, + "learning_rate": 3.1653360059334646e-05, + "loss": 0.2653, + "step": 51090 + }, + { + "epoch": 2.2714139663066186, + "grad_norm": 0.15725557506084442, + "learning_rate": 3.163413492629985e-05, + "loss": 0.2609, + "step": 51100 + }, + { + "epoch": 2.2718584700182247, + "grad_norm": 0.15951299667358398, + "learning_rate": 3.161491293148912e-05, + "loss": 0.2639, + "step": 51110 + }, + { + "epoch": 2.272302973729831, + "grad_norm": 0.1868618130683899, + "learning_rate": 3.1595694078186925e-05, + "loss": 0.2629, + "step": 51120 + }, + { + "epoch": 2.2727474774414365, + "grad_norm": 0.16473442316055298, + "learning_rate": 3.157647836967724e-05, + "loss": 0.2616, + "step": 51130 + }, + { + "epoch": 2.2731919811530426, + "grad_norm": 0.13086606562137604, + "learning_rate": 3.155726580924354e-05, + "loss": 0.2618, + "step": 51140 + }, + { + "epoch": 2.2736364848646486, + "grad_norm": 0.1541503667831421, + "learning_rate": 3.1538056400168676e-05, + "loss": 0.2632, + "step": 51150 + }, + { + "epoch": 2.2740809885762547, + "grad_norm": 0.1673114001750946, + "learning_rate": 3.151885014573506e-05, + "loss": 0.2605, + "step": 51160 + }, + { + "epoch": 2.274525492287861, + "grad_norm": 0.1620934009552002, + "learning_rate": 3.149964704922448e-05, + "loss": 0.2642, + "step": 51170 + }, + { + "epoch": 2.2749699959994665, + "grad_norm": 0.15758676826953888, + "learning_rate": 3.148044711391825e-05, + "loss": 0.2606, + "step": 51180 + }, + { + "epoch": 2.2754144997110726, + "grad_norm": 0.17408959567546844, + "learning_rate": 3.146125034309707e-05, + "loss": 0.2625, + "step": 51190 + }, + { + "epoch": 2.2758590034226787, + "grad_norm": 0.14899204671382904, + "learning_rate": 3.1442056740041195e-05, + "loss": 0.2633, + "step": 51200 + }, + { + "epoch": 2.2763035071342848, + "grad_norm": 0.12844981253147125, + "learning_rate": 3.142286630803026e-05, + "loss": 0.2624, + "step": 51210 + }, + { + "epoch": 2.2767480108458904, + "grad_norm": 0.18979939818382263, + "learning_rate": 3.140367905034337e-05, + "loss": 0.262, + "step": 51220 + }, + { + "epoch": 2.2771925145574965, + "grad_norm": 0.17148561775684357, + "learning_rate": 3.138449497025914e-05, + "loss": 0.2601, + "step": 51230 + }, + { + "epoch": 2.2776370182691026, + "grad_norm": 0.15749257802963257, + "learning_rate": 3.136531407105557e-05, + "loss": 0.2597, + "step": 51240 + }, + { + "epoch": 2.2780815219807087, + "grad_norm": 0.17253589630126953, + "learning_rate": 3.1346136356010184e-05, + "loss": 0.2639, + "step": 51250 + }, + { + "epoch": 2.2785260256923143, + "grad_norm": 0.17722001671791077, + "learning_rate": 3.132696182839988e-05, + "loss": 0.2643, + "step": 51260 + }, + { + "epoch": 2.2789705294039204, + "grad_norm": 0.17601241171360016, + "learning_rate": 3.130779049150111e-05, + "loss": 0.2626, + "step": 51270 + }, + { + "epoch": 2.2794150331155265, + "grad_norm": 0.1538058966398239, + "learning_rate": 3.128862234858971e-05, + "loss": 0.264, + "step": 51280 + }, + { + "epoch": 2.2798595368271326, + "grad_norm": 0.15370668470859528, + "learning_rate": 3.1269457402941005e-05, + "loss": 0.2603, + "step": 51290 + }, + { + "epoch": 2.2803040405387387, + "grad_norm": 0.15571735799312592, + "learning_rate": 3.125029565782974e-05, + "loss": 0.2612, + "step": 51300 + }, + { + "epoch": 2.2807485442503443, + "grad_norm": 0.14383886754512787, + "learning_rate": 3.123113711653013e-05, + "loss": 0.2591, + "step": 51310 + }, + { + "epoch": 2.2811930479619504, + "grad_norm": 0.14391984045505524, + "learning_rate": 3.121198178231587e-05, + "loss": 0.2626, + "step": 51320 + }, + { + "epoch": 2.2816375516735565, + "grad_norm": 0.1397150158882141, + "learning_rate": 3.119282965846006e-05, + "loss": 0.2618, + "step": 51330 + }, + { + "epoch": 2.2820820553851626, + "grad_norm": 0.1371311992406845, + "learning_rate": 3.1173680748235303e-05, + "loss": 0.2612, + "step": 51340 + }, + { + "epoch": 2.2825265590967683, + "grad_norm": 0.1633937805891037, + "learning_rate": 3.115453505491358e-05, + "loss": 0.2641, + "step": 51350 + }, + { + "epoch": 2.2829710628083744, + "grad_norm": 0.1508709043264389, + "learning_rate": 3.1135392581766404e-05, + "loss": 0.2619, + "step": 51360 + }, + { + "epoch": 2.2834155665199805, + "grad_norm": 0.16219961643218994, + "learning_rate": 3.111625333206467e-05, + "loss": 0.26, + "step": 51370 + }, + { + "epoch": 2.2838600702315865, + "grad_norm": 0.17235681414604187, + "learning_rate": 3.109711730907881e-05, + "loss": 0.261, + "step": 51380 + }, + { + "epoch": 2.284304573943192, + "grad_norm": 0.14026224613189697, + "learning_rate": 3.107798451607859e-05, + "loss": 0.2623, + "step": 51390 + }, + { + "epoch": 2.2847490776547983, + "grad_norm": 0.14392946660518646, + "learning_rate": 3.105885495633328e-05, + "loss": 0.2624, + "step": 51400 + }, + { + "epoch": 2.2851935813664044, + "grad_norm": 0.13828250765800476, + "learning_rate": 3.103972863311163e-05, + "loss": 0.2616, + "step": 51410 + }, + { + "epoch": 2.2856380850780105, + "grad_norm": 0.12535294890403748, + "learning_rate": 3.102060554968178e-05, + "loss": 0.2637, + "step": 51420 + }, + { + "epoch": 2.2860825887896166, + "grad_norm": 0.144496887922287, + "learning_rate": 3.100148570931137e-05, + "loss": 0.2625, + "step": 51430 + }, + { + "epoch": 2.286527092501222, + "grad_norm": 0.1532856673002243, + "learning_rate": 3.098236911526744e-05, + "loss": 0.2631, + "step": 51440 + }, + { + "epoch": 2.2869715962128283, + "grad_norm": 0.142940491437912, + "learning_rate": 3.0963255770816495e-05, + "loss": 0.2612, + "step": 51450 + }, + { + "epoch": 2.2874160999244344, + "grad_norm": 0.1554553061723709, + "learning_rate": 3.094414567922447e-05, + "loss": 0.2609, + "step": 51460 + }, + { + "epoch": 2.2878606036360405, + "grad_norm": 0.1491139531135559, + "learning_rate": 3.092503884375678e-05, + "loss": 0.2601, + "step": 51470 + }, + { + "epoch": 2.2883051073476466, + "grad_norm": 0.16095131635665894, + "learning_rate": 3.0905935267678255e-05, + "loss": 0.2632, + "step": 51480 + }, + { + "epoch": 2.2887496110592522, + "grad_norm": 0.17634353041648865, + "learning_rate": 3.0886834954253166e-05, + "loss": 0.2634, + "step": 51490 + }, + { + "epoch": 2.2891941147708583, + "grad_norm": 0.15755097568035126, + "learning_rate": 3.086773790674524e-05, + "loss": 0.259, + "step": 51500 + }, + { + "epoch": 2.2896386184824644, + "grad_norm": 0.14894483983516693, + "learning_rate": 3.0848644128417617e-05, + "loss": 0.2623, + "step": 51510 + }, + { + "epoch": 2.2900831221940705, + "grad_norm": 0.28462958335876465, + "learning_rate": 3.082955362253294e-05, + "loss": 0.2659, + "step": 51520 + }, + { + "epoch": 2.290527625905676, + "grad_norm": 0.14685820043087006, + "learning_rate": 3.081046639235323e-05, + "loss": 0.2615, + "step": 51530 + }, + { + "epoch": 2.2909721296172822, + "grad_norm": 0.1436007171869278, + "learning_rate": 3.0791382441139986e-05, + "loss": 0.2625, + "step": 51540 + }, + { + "epoch": 2.2914166333288883, + "grad_norm": 0.12698771059513092, + "learning_rate": 3.077230177215411e-05, + "loss": 0.2594, + "step": 51550 + }, + { + "epoch": 2.2918611370404944, + "grad_norm": 0.14053431153297424, + "learning_rate": 3.0753224388656e-05, + "loss": 0.2623, + "step": 51560 + }, + { + "epoch": 2.2923056407521, + "grad_norm": 0.14195317029953003, + "learning_rate": 3.073415029390544e-05, + "loss": 0.2594, + "step": 51570 + }, + { + "epoch": 2.292750144463706, + "grad_norm": 0.16142666339874268, + "learning_rate": 3.071507949116166e-05, + "loss": 0.2669, + "step": 51580 + }, + { + "epoch": 2.2931946481753123, + "grad_norm": 0.14564509689807892, + "learning_rate": 3.069601198368337e-05, + "loss": 0.2635, + "step": 51590 + }, + { + "epoch": 2.2936391518869184, + "grad_norm": 0.13885536789894104, + "learning_rate": 3.067694777472864e-05, + "loss": 0.2609, + "step": 51600 + }, + { + "epoch": 2.2940836555985245, + "grad_norm": 0.3177100121974945, + "learning_rate": 3.065788686755508e-05, + "loss": 0.261, + "step": 51610 + }, + { + "epoch": 2.29452815931013, + "grad_norm": 0.1647438108921051, + "learning_rate": 3.063882926541961e-05, + "loss": 0.2616, + "step": 51620 + }, + { + "epoch": 2.294972663021736, + "grad_norm": 0.13878120481967926, + "learning_rate": 3.061977497157872e-05, + "loss": 0.2612, + "step": 51630 + }, + { + "epoch": 2.2954171667333423, + "grad_norm": 0.14496605098247528, + "learning_rate": 3.0600723989288224e-05, + "loss": 0.2602, + "step": 51640 + }, + { + "epoch": 2.2958616704449484, + "grad_norm": 0.14441511034965515, + "learning_rate": 3.058167632180343e-05, + "loss": 0.2663, + "step": 51650 + }, + { + "epoch": 2.296306174156554, + "grad_norm": 0.14868059754371643, + "learning_rate": 3.0562631972379074e-05, + "loss": 0.2596, + "step": 51660 + }, + { + "epoch": 2.29675067786816, + "grad_norm": 0.14633828401565552, + "learning_rate": 3.0543590944269276e-05, + "loss": 0.2619, + "step": 51670 + }, + { + "epoch": 2.297195181579766, + "grad_norm": 0.13543203473091125, + "learning_rate": 3.052455324072766e-05, + "loss": 0.2631, + "step": 51680 + }, + { + "epoch": 2.2976396852913723, + "grad_norm": 0.15132296085357666, + "learning_rate": 3.0505518865007244e-05, + "loss": 0.2607, + "step": 51690 + }, + { + "epoch": 2.298084189002978, + "grad_norm": 0.15530887246131897, + "learning_rate": 3.048648782036048e-05, + "loss": 0.264, + "step": 51700 + }, + { + "epoch": 2.298528692714584, + "grad_norm": 0.16657571494579315, + "learning_rate": 3.0467460110039236e-05, + "loss": 0.2613, + "step": 51710 + }, + { + "epoch": 2.29897319642619, + "grad_norm": 0.15608568489551544, + "learning_rate": 3.0448435737294866e-05, + "loss": 0.2636, + "step": 51720 + }, + { + "epoch": 2.2994177001377962, + "grad_norm": 0.15392889082431793, + "learning_rate": 3.042941470537808e-05, + "loss": 0.2618, + "step": 51730 + }, + { + "epoch": 2.2998622038494023, + "grad_norm": 0.17162901163101196, + "learning_rate": 3.0410397017539072e-05, + "loss": 0.2612, + "step": 51740 + }, + { + "epoch": 2.300306707561008, + "grad_norm": 0.1765737682580948, + "learning_rate": 3.039138267702742e-05, + "loss": 0.2611, + "step": 51750 + }, + { + "epoch": 2.300751211272614, + "grad_norm": 0.16359937191009521, + "learning_rate": 3.0372371687092193e-05, + "loss": 0.2621, + "step": 51760 + }, + { + "epoch": 2.30119571498422, + "grad_norm": 0.1450691670179367, + "learning_rate": 3.0353364050981823e-05, + "loss": 0.2608, + "step": 51770 + }, + { + "epoch": 2.3016402186958262, + "grad_norm": 0.18010249733924866, + "learning_rate": 3.033435977194418e-05, + "loss": 0.2621, + "step": 51780 + }, + { + "epoch": 2.3020847224074323, + "grad_norm": 0.1473066210746765, + "learning_rate": 3.0315358853226618e-05, + "loss": 0.2628, + "step": 51790 + }, + { + "epoch": 2.302529226119038, + "grad_norm": 0.15066495537757874, + "learning_rate": 3.0296361298075844e-05, + "loss": 0.2626, + "step": 51800 + }, + { + "epoch": 2.302973729830644, + "grad_norm": 0.14342178404331207, + "learning_rate": 3.027736710973803e-05, + "loss": 0.2604, + "step": 51810 + }, + { + "epoch": 2.30341823354225, + "grad_norm": 0.15785427391529083, + "learning_rate": 3.025837629145875e-05, + "loss": 0.2623, + "step": 51820 + }, + { + "epoch": 2.3038627372538563, + "grad_norm": 0.13816314935684204, + "learning_rate": 3.0239388846483048e-05, + "loss": 0.2601, + "step": 51830 + }, + { + "epoch": 2.304307240965462, + "grad_norm": 0.1528976410627365, + "learning_rate": 3.022040477805532e-05, + "loss": 0.262, + "step": 51840 + }, + { + "epoch": 2.304751744677068, + "grad_norm": 0.1542656272649765, + "learning_rate": 3.020142408941946e-05, + "loss": 0.2618, + "step": 51850 + }, + { + "epoch": 2.305196248388674, + "grad_norm": 0.16868646442890167, + "learning_rate": 3.018244678381873e-05, + "loss": 0.2648, + "step": 51860 + }, + { + "epoch": 2.30564075210028, + "grad_norm": 0.1437636762857437, + "learning_rate": 3.0163472864495812e-05, + "loss": 0.263, + "step": 51870 + }, + { + "epoch": 2.306085255811886, + "grad_norm": 0.1207614615559578, + "learning_rate": 3.014450233469287e-05, + "loss": 0.2618, + "step": 51880 + }, + { + "epoch": 2.306529759523492, + "grad_norm": 0.1512777954339981, + "learning_rate": 3.0125535197651422e-05, + "loss": 0.261, + "step": 51890 + }, + { + "epoch": 2.306974263235098, + "grad_norm": 0.13559401035308838, + "learning_rate": 3.0106571456612448e-05, + "loss": 0.2602, + "step": 51900 + }, + { + "epoch": 2.307418766946704, + "grad_norm": 0.1320648044347763, + "learning_rate": 3.0087611114816305e-05, + "loss": 0.2622, + "step": 51910 + }, + { + "epoch": 2.30786327065831, + "grad_norm": 0.12921079993247986, + "learning_rate": 3.006865417550284e-05, + "loss": 0.2629, + "step": 51920 + }, + { + "epoch": 2.308307774369916, + "grad_norm": 0.19750317931175232, + "learning_rate": 3.0049700641911242e-05, + "loss": 0.2612, + "step": 51930 + }, + { + "epoch": 2.308752278081522, + "grad_norm": 0.18909130990505219, + "learning_rate": 3.0030750517280175e-05, + "loss": 0.2608, + "step": 51940 + }, + { + "epoch": 2.309196781793128, + "grad_norm": 0.1586609035730362, + "learning_rate": 3.0011803804847682e-05, + "loss": 0.2644, + "step": 51950 + }, + { + "epoch": 2.309641285504734, + "grad_norm": 0.15259015560150146, + "learning_rate": 2.9992860507851224e-05, + "loss": 0.2615, + "step": 51960 + }, + { + "epoch": 2.31008578921634, + "grad_norm": 0.15608026087284088, + "learning_rate": 2.9973920629527723e-05, + "loss": 0.2611, + "step": 51970 + }, + { + "epoch": 2.310530292927946, + "grad_norm": 0.16334061324596405, + "learning_rate": 2.9954984173113453e-05, + "loss": 0.2626, + "step": 51980 + }, + { + "epoch": 2.310974796639552, + "grad_norm": 0.15105850994586945, + "learning_rate": 2.993605114184418e-05, + "loss": 0.26, + "step": 51990 + }, + { + "epoch": 2.311419300351158, + "grad_norm": 0.14352446794509888, + "learning_rate": 2.9917121538955005e-05, + "loss": 0.2594, + "step": 52000 + }, + { + "epoch": 2.3118638040627637, + "grad_norm": 0.17756007611751556, + "learning_rate": 2.98981953676805e-05, + "loss": 0.26, + "step": 52010 + }, + { + "epoch": 2.31230830777437, + "grad_norm": 0.15179599821567535, + "learning_rate": 2.9879272631254594e-05, + "loss": 0.2617, + "step": 52020 + }, + { + "epoch": 2.312752811485976, + "grad_norm": 0.16184423863887787, + "learning_rate": 2.986035333291073e-05, + "loss": 0.2599, + "step": 52030 + }, + { + "epoch": 2.313197315197582, + "grad_norm": 0.15981246531009674, + "learning_rate": 2.9841437475881646e-05, + "loss": 0.2587, + "step": 52040 + }, + { + "epoch": 2.313641818909188, + "grad_norm": 0.1752709299325943, + "learning_rate": 2.9822525063399555e-05, + "loss": 0.2617, + "step": 52050 + }, + { + "epoch": 2.3140863226207937, + "grad_norm": 0.15644998848438263, + "learning_rate": 2.9803616098696087e-05, + "loss": 0.2623, + "step": 52060 + }, + { + "epoch": 2.3145308263324, + "grad_norm": 0.15342184901237488, + "learning_rate": 2.9784710585002228e-05, + "loss": 0.2628, + "step": 52070 + }, + { + "epoch": 2.314975330044006, + "grad_norm": 0.17568007111549377, + "learning_rate": 2.9765808525548467e-05, + "loss": 0.2623, + "step": 52080 + }, + { + "epoch": 2.315419833755612, + "grad_norm": 0.1499946564435959, + "learning_rate": 2.974690992356461e-05, + "loss": 0.2635, + "step": 52090 + }, + { + "epoch": 2.315864337467218, + "grad_norm": 0.1509225219488144, + "learning_rate": 2.9728014782279934e-05, + "loss": 0.2618, + "step": 52100 + }, + { + "epoch": 2.3163088411788237, + "grad_norm": 0.12449293583631516, + "learning_rate": 2.970912310492307e-05, + "loss": 0.2631, + "step": 52110 + }, + { + "epoch": 2.31675334489043, + "grad_norm": 0.20337019860744476, + "learning_rate": 2.9690234894722134e-05, + "loss": 0.262, + "step": 52120 + }, + { + "epoch": 2.317197848602036, + "grad_norm": 0.1393771916627884, + "learning_rate": 2.9671350154904577e-05, + "loss": 0.2625, + "step": 52130 + }, + { + "epoch": 2.3176423523136416, + "grad_norm": 0.15087619423866272, + "learning_rate": 2.9652468888697282e-05, + "loss": 0.2633, + "step": 52140 + }, + { + "epoch": 2.3180868560252477, + "grad_norm": 0.2085593044757843, + "learning_rate": 2.9633591099326562e-05, + "loss": 0.2643, + "step": 52150 + }, + { + "epoch": 2.3185313597368538, + "grad_norm": 0.12893803417682648, + "learning_rate": 2.9614716790018072e-05, + "loss": 0.2615, + "step": 52160 + }, + { + "epoch": 2.31897586344846, + "grad_norm": 0.14626342058181763, + "learning_rate": 2.959584596399697e-05, + "loss": 0.2602, + "step": 52170 + }, + { + "epoch": 2.319420367160066, + "grad_norm": 0.15243522822856903, + "learning_rate": 2.9576978624487717e-05, + "loss": 0.2611, + "step": 52180 + }, + { + "epoch": 2.3198648708716716, + "grad_norm": 0.14621992409229279, + "learning_rate": 2.9558114774714263e-05, + "loss": 0.2588, + "step": 52190 + }, + { + "epoch": 2.3203093745832777, + "grad_norm": 0.14036571979522705, + "learning_rate": 2.9539254417899897e-05, + "loss": 0.2586, + "step": 52200 + }, + { + "epoch": 2.3207538782948838, + "grad_norm": 0.14184188842773438, + "learning_rate": 2.9520397557267365e-05, + "loss": 0.2622, + "step": 52210 + }, + { + "epoch": 2.32119838200649, + "grad_norm": 0.14986661076545715, + "learning_rate": 2.9501544196038765e-05, + "loss": 0.2606, + "step": 52220 + }, + { + "epoch": 2.321642885718096, + "grad_norm": 0.1574765145778656, + "learning_rate": 2.9482694337435614e-05, + "loss": 0.2619, + "step": 52230 + }, + { + "epoch": 2.3220873894297016, + "grad_norm": 0.16670899093151093, + "learning_rate": 2.946384798467887e-05, + "loss": 0.263, + "step": 52240 + }, + { + "epoch": 2.3225318931413077, + "grad_norm": 0.1639973521232605, + "learning_rate": 2.9445005140988824e-05, + "loss": 0.2611, + "step": 52250 + }, + { + "epoch": 2.322976396852914, + "grad_norm": 0.15364013612270355, + "learning_rate": 2.942616580958524e-05, + "loss": 0.2597, + "step": 52260 + }, + { + "epoch": 2.32342090056452, + "grad_norm": 0.1414533108472824, + "learning_rate": 2.9407329993687193e-05, + "loss": 0.2592, + "step": 52270 + }, + { + "epoch": 2.3238654042761255, + "grad_norm": 0.17511491477489471, + "learning_rate": 2.938849769651326e-05, + "loss": 0.2607, + "step": 52280 + }, + { + "epoch": 2.3243099079877316, + "grad_norm": 0.14424435794353485, + "learning_rate": 2.9369668921281345e-05, + "loss": 0.2598, + "step": 52290 + }, + { + "epoch": 2.3247544116993377, + "grad_norm": 0.1696276217699051, + "learning_rate": 2.9350843671208773e-05, + "loss": 0.2624, + "step": 52300 + }, + { + "epoch": 2.325198915410944, + "grad_norm": 0.1561291515827179, + "learning_rate": 2.9332021949512245e-05, + "loss": 0.2625, + "step": 52310 + }, + { + "epoch": 2.3256434191225495, + "grad_norm": 0.1772870570421219, + "learning_rate": 2.9313203759407908e-05, + "loss": 0.2633, + "step": 52320 + }, + { + "epoch": 2.3260879228341556, + "grad_norm": 0.16022565960884094, + "learning_rate": 2.929438910411127e-05, + "loss": 0.2606, + "step": 52330 + }, + { + "epoch": 2.3265324265457616, + "grad_norm": 0.15671633183956146, + "learning_rate": 2.927557798683722e-05, + "loss": 0.2603, + "step": 52340 + }, + { + "epoch": 2.3269769302573677, + "grad_norm": 0.15006159245967865, + "learning_rate": 2.9256770410800095e-05, + "loss": 0.2578, + "step": 52350 + }, + { + "epoch": 2.327421433968974, + "grad_norm": 0.14456050097942352, + "learning_rate": 2.9237966379213554e-05, + "loss": 0.2614, + "step": 52360 + }, + { + "epoch": 2.3278659376805795, + "grad_norm": 0.16159822046756744, + "learning_rate": 2.9219165895290736e-05, + "loss": 0.2597, + "step": 52370 + }, + { + "epoch": 2.3283104413921856, + "grad_norm": 0.13372690975666046, + "learning_rate": 2.9200368962244083e-05, + "loss": 0.2589, + "step": 52380 + }, + { + "epoch": 2.3287549451037917, + "grad_norm": 0.12866370379924774, + "learning_rate": 2.9181575583285536e-05, + "loss": 0.2623, + "step": 52390 + }, + { + "epoch": 2.3291994488153978, + "grad_norm": 0.13349537551403046, + "learning_rate": 2.916278576162632e-05, + "loss": 0.2612, + "step": 52400 + }, + { + "epoch": 2.329643952527004, + "grad_norm": 0.14282920956611633, + "learning_rate": 2.9143999500477116e-05, + "loss": 0.2616, + "step": 52410 + }, + { + "epoch": 2.3300884562386095, + "grad_norm": 0.14095689356327057, + "learning_rate": 2.9125216803048004e-05, + "loss": 0.2618, + "step": 52420 + }, + { + "epoch": 2.3305329599502156, + "grad_norm": 0.14110800623893738, + "learning_rate": 2.9106437672548403e-05, + "loss": 0.2613, + "step": 52430 + }, + { + "epoch": 2.3309774636618217, + "grad_norm": 0.13108791410923004, + "learning_rate": 2.9087662112187154e-05, + "loss": 0.2627, + "step": 52440 + }, + { + "epoch": 2.3314219673734273, + "grad_norm": 0.15142729878425598, + "learning_rate": 2.9068890125172498e-05, + "loss": 0.262, + "step": 52450 + }, + { + "epoch": 2.3318664710850334, + "grad_norm": 0.13564074039459229, + "learning_rate": 2.9050121714712058e-05, + "loss": 0.2627, + "step": 52460 + }, + { + "epoch": 2.3323109747966395, + "grad_norm": 0.1644042432308197, + "learning_rate": 2.9031356884012835e-05, + "loss": 0.2631, + "step": 52470 + }, + { + "epoch": 2.3327554785082456, + "grad_norm": 0.1617337018251419, + "learning_rate": 2.9012595636281247e-05, + "loss": 0.2621, + "step": 52480 + }, + { + "epoch": 2.3331999822198517, + "grad_norm": 0.13857440650463104, + "learning_rate": 2.8993837974723016e-05, + "loss": 0.2611, + "step": 52490 + }, + { + "epoch": 2.3336444859314573, + "grad_norm": 0.13798485696315765, + "learning_rate": 2.8975083902543394e-05, + "loss": 0.2602, + "step": 52500 + }, + { + "epoch": 2.3340889896430634, + "grad_norm": 0.1431143581867218, + "learning_rate": 2.895633342294688e-05, + "loss": 0.2629, + "step": 52510 + }, + { + "epoch": 2.3345334933546695, + "grad_norm": 0.138307124376297, + "learning_rate": 2.893758653913744e-05, + "loss": 0.2609, + "step": 52520 + }, + { + "epoch": 2.3349779970662756, + "grad_norm": 0.15134701132774353, + "learning_rate": 2.8918843254318423e-05, + "loss": 0.2621, + "step": 52530 + }, + { + "epoch": 2.3354225007778817, + "grad_norm": 0.14165280759334564, + "learning_rate": 2.8900103571692483e-05, + "loss": 0.2636, + "step": 52540 + }, + { + "epoch": 2.3358670044894874, + "grad_norm": 0.15943430364131927, + "learning_rate": 2.8881367494461796e-05, + "loss": 0.263, + "step": 52550 + }, + { + "epoch": 2.3363115082010935, + "grad_norm": 0.16175128519535065, + "learning_rate": 2.886263502582779e-05, + "loss": 0.2622, + "step": 52560 + }, + { + "epoch": 2.3367560119126995, + "grad_norm": 0.14735791087150574, + "learning_rate": 2.8843906168991353e-05, + "loss": 0.262, + "step": 52570 + }, + { + "epoch": 2.3372005156243056, + "grad_norm": 0.15490904450416565, + "learning_rate": 2.8825180927152728e-05, + "loss": 0.2601, + "step": 52580 + }, + { + "epoch": 2.3376450193359113, + "grad_norm": 0.1770371049642563, + "learning_rate": 2.8806459303511547e-05, + "loss": 0.2607, + "step": 52590 + }, + { + "epoch": 2.3380895230475174, + "grad_norm": 0.138872891664505, + "learning_rate": 2.8787741301266852e-05, + "loss": 0.2597, + "step": 52600 + }, + { + "epoch": 2.3385340267591235, + "grad_norm": 0.144393190741539, + "learning_rate": 2.876902692361697e-05, + "loss": 0.2626, + "step": 52610 + }, + { + "epoch": 2.3389785304707296, + "grad_norm": 0.173512801527977, + "learning_rate": 2.875031617375976e-05, + "loss": 0.2649, + "step": 52620 + }, + { + "epoch": 2.339423034182335, + "grad_norm": 0.15525032579898834, + "learning_rate": 2.8731609054892318e-05, + "loss": 0.2614, + "step": 52630 + }, + { + "epoch": 2.3398675378939413, + "grad_norm": 0.14187444746494293, + "learning_rate": 2.871290557021119e-05, + "loss": 0.2612, + "step": 52640 + }, + { + "epoch": 2.3403120416055474, + "grad_norm": 0.15615339577198029, + "learning_rate": 2.8694205722912298e-05, + "loss": 0.2619, + "step": 52650 + }, + { + "epoch": 2.3407565453171535, + "grad_norm": 0.15385648608207703, + "learning_rate": 2.8675509516190936e-05, + "loss": 0.2592, + "step": 52660 + }, + { + "epoch": 2.3412010490287596, + "grad_norm": 0.18593236804008484, + "learning_rate": 2.8656816953241772e-05, + "loss": 0.2641, + "step": 52670 + }, + { + "epoch": 2.3416455527403652, + "grad_norm": 0.1431364119052887, + "learning_rate": 2.863812803725887e-05, + "loss": 0.2601, + "step": 52680 + }, + { + "epoch": 2.3420900564519713, + "grad_norm": 0.15411870181560516, + "learning_rate": 2.8619442771435623e-05, + "loss": 0.2628, + "step": 52690 + }, + { + "epoch": 2.3425345601635774, + "grad_norm": 0.13449916243553162, + "learning_rate": 2.860076115896484e-05, + "loss": 0.2613, + "step": 52700 + }, + { + "epoch": 2.3429790638751835, + "grad_norm": 0.15976347029209137, + "learning_rate": 2.8582083203038712e-05, + "loss": 0.263, + "step": 52710 + }, + { + "epoch": 2.343423567586789, + "grad_norm": 0.13158181309700012, + "learning_rate": 2.8563408906848778e-05, + "loss": 0.2595, + "step": 52720 + }, + { + "epoch": 2.3438680712983953, + "grad_norm": 0.16233552992343903, + "learning_rate": 2.8544738273585993e-05, + "loss": 0.262, + "step": 52730 + }, + { + "epoch": 2.3443125750100013, + "grad_norm": 0.1545286923646927, + "learning_rate": 2.8526071306440595e-05, + "loss": 0.2617, + "step": 52740 + }, + { + "epoch": 2.3447570787216074, + "grad_norm": 0.13554948568344116, + "learning_rate": 2.850740800860233e-05, + "loss": 0.2597, + "step": 52750 + }, + { + "epoch": 2.345201582433213, + "grad_norm": 0.1896379142999649, + "learning_rate": 2.8488748383260178e-05, + "loss": 0.2615, + "step": 52760 + }, + { + "epoch": 2.345646086144819, + "grad_norm": 0.14207054674625397, + "learning_rate": 2.8470092433602635e-05, + "loss": 0.262, + "step": 52770 + }, + { + "epoch": 2.3460905898564253, + "grad_norm": 0.14935468137264252, + "learning_rate": 2.845144016281745e-05, + "loss": 0.2623, + "step": 52780 + }, + { + "epoch": 2.3465350935680314, + "grad_norm": 0.14647699892520905, + "learning_rate": 2.8432791574091744e-05, + "loss": 0.2602, + "step": 52790 + }, + { + "epoch": 2.3469795972796375, + "grad_norm": 0.1735745668411255, + "learning_rate": 2.8414146670612134e-05, + "loss": 0.2642, + "step": 52800 + }, + { + "epoch": 2.347424100991243, + "grad_norm": 0.1574360728263855, + "learning_rate": 2.8395505455564446e-05, + "loss": 0.2598, + "step": 52810 + }, + { + "epoch": 2.347868604702849, + "grad_norm": 0.14233803749084473, + "learning_rate": 2.837686793213403e-05, + "loss": 0.2624, + "step": 52820 + }, + { + "epoch": 2.3483131084144553, + "grad_norm": 0.16577504575252533, + "learning_rate": 2.8358234103505477e-05, + "loss": 0.2602, + "step": 52830 + }, + { + "epoch": 2.3487576121260614, + "grad_norm": 0.15629734098911285, + "learning_rate": 2.8339603972862806e-05, + "loss": 0.2626, + "step": 52840 + }, + { + "epoch": 2.3492021158376675, + "grad_norm": 0.12494497746229172, + "learning_rate": 2.83209775433894e-05, + "loss": 0.2628, + "step": 52850 + }, + { + "epoch": 2.349646619549273, + "grad_norm": 0.15854011476039886, + "learning_rate": 2.8302354818268013e-05, + "loss": 0.2605, + "step": 52860 + }, + { + "epoch": 2.350091123260879, + "grad_norm": 0.165544793009758, + "learning_rate": 2.8283735800680754e-05, + "loss": 0.2611, + "step": 52870 + }, + { + "epoch": 2.3505356269724853, + "grad_norm": 0.1459653377532959, + "learning_rate": 2.826512049380913e-05, + "loss": 0.2631, + "step": 52880 + }, + { + "epoch": 2.3509801306840914, + "grad_norm": 0.1681002676486969, + "learning_rate": 2.8246508900833935e-05, + "loss": 0.2598, + "step": 52890 + }, + { + "epoch": 2.351424634395697, + "grad_norm": 0.14455753564834595, + "learning_rate": 2.822790102493541e-05, + "loss": 0.2628, + "step": 52900 + }, + { + "epoch": 2.351869138107303, + "grad_norm": 0.12886637449264526, + "learning_rate": 2.820929686929314e-05, + "loss": 0.2612, + "step": 52910 + }, + { + "epoch": 2.3523136418189092, + "grad_norm": 0.14297150075435638, + "learning_rate": 2.819069643708605e-05, + "loss": 0.2612, + "step": 52920 + }, + { + "epoch": 2.3527581455305153, + "grad_norm": 0.11760221421718597, + "learning_rate": 2.8172099731492484e-05, + "loss": 0.2601, + "step": 52930 + }, + { + "epoch": 2.353202649242121, + "grad_norm": 0.13963644206523895, + "learning_rate": 2.815350675569003e-05, + "loss": 0.2618, + "step": 52940 + }, + { + "epoch": 2.353647152953727, + "grad_norm": 0.17834089696407318, + "learning_rate": 2.8134917512855807e-05, + "loss": 0.2609, + "step": 52950 + }, + { + "epoch": 2.354091656665333, + "grad_norm": 0.152337908744812, + "learning_rate": 2.8116332006166134e-05, + "loss": 0.2586, + "step": 52960 + }, + { + "epoch": 2.3545361603769392, + "grad_norm": 0.1459495723247528, + "learning_rate": 2.809775023879685e-05, + "loss": 0.2589, + "step": 52970 + }, + { + "epoch": 2.3549806640885453, + "grad_norm": 0.17577560245990753, + "learning_rate": 2.807917221392299e-05, + "loss": 0.2611, + "step": 52980 + }, + { + "epoch": 2.355425167800151, + "grad_norm": 0.17455895245075226, + "learning_rate": 2.8060597934719067e-05, + "loss": 0.2599, + "step": 52990 + }, + { + "epoch": 2.355869671511757, + "grad_norm": 0.17260366678237915, + "learning_rate": 2.8042027404358927e-05, + "loss": 0.2623, + "step": 53000 + }, + { + "epoch": 2.356314175223363, + "grad_norm": 0.14223727583885193, + "learning_rate": 2.802346062601572e-05, + "loss": 0.2643, + "step": 53010 + }, + { + "epoch": 2.3567586789349693, + "grad_norm": 0.15135452151298523, + "learning_rate": 2.800489760286207e-05, + "loss": 0.2623, + "step": 53020 + }, + { + "epoch": 2.357203182646575, + "grad_norm": 0.1696147918701172, + "learning_rate": 2.7986338338069834e-05, + "loss": 0.2611, + "step": 53030 + }, + { + "epoch": 2.357647686358181, + "grad_norm": 0.160807266831398, + "learning_rate": 2.7967782834810297e-05, + "loss": 0.2608, + "step": 53040 + }, + { + "epoch": 2.358092190069787, + "grad_norm": 0.1479998379945755, + "learning_rate": 2.794923109625409e-05, + "loss": 0.2625, + "step": 53050 + }, + { + "epoch": 2.358536693781393, + "grad_norm": 0.14943398535251617, + "learning_rate": 2.7930683125571205e-05, + "loss": 0.261, + "step": 53060 + }, + { + "epoch": 2.358981197492999, + "grad_norm": 0.13191668689250946, + "learning_rate": 2.7912138925930997e-05, + "loss": 0.2586, + "step": 53070 + }, + { + "epoch": 2.359425701204605, + "grad_norm": 0.1310148686170578, + "learning_rate": 2.7893598500502117e-05, + "loss": 0.2613, + "step": 53080 + }, + { + "epoch": 2.359870204916211, + "grad_norm": 0.13373935222625732, + "learning_rate": 2.7875061852452644e-05, + "loss": 0.2591, + "step": 53090 + }, + { + "epoch": 2.360314708627817, + "grad_norm": 0.15299038589000702, + "learning_rate": 2.7856528984949982e-05, + "loss": 0.2642, + "step": 53100 + }, + { + "epoch": 2.360759212339423, + "grad_norm": 0.14731433987617493, + "learning_rate": 2.7837999901160888e-05, + "loss": 0.2594, + "step": 53110 + }, + { + "epoch": 2.361203716051029, + "grad_norm": 0.16289281845092773, + "learning_rate": 2.7819474604251484e-05, + "loss": 0.2593, + "step": 53120 + }, + { + "epoch": 2.361648219762635, + "grad_norm": 0.14611943066120148, + "learning_rate": 2.780095309738725e-05, + "loss": 0.2619, + "step": 53130 + }, + { + "epoch": 2.362092723474241, + "grad_norm": 0.14833655953407288, + "learning_rate": 2.778243538373294e-05, + "loss": 0.2637, + "step": 53140 + }, + { + "epoch": 2.362537227185847, + "grad_norm": 0.15406444668769836, + "learning_rate": 2.7763921466452826e-05, + "loss": 0.2608, + "step": 53150 + }, + { + "epoch": 2.3629817308974532, + "grad_norm": 0.15876363217830658, + "learning_rate": 2.7745411348710336e-05, + "loss": 0.2603, + "step": 53160 + }, + { + "epoch": 2.363426234609059, + "grad_norm": 0.14063747227191925, + "learning_rate": 2.7726905033668395e-05, + "loss": 0.2615, + "step": 53170 + }, + { + "epoch": 2.363870738320665, + "grad_norm": 0.13892528414726257, + "learning_rate": 2.7708402524489214e-05, + "loss": 0.2601, + "step": 53180 + }, + { + "epoch": 2.364315242032271, + "grad_norm": 0.15868957340717316, + "learning_rate": 2.7689903824334364e-05, + "loss": 0.2611, + "step": 53190 + }, + { + "epoch": 2.364759745743877, + "grad_norm": 0.14703992009162903, + "learning_rate": 2.7671408936364785e-05, + "loss": 0.26, + "step": 53200 + }, + { + "epoch": 2.365204249455483, + "grad_norm": 0.16480185091495514, + "learning_rate": 2.765291786374069e-05, + "loss": 0.2617, + "step": 53210 + }, + { + "epoch": 2.365648753167089, + "grad_norm": 0.13659922778606415, + "learning_rate": 2.7634430609621775e-05, + "loss": 0.2606, + "step": 53220 + }, + { + "epoch": 2.366093256878695, + "grad_norm": 0.134103462100029, + "learning_rate": 2.7615947177166956e-05, + "loss": 0.2575, + "step": 53230 + }, + { + "epoch": 2.366537760590301, + "grad_norm": 0.1384684145450592, + "learning_rate": 2.7597467569534553e-05, + "loss": 0.2597, + "step": 53240 + }, + { + "epoch": 2.3669822643019067, + "grad_norm": 0.15902848541736603, + "learning_rate": 2.757899178988226e-05, + "loss": 0.2583, + "step": 53250 + }, + { + "epoch": 2.367426768013513, + "grad_norm": 0.14401975274085999, + "learning_rate": 2.7560519841367005e-05, + "loss": 0.262, + "step": 53260 + }, + { + "epoch": 2.367871271725119, + "grad_norm": 0.17747747898101807, + "learning_rate": 2.7542051727145237e-05, + "loss": 0.2625, + "step": 53270 + }, + { + "epoch": 2.368315775436725, + "grad_norm": 0.1854570060968399, + "learning_rate": 2.7523587450372578e-05, + "loss": 0.2606, + "step": 53280 + }, + { + "epoch": 2.368760279148331, + "grad_norm": 0.14881950616836548, + "learning_rate": 2.750512701420409e-05, + "loss": 0.2601, + "step": 53290 + }, + { + "epoch": 2.3692047828599367, + "grad_norm": 0.1849198192358017, + "learning_rate": 2.7486670421794158e-05, + "loss": 0.2617, + "step": 53300 + }, + { + "epoch": 2.369649286571543, + "grad_norm": 0.16561606526374817, + "learning_rate": 2.7468217676296515e-05, + "loss": 0.2627, + "step": 53310 + }, + { + "epoch": 2.370093790283149, + "grad_norm": 0.1322454810142517, + "learning_rate": 2.7449768780864226e-05, + "loss": 0.2603, + "step": 53320 + }, + { + "epoch": 2.370538293994755, + "grad_norm": 0.17243103682994843, + "learning_rate": 2.7431323738649724e-05, + "loss": 0.2628, + "step": 53330 + }, + { + "epoch": 2.3709827977063607, + "grad_norm": 0.15086297690868378, + "learning_rate": 2.7412882552804713e-05, + "loss": 0.2602, + "step": 53340 + }, + { + "epoch": 2.3714273014179668, + "grad_norm": 0.16895540058612823, + "learning_rate": 2.739444522648032e-05, + "loss": 0.2615, + "step": 53350 + }, + { + "epoch": 2.371871805129573, + "grad_norm": 0.16382050514221191, + "learning_rate": 2.737601176282697e-05, + "loss": 0.2598, + "step": 53360 + }, + { + "epoch": 2.372316308841179, + "grad_norm": 0.15518158674240112, + "learning_rate": 2.735758216499445e-05, + "loss": 0.2621, + "step": 53370 + }, + { + "epoch": 2.3727608125527846, + "grad_norm": 0.1752317100763321, + "learning_rate": 2.7339156436131864e-05, + "loss": 0.2598, + "step": 53380 + }, + { + "epoch": 2.3732053162643907, + "grad_norm": 0.13999414443969727, + "learning_rate": 2.7320734579387663e-05, + "loss": 0.2625, + "step": 53390 + }, + { + "epoch": 2.3736498199759968, + "grad_norm": 0.14843708276748657, + "learning_rate": 2.730231659790966e-05, + "loss": 0.2653, + "step": 53400 + }, + { + "epoch": 2.374094323687603, + "grad_norm": 0.14657941460609436, + "learning_rate": 2.728390249484494e-05, + "loss": 0.261, + "step": 53410 + }, + { + "epoch": 2.374538827399209, + "grad_norm": 0.14281225204467773, + "learning_rate": 2.7265492273340032e-05, + "loss": 0.2626, + "step": 53420 + }, + { + "epoch": 2.3749833311108146, + "grad_norm": 0.13858145475387573, + "learning_rate": 2.7247085936540678e-05, + "loss": 0.2638, + "step": 53430 + }, + { + "epoch": 2.3754278348224207, + "grad_norm": 0.14043815433979034, + "learning_rate": 2.722868348759205e-05, + "loss": 0.2611, + "step": 53440 + }, + { + "epoch": 2.375872338534027, + "grad_norm": 0.1371614933013916, + "learning_rate": 2.7210284929638635e-05, + "loss": 0.2606, + "step": 53450 + }, + { + "epoch": 2.376316842245633, + "grad_norm": 0.14478729665279388, + "learning_rate": 2.7191890265824183e-05, + "loss": 0.2641, + "step": 53460 + }, + { + "epoch": 2.376761345957239, + "grad_norm": 0.17854249477386475, + "learning_rate": 2.7173499499291926e-05, + "loss": 0.2623, + "step": 53470 + }, + { + "epoch": 2.3772058496688446, + "grad_norm": 0.14898647367954254, + "learning_rate": 2.7155112633184277e-05, + "loss": 0.2624, + "step": 53480 + }, + { + "epoch": 2.3776503533804507, + "grad_norm": 0.13327930867671967, + "learning_rate": 2.713672967064307e-05, + "loss": 0.2606, + "step": 53490 + }, + { + "epoch": 2.378094857092057, + "grad_norm": 0.15858344733715057, + "learning_rate": 2.711835061480945e-05, + "loss": 0.2606, + "step": 53500 + }, + { + "epoch": 2.3785393608036625, + "grad_norm": 0.1391041874885559, + "learning_rate": 2.7099975468823896e-05, + "loss": 0.2607, + "step": 53510 + }, + { + "epoch": 2.3789838645152686, + "grad_norm": 0.15624043345451355, + "learning_rate": 2.708160423582622e-05, + "loss": 0.2598, + "step": 53520 + }, + { + "epoch": 2.3794283682268746, + "grad_norm": 0.14600731432437897, + "learning_rate": 2.706323691895557e-05, + "loss": 0.264, + "step": 53530 + }, + { + "epoch": 2.3798728719384807, + "grad_norm": 0.14376983046531677, + "learning_rate": 2.70448735213504e-05, + "loss": 0.2596, + "step": 53540 + }, + { + "epoch": 2.380317375650087, + "grad_norm": 0.1693611890077591, + "learning_rate": 2.702651404614852e-05, + "loss": 0.2625, + "step": 53550 + }, + { + "epoch": 2.3807618793616925, + "grad_norm": 0.12588787078857422, + "learning_rate": 2.7008158496487056e-05, + "loss": 0.2612, + "step": 53560 + }, + { + "epoch": 2.3812063830732986, + "grad_norm": 0.15974342823028564, + "learning_rate": 2.6989806875502487e-05, + "loss": 0.2636, + "step": 53570 + }, + { + "epoch": 2.3816508867849047, + "grad_norm": 0.13234871625900269, + "learning_rate": 2.6971459186330584e-05, + "loss": 0.2586, + "step": 53580 + }, + { + "epoch": 2.3820953904965108, + "grad_norm": 0.1247158944606781, + "learning_rate": 2.695311543210648e-05, + "loss": 0.2598, + "step": 53590 + }, + { + "epoch": 2.382539894208117, + "grad_norm": 0.16441841423511505, + "learning_rate": 2.693477561596463e-05, + "loss": 0.2619, + "step": 53600 + }, + { + "epoch": 2.3829843979197225, + "grad_norm": 0.1819610297679901, + "learning_rate": 2.6916439741038756e-05, + "loss": 0.2611, + "step": 53610 + }, + { + "epoch": 2.3834289016313286, + "grad_norm": 0.14969541132450104, + "learning_rate": 2.689810781046203e-05, + "loss": 0.2608, + "step": 53620 + }, + { + "epoch": 2.3838734053429347, + "grad_norm": 0.16250230371952057, + "learning_rate": 2.6879779827366823e-05, + "loss": 0.2598, + "step": 53630 + }, + { + "epoch": 2.3843179090545408, + "grad_norm": 0.14187148213386536, + "learning_rate": 2.6861455794884904e-05, + "loss": 0.2603, + "step": 53640 + }, + { + "epoch": 2.3847624127661464, + "grad_norm": 0.15238958597183228, + "learning_rate": 2.6843135716147373e-05, + "loss": 0.259, + "step": 53650 + }, + { + "epoch": 2.3852069164777525, + "grad_norm": 0.13067902624607086, + "learning_rate": 2.6824819594284556e-05, + "loss": 0.262, + "step": 53660 + }, + { + "epoch": 2.3856514201893586, + "grad_norm": 0.15175187587738037, + "learning_rate": 2.6806507432426275e-05, + "loss": 0.2582, + "step": 53670 + }, + { + "epoch": 2.3860959239009647, + "grad_norm": 0.1392628699541092, + "learning_rate": 2.6788199233701512e-05, + "loss": 0.2627, + "step": 53680 + }, + { + "epoch": 2.3865404276125703, + "grad_norm": 0.16674792766571045, + "learning_rate": 2.6769895001238652e-05, + "loss": 0.2605, + "step": 53690 + }, + { + "epoch": 2.3869849313241764, + "grad_norm": 0.17154812812805176, + "learning_rate": 2.67515947381654e-05, + "loss": 0.2618, + "step": 53700 + }, + { + "epoch": 2.3874294350357825, + "grad_norm": 0.13089802861213684, + "learning_rate": 2.6733298447608745e-05, + "loss": 0.2617, + "step": 53710 + }, + { + "epoch": 2.3878739387473886, + "grad_norm": 0.13469335436820984, + "learning_rate": 2.6715006132695074e-05, + "loss": 0.2581, + "step": 53720 + }, + { + "epoch": 2.3883184424589947, + "grad_norm": 0.14002440869808197, + "learning_rate": 2.6696717796549985e-05, + "loss": 0.2643, + "step": 53730 + }, + { + "epoch": 2.3887629461706004, + "grad_norm": 0.13248635828495026, + "learning_rate": 2.667843344229848e-05, + "loss": 0.2602, + "step": 53740 + }, + { + "epoch": 2.3892074498822065, + "grad_norm": 0.128944531083107, + "learning_rate": 2.6660153073064846e-05, + "loss": 0.2602, + "step": 53750 + }, + { + "epoch": 2.3896519535938126, + "grad_norm": 0.16473625600337982, + "learning_rate": 2.664187669197271e-05, + "loss": 0.2593, + "step": 53760 + }, + { + "epoch": 2.3900964573054186, + "grad_norm": 0.13854438066482544, + "learning_rate": 2.6623604302145005e-05, + "loss": 0.2633, + "step": 53770 + }, + { + "epoch": 2.3905409610170247, + "grad_norm": 0.13399003446102142, + "learning_rate": 2.6605335906703975e-05, + "loss": 0.2593, + "step": 53780 + }, + { + "epoch": 2.3909854647286304, + "grad_norm": 0.15315397083759308, + "learning_rate": 2.65870715087712e-05, + "loss": 0.2593, + "step": 53790 + }, + { + "epoch": 2.3914299684402365, + "grad_norm": 0.1350475251674652, + "learning_rate": 2.6568811111467573e-05, + "loss": 0.2585, + "step": 53800 + }, + { + "epoch": 2.3918744721518426, + "grad_norm": 0.13274286687374115, + "learning_rate": 2.6550554717913258e-05, + "loss": 0.2619, + "step": 53810 + }, + { + "epoch": 2.392318975863448, + "grad_norm": 0.14519113302230835, + "learning_rate": 2.6532302331227805e-05, + "loss": 0.259, + "step": 53820 + }, + { + "epoch": 2.3927634795750543, + "grad_norm": 0.13734722137451172, + "learning_rate": 2.651405395453004e-05, + "loss": 0.2587, + "step": 53830 + }, + { + "epoch": 2.3932079832866604, + "grad_norm": 0.1553809940814972, + "learning_rate": 2.6495809590938115e-05, + "loss": 0.2579, + "step": 53840 + }, + { + "epoch": 2.3936524869982665, + "grad_norm": 0.1640738993883133, + "learning_rate": 2.64775692435695e-05, + "loss": 0.2583, + "step": 53850 + }, + { + "epoch": 2.3940969907098726, + "grad_norm": 0.1616961658000946, + "learning_rate": 2.6459332915540928e-05, + "loss": 0.2597, + "step": 53860 + }, + { + "epoch": 2.3945414944214782, + "grad_norm": 0.16979768872261047, + "learning_rate": 2.644110060996856e-05, + "loss": 0.2591, + "step": 53870 + }, + { + "epoch": 2.3949859981330843, + "grad_norm": 0.13967746496200562, + "learning_rate": 2.642287232996774e-05, + "loss": 0.2585, + "step": 53880 + }, + { + "epoch": 2.3954305018446904, + "grad_norm": 0.1278989613056183, + "learning_rate": 2.6404648078653205e-05, + "loss": 0.2622, + "step": 53890 + }, + { + "epoch": 2.3958750055562965, + "grad_norm": 0.1271744817495346, + "learning_rate": 2.6386427859139002e-05, + "loss": 0.2601, + "step": 53900 + }, + { + "epoch": 2.3963195092679026, + "grad_norm": 0.1543719321489334, + "learning_rate": 2.63682116745384e-05, + "loss": 0.2588, + "step": 53910 + }, + { + "epoch": 2.3967640129795083, + "grad_norm": 0.14274196326732635, + "learning_rate": 2.6349999527964138e-05, + "loss": 0.2615, + "step": 53920 + }, + { + "epoch": 2.3972085166911143, + "grad_norm": 0.1420747935771942, + "learning_rate": 2.63317914225281e-05, + "loss": 0.2599, + "step": 53930 + }, + { + "epoch": 2.3976530204027204, + "grad_norm": 0.15689563751220703, + "learning_rate": 2.6313587361341585e-05, + "loss": 0.2586, + "step": 53940 + }, + { + "epoch": 2.3980975241143265, + "grad_norm": 0.12057896703481674, + "learning_rate": 2.6295387347515165e-05, + "loss": 0.2582, + "step": 53950 + }, + { + "epoch": 2.398542027825932, + "grad_norm": 0.16351667046546936, + "learning_rate": 2.6277191384158727e-05, + "loss": 0.261, + "step": 53960 + }, + { + "epoch": 2.3989865315375383, + "grad_norm": 0.15107065439224243, + "learning_rate": 2.6258999474381457e-05, + "loss": 0.2598, + "step": 53970 + }, + { + "epoch": 2.3994310352491444, + "grad_norm": 0.16239289939403534, + "learning_rate": 2.624081162129186e-05, + "loss": 0.2644, + "step": 53980 + }, + { + "epoch": 2.3998755389607505, + "grad_norm": 0.17031536996364594, + "learning_rate": 2.6222627827997765e-05, + "loss": 0.2596, + "step": 53990 + }, + { + "epoch": 2.400320042672356, + "grad_norm": 0.13222035765647888, + "learning_rate": 2.6204448097606236e-05, + "loss": 0.2611, + "step": 54000 + }, + { + "epoch": 2.400764546383962, + "grad_norm": 0.1420200914144516, + "learning_rate": 2.6186272433223726e-05, + "loss": 0.2612, + "step": 54010 + }, + { + "epoch": 2.4012090500955683, + "grad_norm": 0.1289943903684616, + "learning_rate": 2.6168100837955943e-05, + "loss": 0.2618, + "step": 54020 + }, + { + "epoch": 2.4016535538071744, + "grad_norm": 0.1392030268907547, + "learning_rate": 2.6149933314907926e-05, + "loss": 0.2581, + "step": 54030 + }, + { + "epoch": 2.4020980575187805, + "grad_norm": 0.1413772851228714, + "learning_rate": 2.6131769867184e-05, + "loss": 0.2604, + "step": 54040 + }, + { + "epoch": 2.402542561230386, + "grad_norm": 0.15452629327774048, + "learning_rate": 2.611361049788783e-05, + "loss": 0.2585, + "step": 54050 + }, + { + "epoch": 2.402987064941992, + "grad_norm": 0.15941181778907776, + "learning_rate": 2.6095455210122292e-05, + "loss": 0.2595, + "step": 54060 + }, + { + "epoch": 2.4034315686535983, + "grad_norm": 0.15293020009994507, + "learning_rate": 2.6077304006989712e-05, + "loss": 0.2617, + "step": 54070 + }, + { + "epoch": 2.4038760723652044, + "grad_norm": 0.1451609879732132, + "learning_rate": 2.6059156891591562e-05, + "loss": 0.2605, + "step": 54080 + }, + { + "epoch": 2.4043205760768105, + "grad_norm": 0.1280544549226761, + "learning_rate": 2.6041013867028718e-05, + "loss": 0.2611, + "step": 54090 + }, + { + "epoch": 2.404765079788416, + "grad_norm": 0.14348839223384857, + "learning_rate": 2.6022874936401347e-05, + "loss": 0.2583, + "step": 54100 + }, + { + "epoch": 2.4052095835000222, + "grad_norm": 0.1532093584537506, + "learning_rate": 2.6004740102808832e-05, + "loss": 0.2619, + "step": 54110 + }, + { + "epoch": 2.4056540872116283, + "grad_norm": 0.14311514794826508, + "learning_rate": 2.598660936935e-05, + "loss": 0.2587, + "step": 54120 + }, + { + "epoch": 2.406098590923234, + "grad_norm": 0.15311399102210999, + "learning_rate": 2.5968482739122845e-05, + "loss": 0.2618, + "step": 54130 + }, + { + "epoch": 2.40654309463484, + "grad_norm": 0.14543570578098297, + "learning_rate": 2.595036021522472e-05, + "loss": 0.2638, + "step": 54140 + }, + { + "epoch": 2.406987598346446, + "grad_norm": 0.16178759932518005, + "learning_rate": 2.5932241800752278e-05, + "loss": 0.2592, + "step": 54150 + }, + { + "epoch": 2.4074321020580522, + "grad_norm": 0.12540999054908752, + "learning_rate": 2.5914127498801453e-05, + "loss": 0.2591, + "step": 54160 + }, + { + "epoch": 2.4078766057696583, + "grad_norm": 0.14025446772575378, + "learning_rate": 2.5896017312467497e-05, + "loss": 0.2601, + "step": 54170 + }, + { + "epoch": 2.408321109481264, + "grad_norm": 0.16070954501628876, + "learning_rate": 2.587791124484493e-05, + "loss": 0.2591, + "step": 54180 + }, + { + "epoch": 2.40876561319287, + "grad_norm": 0.12668448686599731, + "learning_rate": 2.5859809299027615e-05, + "loss": 0.2624, + "step": 54190 + }, + { + "epoch": 2.409210116904476, + "grad_norm": 0.1347280740737915, + "learning_rate": 2.5841711478108632e-05, + "loss": 0.2619, + "step": 54200 + }, + { + "epoch": 2.4096546206160823, + "grad_norm": 0.15258237719535828, + "learning_rate": 2.582361778518043e-05, + "loss": 0.2603, + "step": 54210 + }, + { + "epoch": 2.4100991243276884, + "grad_norm": 0.12849372625350952, + "learning_rate": 2.580552822333472e-05, + "loss": 0.2613, + "step": 54220 + }, + { + "epoch": 2.410543628039294, + "grad_norm": 0.15000483393669128, + "learning_rate": 2.578744279566252e-05, + "loss": 0.2584, + "step": 54230 + }, + { + "epoch": 2.4109881317509, + "grad_norm": 0.13787980377674103, + "learning_rate": 2.576936150525413e-05, + "loss": 0.2611, + "step": 54240 + }, + { + "epoch": 2.411432635462506, + "grad_norm": 0.18083824217319489, + "learning_rate": 2.5751284355199168e-05, + "loss": 0.2621, + "step": 54250 + }, + { + "epoch": 2.4118771391741123, + "grad_norm": 0.15661664307117462, + "learning_rate": 2.573321134858646e-05, + "loss": 0.2568, + "step": 54260 + }, + { + "epoch": 2.412321642885718, + "grad_norm": 0.15545865893363953, + "learning_rate": 2.5715142488504286e-05, + "loss": 0.2599, + "step": 54270 + }, + { + "epoch": 2.412766146597324, + "grad_norm": 0.14309771358966827, + "learning_rate": 2.5697077778040042e-05, + "loss": 0.2598, + "step": 54280 + }, + { + "epoch": 2.41321065030893, + "grad_norm": 0.1463412493467331, + "learning_rate": 2.5679017220280522e-05, + "loss": 0.2595, + "step": 54290 + }, + { + "epoch": 2.413655154020536, + "grad_norm": 0.14069585502147675, + "learning_rate": 2.5660960818311796e-05, + "loss": 0.2584, + "step": 54300 + }, + { + "epoch": 2.414099657732142, + "grad_norm": 0.13458310067653656, + "learning_rate": 2.564290857521915e-05, + "loss": 0.2601, + "step": 54310 + }, + { + "epoch": 2.414544161443748, + "grad_norm": 0.15186424553394318, + "learning_rate": 2.5624860494087298e-05, + "loss": 0.2603, + "step": 54320 + }, + { + "epoch": 2.414988665155354, + "grad_norm": 0.12768569588661194, + "learning_rate": 2.5606816578000115e-05, + "loss": 0.2581, + "step": 54330 + }, + { + "epoch": 2.41543316886696, + "grad_norm": 0.1345934122800827, + "learning_rate": 2.558877683004082e-05, + "loss": 0.2609, + "step": 54340 + }, + { + "epoch": 2.4158776725785662, + "grad_norm": 0.15806734561920166, + "learning_rate": 2.557074125329192e-05, + "loss": 0.2597, + "step": 54350 + }, + { + "epoch": 2.416322176290172, + "grad_norm": 0.18757537007331848, + "learning_rate": 2.5552709850835195e-05, + "loss": 0.2614, + "step": 54360 + }, + { + "epoch": 2.416766680001778, + "grad_norm": 0.148810014128685, + "learning_rate": 2.5534682625751738e-05, + "loss": 0.26, + "step": 54370 + }, + { + "epoch": 2.417211183713384, + "grad_norm": 0.1526622474193573, + "learning_rate": 2.551665958112186e-05, + "loss": 0.2613, + "step": 54380 + }, + { + "epoch": 2.41765568742499, + "grad_norm": 0.12894712388515472, + "learning_rate": 2.549864072002527e-05, + "loss": 0.2597, + "step": 54390 + }, + { + "epoch": 2.418100191136596, + "grad_norm": 0.1449953019618988, + "learning_rate": 2.5480626045540858e-05, + "loss": 0.2612, + "step": 54400 + }, + { + "epoch": 2.418544694848202, + "grad_norm": 0.1537085920572281, + "learning_rate": 2.546261556074684e-05, + "loss": 0.2585, + "step": 54410 + }, + { + "epoch": 2.418989198559808, + "grad_norm": 0.13789406418800354, + "learning_rate": 2.5444609268720726e-05, + "loss": 0.2623, + "step": 54420 + }, + { + "epoch": 2.419433702271414, + "grad_norm": 0.1779574453830719, + "learning_rate": 2.5426607172539297e-05, + "loss": 0.2612, + "step": 54430 + }, + { + "epoch": 2.4198782059830197, + "grad_norm": 0.14871668815612793, + "learning_rate": 2.5408609275278617e-05, + "loss": 0.2589, + "step": 54440 + }, + { + "epoch": 2.420322709694626, + "grad_norm": 0.1529441773891449, + "learning_rate": 2.5390615580014055e-05, + "loss": 0.2598, + "step": 54450 + }, + { + "epoch": 2.420767213406232, + "grad_norm": 0.14956776797771454, + "learning_rate": 2.5372626089820207e-05, + "loss": 0.2567, + "step": 54460 + }, + { + "epoch": 2.421211717117838, + "grad_norm": 0.16131696105003357, + "learning_rate": 2.5354640807770997e-05, + "loss": 0.2588, + "step": 54470 + }, + { + "epoch": 2.421656220829444, + "grad_norm": 0.12913993000984192, + "learning_rate": 2.5336659736939622e-05, + "loss": 0.2603, + "step": 54480 + }, + { + "epoch": 2.4221007245410497, + "grad_norm": 0.13401736319065094, + "learning_rate": 2.5318682880398554e-05, + "loss": 0.2649, + "step": 54490 + }, + { + "epoch": 2.422545228252656, + "grad_norm": 0.1702931970357895, + "learning_rate": 2.530071024121956e-05, + "loss": 0.2605, + "step": 54500 + }, + { + "epoch": 2.422989731964262, + "grad_norm": 0.17291398346424103, + "learning_rate": 2.5282741822473627e-05, + "loss": 0.2581, + "step": 54510 + }, + { + "epoch": 2.423434235675868, + "grad_norm": 0.14864034950733185, + "learning_rate": 2.526477762723114e-05, + "loss": 0.2591, + "step": 54520 + }, + { + "epoch": 2.423878739387474, + "grad_norm": 0.13796097040176392, + "learning_rate": 2.5246817658561618e-05, + "loss": 0.2599, + "step": 54530 + }, + { + "epoch": 2.4243232430990798, + "grad_norm": 0.14494003355503082, + "learning_rate": 2.5228861919533965e-05, + "loss": 0.2586, + "step": 54540 + }, + { + "epoch": 2.424767746810686, + "grad_norm": 0.15484429895877838, + "learning_rate": 2.5210910413216326e-05, + "loss": 0.259, + "step": 54550 + }, + { + "epoch": 2.425212250522292, + "grad_norm": 0.1303582340478897, + "learning_rate": 2.5192963142676086e-05, + "loss": 0.2615, + "step": 54560 + }, + { + "epoch": 2.425656754233898, + "grad_norm": 0.15381942689418793, + "learning_rate": 2.517502011098001e-05, + "loss": 0.2608, + "step": 54570 + }, + { + "epoch": 2.4261012579455037, + "grad_norm": 0.15187588334083557, + "learning_rate": 2.5157081321193987e-05, + "loss": 0.2605, + "step": 54580 + }, + { + "epoch": 2.42654576165711, + "grad_norm": 0.1340639293193817, + "learning_rate": 2.5139146776383356e-05, + "loss": 0.2606, + "step": 54590 + }, + { + "epoch": 2.426990265368716, + "grad_norm": 0.1423543244600296, + "learning_rate": 2.5121216479612575e-05, + "loss": 0.2602, + "step": 54600 + }, + { + "epoch": 2.427434769080322, + "grad_norm": 0.14877437055110931, + "learning_rate": 2.510329043394546e-05, + "loss": 0.2607, + "step": 54610 + }, + { + "epoch": 2.4278792727919276, + "grad_norm": 0.15675371885299683, + "learning_rate": 2.508536864244508e-05, + "loss": 0.2585, + "step": 54620 + }, + { + "epoch": 2.4283237765035337, + "grad_norm": 0.1309569925069809, + "learning_rate": 2.5067451108173778e-05, + "loss": 0.2614, + "step": 54630 + }, + { + "epoch": 2.42876828021514, + "grad_norm": 0.15652421116828918, + "learning_rate": 2.5049537834193204e-05, + "loss": 0.2616, + "step": 54640 + }, + { + "epoch": 2.429212783926746, + "grad_norm": 0.1400669366121292, + "learning_rate": 2.5031628823564194e-05, + "loss": 0.2592, + "step": 54650 + }, + { + "epoch": 2.429657287638352, + "grad_norm": 0.15667885541915894, + "learning_rate": 2.5013724079346933e-05, + "loss": 0.2612, + "step": 54660 + }, + { + "epoch": 2.4301017913499576, + "grad_norm": 0.18021562695503235, + "learning_rate": 2.4995823604600854e-05, + "loss": 0.2584, + "step": 54670 + }, + { + "epoch": 2.4305462950615637, + "grad_norm": 0.15344950556755066, + "learning_rate": 2.497792740238465e-05, + "loss": 0.2621, + "step": 54680 + }, + { + "epoch": 2.43099079877317, + "grad_norm": 0.16979476809501648, + "learning_rate": 2.49600354757563e-05, + "loss": 0.2586, + "step": 54690 + }, + { + "epoch": 2.431435302484776, + "grad_norm": 0.132881298661232, + "learning_rate": 2.494214782777306e-05, + "loss": 0.2592, + "step": 54700 + }, + { + "epoch": 2.4318798061963816, + "grad_norm": 0.17210660874843597, + "learning_rate": 2.4924264461491386e-05, + "loss": 0.2576, + "step": 54710 + }, + { + "epoch": 2.4323243099079876, + "grad_norm": 0.1464741975069046, + "learning_rate": 2.4906385379967133e-05, + "loss": 0.2624, + "step": 54720 + }, + { + "epoch": 2.4327688136195937, + "grad_norm": 0.13619737327098846, + "learning_rate": 2.4888510586255285e-05, + "loss": 0.2595, + "step": 54730 + }, + { + "epoch": 2.4332133173312, + "grad_norm": 0.13561652600765228, + "learning_rate": 2.487064008341018e-05, + "loss": 0.2612, + "step": 54740 + }, + { + "epoch": 2.4336578210428055, + "grad_norm": 0.17314544320106506, + "learning_rate": 2.4852773874485407e-05, + "loss": 0.2608, + "step": 54750 + }, + { + "epoch": 2.4341023247544116, + "grad_norm": 0.18177983164787292, + "learning_rate": 2.483491196253377e-05, + "loss": 0.2574, + "step": 54760 + }, + { + "epoch": 2.4345468284660177, + "grad_norm": 0.174994096159935, + "learning_rate": 2.4817054350607443e-05, + "loss": 0.259, + "step": 54770 + }, + { + "epoch": 2.4349913321776238, + "grad_norm": 0.13285696506500244, + "learning_rate": 2.4799201041757742e-05, + "loss": 0.2584, + "step": 54780 + }, + { + "epoch": 2.43543583588923, + "grad_norm": 0.15894226729869843, + "learning_rate": 2.4781352039035373e-05, + "loss": 0.258, + "step": 54790 + }, + { + "epoch": 2.4358803396008355, + "grad_norm": 0.15918946266174316, + "learning_rate": 2.4763507345490194e-05, + "loss": 0.2591, + "step": 54800 + }, + { + "epoch": 2.4363248433124416, + "grad_norm": 0.17057105898857117, + "learning_rate": 2.4745666964171386e-05, + "loss": 0.2612, + "step": 54810 + }, + { + "epoch": 2.4367693470240477, + "grad_norm": 0.1426721066236496, + "learning_rate": 2.4727830898127387e-05, + "loss": 0.2582, + "step": 54820 + }, + { + "epoch": 2.4372138507356538, + "grad_norm": 0.13340604305267334, + "learning_rate": 2.4709999150405895e-05, + "loss": 0.2616, + "step": 54830 + }, + { + "epoch": 2.43765835444726, + "grad_norm": 0.1530497521162033, + "learning_rate": 2.469217172405388e-05, + "loss": 0.2569, + "step": 54840 + }, + { + "epoch": 2.4381028581588655, + "grad_norm": 0.12767978012561798, + "learning_rate": 2.4674348622117527e-05, + "loss": 0.2581, + "step": 54850 + }, + { + "epoch": 2.4385473618704716, + "grad_norm": 0.1677330732345581, + "learning_rate": 2.465652984764234e-05, + "loss": 0.261, + "step": 54860 + }, + { + "epoch": 2.4389918655820777, + "grad_norm": 0.14314387738704681, + "learning_rate": 2.4638715403673056e-05, + "loss": 0.2599, + "step": 54870 + }, + { + "epoch": 2.439436369293684, + "grad_norm": 0.14934518933296204, + "learning_rate": 2.4620905293253676e-05, + "loss": 0.2611, + "step": 54880 + }, + { + "epoch": 2.4398808730052894, + "grad_norm": 0.15372374653816223, + "learning_rate": 2.4603099519427463e-05, + "loss": 0.2609, + "step": 54890 + }, + { + "epoch": 2.4403253767168955, + "grad_norm": 0.14327514171600342, + "learning_rate": 2.458529808523695e-05, + "loss": 0.2617, + "step": 54900 + }, + { + "epoch": 2.4407698804285016, + "grad_norm": 0.13692854344844818, + "learning_rate": 2.456750099372387e-05, + "loss": 0.2612, + "step": 54910 + }, + { + "epoch": 2.4412143841401077, + "grad_norm": 0.1483580619096756, + "learning_rate": 2.4549708247929327e-05, + "loss": 0.26, + "step": 54920 + }, + { + "epoch": 2.4416588878517134, + "grad_norm": 0.14125798642635345, + "learning_rate": 2.4531919850893554e-05, + "loss": 0.2582, + "step": 54930 + }, + { + "epoch": 2.4421033915633195, + "grad_norm": 0.15817642211914062, + "learning_rate": 2.4514135805656125e-05, + "loss": 0.261, + "step": 54940 + }, + { + "epoch": 2.4425478952749256, + "grad_norm": 0.13414426147937775, + "learning_rate": 2.449635611525587e-05, + "loss": 0.2565, + "step": 54950 + }, + { + "epoch": 2.4429923989865316, + "grad_norm": 0.1411334127187729, + "learning_rate": 2.447858078273079e-05, + "loss": 0.2602, + "step": 54960 + }, + { + "epoch": 2.4434369026981377, + "grad_norm": 0.14392822980880737, + "learning_rate": 2.4460809811118275e-05, + "loss": 0.2619, + "step": 54970 + }, + { + "epoch": 2.4438814064097434, + "grad_norm": 0.13189151883125305, + "learning_rate": 2.444304320345483e-05, + "loss": 0.2595, + "step": 54980 + }, + { + "epoch": 2.4443259101213495, + "grad_norm": 0.136313334107399, + "learning_rate": 2.4425280962776354e-05, + "loss": 0.2605, + "step": 54990 + }, + { + "epoch": 2.4447704138329556, + "grad_norm": 0.13849768042564392, + "learning_rate": 2.4407523092117875e-05, + "loss": 0.258, + "step": 55000 + }, + { + "epoch": 2.4452149175445617, + "grad_norm": 0.1368350237607956, + "learning_rate": 2.438976959451374e-05, + "loss": 0.2616, + "step": 55010 + }, + { + "epoch": 2.4456594212561673, + "grad_norm": 0.13180813193321228, + "learning_rate": 2.4372020472997565e-05, + "loss": 0.2589, + "step": 55020 + }, + { + "epoch": 2.4461039249677734, + "grad_norm": 0.1446579545736313, + "learning_rate": 2.4354275730602122e-05, + "loss": 0.2604, + "step": 55030 + }, + { + "epoch": 2.4465484286793795, + "grad_norm": 0.14449259638786316, + "learning_rate": 2.433653537035958e-05, + "loss": 0.2596, + "step": 55040 + }, + { + "epoch": 2.4469929323909856, + "grad_norm": 0.15293806791305542, + "learning_rate": 2.431879939530123e-05, + "loss": 0.2606, + "step": 55050 + }, + { + "epoch": 2.4474374361025912, + "grad_norm": 0.12960699200630188, + "learning_rate": 2.4301067808457684e-05, + "loss": 0.2594, + "step": 55060 + }, + { + "epoch": 2.4478819398141973, + "grad_norm": 0.13493449985980988, + "learning_rate": 2.428334061285878e-05, + "loss": 0.2592, + "step": 55070 + }, + { + "epoch": 2.4483264435258034, + "grad_norm": 0.16031460464000702, + "learning_rate": 2.426561781153361e-05, + "loss": 0.2617, + "step": 55080 + }, + { + "epoch": 2.4487709472374095, + "grad_norm": 0.13826289772987366, + "learning_rate": 2.424789940751052e-05, + "loss": 0.2584, + "step": 55090 + }, + { + "epoch": 2.4492154509490156, + "grad_norm": 0.1470039188861847, + "learning_rate": 2.423018540381712e-05, + "loss": 0.2603, + "step": 55100 + }, + { + "epoch": 2.4496599546606213, + "grad_norm": 0.1496119201183319, + "learning_rate": 2.421247580348021e-05, + "loss": 0.2567, + "step": 55110 + }, + { + "epoch": 2.4501044583722273, + "grad_norm": 0.15811802446842194, + "learning_rate": 2.4194770609525897e-05, + "loss": 0.2619, + "step": 55120 + }, + { + "epoch": 2.4505489620838334, + "grad_norm": 0.18987257778644562, + "learning_rate": 2.417706982497951e-05, + "loss": 0.2604, + "step": 55130 + }, + { + "epoch": 2.4509934657954395, + "grad_norm": 0.1829836368560791, + "learning_rate": 2.415937345286563e-05, + "loss": 0.2595, + "step": 55140 + }, + { + "epoch": 2.4514379695070456, + "grad_norm": 0.1606568843126297, + "learning_rate": 2.4141681496208087e-05, + "loss": 0.26, + "step": 55150 + }, + { + "epoch": 2.4518824732186513, + "grad_norm": 0.1505681574344635, + "learning_rate": 2.4123993958029946e-05, + "loss": 0.2586, + "step": 55160 + }, + { + "epoch": 2.4523269769302574, + "grad_norm": 0.15318059921264648, + "learning_rate": 2.4106310841353548e-05, + "loss": 0.2564, + "step": 55170 + }, + { + "epoch": 2.4527714806418635, + "grad_norm": 0.15492717921733856, + "learning_rate": 2.4088632149200398e-05, + "loss": 0.2598, + "step": 55180 + }, + { + "epoch": 2.453215984353469, + "grad_norm": 0.1372058093547821, + "learning_rate": 2.4070957884591367e-05, + "loss": 0.2605, + "step": 55190 + }, + { + "epoch": 2.453660488065075, + "grad_norm": 0.18979378044605255, + "learning_rate": 2.4053288050546464e-05, + "loss": 0.2602, + "step": 55200 + }, + { + "epoch": 2.4541049917766813, + "grad_norm": 0.16331234574317932, + "learning_rate": 2.403562265008498e-05, + "loss": 0.2599, + "step": 55210 + }, + { + "epoch": 2.4545494954882874, + "grad_norm": 0.12753824889659882, + "learning_rate": 2.4017961686225483e-05, + "loss": 0.2617, + "step": 55220 + }, + { + "epoch": 2.4549939991998935, + "grad_norm": 0.15208247303962708, + "learning_rate": 2.400030516198568e-05, + "loss": 0.2606, + "step": 55230 + }, + { + "epoch": 2.455438502911499, + "grad_norm": 0.15925191342830658, + "learning_rate": 2.3982653080382673e-05, + "loss": 0.2615, + "step": 55240 + }, + { + "epoch": 2.455883006623105, + "grad_norm": 0.16682042181491852, + "learning_rate": 2.396500544443266e-05, + "loss": 0.2613, + "step": 55250 + }, + { + "epoch": 2.4563275103347113, + "grad_norm": 0.15420864522457123, + "learning_rate": 2.3947362257151156e-05, + "loss": 0.2611, + "step": 55260 + }, + { + "epoch": 2.4567720140463174, + "grad_norm": 0.13651619851589203, + "learning_rate": 2.39297235215529e-05, + "loss": 0.2581, + "step": 55270 + }, + { + "epoch": 2.4572165177579235, + "grad_norm": 0.14431189000606537, + "learning_rate": 2.3912089240651873e-05, + "loss": 0.2589, + "step": 55280 + }, + { + "epoch": 2.457661021469529, + "grad_norm": 0.19760988652706146, + "learning_rate": 2.389445941746129e-05, + "loss": 0.2608, + "step": 55290 + }, + { + "epoch": 2.4581055251811352, + "grad_norm": 0.1381232738494873, + "learning_rate": 2.3876834054993625e-05, + "loss": 0.2617, + "step": 55300 + }, + { + "epoch": 2.4585500288927413, + "grad_norm": 0.146511048078537, + "learning_rate": 2.3859213156260522e-05, + "loss": 0.2594, + "step": 55310 + }, + { + "epoch": 2.4589945326043474, + "grad_norm": 0.13617442548274994, + "learning_rate": 2.3841596724272948e-05, + "loss": 0.26, + "step": 55320 + }, + { + "epoch": 2.459439036315953, + "grad_norm": 0.1447283923625946, + "learning_rate": 2.382398476204106e-05, + "loss": 0.259, + "step": 55330 + }, + { + "epoch": 2.459883540027559, + "grad_norm": 0.1539747416973114, + "learning_rate": 2.3806377272574254e-05, + "loss": 0.2567, + "step": 55340 + }, + { + "epoch": 2.4603280437391652, + "grad_norm": 0.1440715342760086, + "learning_rate": 2.3788774258881174e-05, + "loss": 0.2608, + "step": 55350 + }, + { + "epoch": 2.4607725474507713, + "grad_norm": 0.13934579491615295, + "learning_rate": 2.37711757239697e-05, + "loss": 0.2593, + "step": 55360 + }, + { + "epoch": 2.461217051162377, + "grad_norm": 0.1353597193956375, + "learning_rate": 2.3753581670846954e-05, + "loss": 0.2623, + "step": 55370 + }, + { + "epoch": 2.461661554873983, + "grad_norm": 0.14476802945137024, + "learning_rate": 2.3735992102519216e-05, + "loss": 0.2562, + "step": 55380 + }, + { + "epoch": 2.462106058585589, + "grad_norm": 0.15223734080791473, + "learning_rate": 2.371840702199215e-05, + "loss": 0.2593, + "step": 55390 + }, + { + "epoch": 2.4625505622971953, + "grad_norm": 0.1465080976486206, + "learning_rate": 2.3700826432270494e-05, + "loss": 0.2568, + "step": 55400 + }, + { + "epoch": 2.4629950660088014, + "grad_norm": 0.1776125133037567, + "learning_rate": 2.3683250336358326e-05, + "loss": 0.26, + "step": 55410 + }, + { + "epoch": 2.463439569720407, + "grad_norm": 0.14416633546352386, + "learning_rate": 2.3665678737258923e-05, + "loss": 0.2596, + "step": 55420 + }, + { + "epoch": 2.463884073432013, + "grad_norm": 0.1608067750930786, + "learning_rate": 2.3648111637974745e-05, + "loss": 0.2619, + "step": 55430 + }, + { + "epoch": 2.464328577143619, + "grad_norm": 0.10957292467355728, + "learning_rate": 2.36305490415076e-05, + "loss": 0.2598, + "step": 55440 + }, + { + "epoch": 2.4647730808552253, + "grad_norm": 0.15387719869613647, + "learning_rate": 2.3612990950858392e-05, + "loss": 0.2583, + "step": 55450 + }, + { + "epoch": 2.4652175845668314, + "grad_norm": 0.15740253031253815, + "learning_rate": 2.359543736902735e-05, + "loss": 0.2624, + "step": 55460 + }, + { + "epoch": 2.465662088278437, + "grad_norm": 0.16016998887062073, + "learning_rate": 2.3577888299013896e-05, + "loss": 0.2582, + "step": 55470 + }, + { + "epoch": 2.466106591990043, + "grad_norm": 0.14672446250915527, + "learning_rate": 2.3560343743816683e-05, + "loss": 0.2589, + "step": 55480 + }, + { + "epoch": 2.466551095701649, + "grad_norm": 0.1451312005519867, + "learning_rate": 2.354280370643362e-05, + "loss": 0.2593, + "step": 55490 + }, + { + "epoch": 2.466995599413255, + "grad_norm": 0.13712884485721588, + "learning_rate": 2.3525268189861777e-05, + "loss": 0.2599, + "step": 55500 + }, + { + "epoch": 2.467440103124861, + "grad_norm": 0.17660830914974213, + "learning_rate": 2.3507737197097513e-05, + "loss": 0.2603, + "step": 55510 + }, + { + "epoch": 2.467884606836467, + "grad_norm": 0.17110715806484222, + "learning_rate": 2.34902107311364e-05, + "loss": 0.2596, + "step": 55520 + }, + { + "epoch": 2.468329110548073, + "grad_norm": 0.15432626008987427, + "learning_rate": 2.3472688794973225e-05, + "loss": 0.26, + "step": 55530 + }, + { + "epoch": 2.4687736142596792, + "grad_norm": 0.1380581110715866, + "learning_rate": 2.3455171391602016e-05, + "loss": 0.2607, + "step": 55540 + }, + { + "epoch": 2.469218117971285, + "grad_norm": 0.16348671913146973, + "learning_rate": 2.343765852401601e-05, + "loss": 0.2599, + "step": 55550 + }, + { + "epoch": 2.469662621682891, + "grad_norm": 0.15979065001010895, + "learning_rate": 2.342015019520768e-05, + "loss": 0.2605, + "step": 55560 + }, + { + "epoch": 2.470107125394497, + "grad_norm": 0.14922718703746796, + "learning_rate": 2.3402646408168742e-05, + "loss": 0.2576, + "step": 55570 + }, + { + "epoch": 2.470551629106103, + "grad_norm": 0.161453515291214, + "learning_rate": 2.3385147165890074e-05, + "loss": 0.2591, + "step": 55580 + }, + { + "epoch": 2.4709961328177092, + "grad_norm": 0.14720036089420319, + "learning_rate": 2.336765247136184e-05, + "loss": 0.2574, + "step": 55590 + }, + { + "epoch": 2.471440636529315, + "grad_norm": 0.15272285044193268, + "learning_rate": 2.33501623275734e-05, + "loss": 0.2575, + "step": 55600 + }, + { + "epoch": 2.471885140240921, + "grad_norm": 0.1384705901145935, + "learning_rate": 2.333267673751334e-05, + "loss": 0.2605, + "step": 55610 + }, + { + "epoch": 2.472329643952527, + "grad_norm": 0.1486244797706604, + "learning_rate": 2.33151957041695e-05, + "loss": 0.2594, + "step": 55620 + }, + { + "epoch": 2.472774147664133, + "grad_norm": 0.1464647650718689, + "learning_rate": 2.329771923052884e-05, + "loss": 0.2619, + "step": 55630 + }, + { + "epoch": 2.473218651375739, + "grad_norm": 0.12252145260572433, + "learning_rate": 2.3280247319577697e-05, + "loss": 0.2615, + "step": 55640 + }, + { + "epoch": 2.473663155087345, + "grad_norm": 0.13282501697540283, + "learning_rate": 2.3262779974301473e-05, + "loss": 0.2603, + "step": 55650 + }, + { + "epoch": 2.474107658798951, + "grad_norm": 0.1374673992395401, + "learning_rate": 2.3245317197684895e-05, + "loss": 0.2603, + "step": 55660 + }, + { + "epoch": 2.474552162510557, + "grad_norm": 0.12443745881319046, + "learning_rate": 2.322785899271188e-05, + "loss": 0.2605, + "step": 55670 + }, + { + "epoch": 2.4749966662221627, + "grad_norm": 0.12060022354125977, + "learning_rate": 2.3210405362365507e-05, + "loss": 0.2579, + "step": 55680 + }, + { + "epoch": 2.475441169933769, + "grad_norm": 0.17161665856838226, + "learning_rate": 2.31929563096282e-05, + "loss": 0.2584, + "step": 55690 + }, + { + "epoch": 2.475885673645375, + "grad_norm": 0.3878685534000397, + "learning_rate": 2.317551183748146e-05, + "loss": 0.2588, + "step": 55700 + }, + { + "epoch": 2.476330177356981, + "grad_norm": 0.1297599971294403, + "learning_rate": 2.3158071948906103e-05, + "loss": 0.2579, + "step": 55710 + }, + { + "epoch": 2.476774681068587, + "grad_norm": 0.1659054011106491, + "learning_rate": 2.314063664688212e-05, + "loss": 0.2598, + "step": 55720 + }, + { + "epoch": 2.4772191847801928, + "grad_norm": 0.11688219755887985, + "learning_rate": 2.3123205934388725e-05, + "loss": 0.2603, + "step": 55730 + }, + { + "epoch": 2.477663688491799, + "grad_norm": 0.11454391479492188, + "learning_rate": 2.310577981440436e-05, + "loss": 0.2586, + "step": 55740 + }, + { + "epoch": 2.478108192203405, + "grad_norm": 0.14853672683238983, + "learning_rate": 2.3088358289906665e-05, + "loss": 0.2607, + "step": 55750 + }, + { + "epoch": 2.478552695915011, + "grad_norm": 0.1438562124967575, + "learning_rate": 2.307094136387252e-05, + "loss": 0.2611, + "step": 55760 + }, + { + "epoch": 2.478997199626617, + "grad_norm": 0.1454857736825943, + "learning_rate": 2.305352903927796e-05, + "loss": 0.2615, + "step": 55770 + }, + { + "epoch": 2.479441703338223, + "grad_norm": 0.1396774798631668, + "learning_rate": 2.303612131909831e-05, + "loss": 0.261, + "step": 55780 + }, + { + "epoch": 2.479886207049829, + "grad_norm": 0.13490772247314453, + "learning_rate": 2.3018718206308054e-05, + "loss": 0.259, + "step": 55790 + }, + { + "epoch": 2.480330710761435, + "grad_norm": 0.14251719415187836, + "learning_rate": 2.3001319703880925e-05, + "loss": 0.2606, + "step": 55800 + }, + { + "epoch": 2.4807752144730406, + "grad_norm": 0.1321466565132141, + "learning_rate": 2.2983925814789835e-05, + "loss": 0.2594, + "step": 55810 + }, + { + "epoch": 2.4812197181846467, + "grad_norm": 0.1415586769580841, + "learning_rate": 2.2966536542006957e-05, + "loss": 0.2625, + "step": 55820 + }, + { + "epoch": 2.481664221896253, + "grad_norm": 0.14873260259628296, + "learning_rate": 2.294915188850358e-05, + "loss": 0.2585, + "step": 55830 + }, + { + "epoch": 2.482108725607859, + "grad_norm": 0.1616569608449936, + "learning_rate": 2.2931771857250333e-05, + "loss": 0.2619, + "step": 55840 + }, + { + "epoch": 2.482553229319465, + "grad_norm": 0.12264154851436615, + "learning_rate": 2.2914396451216946e-05, + "loss": 0.2585, + "step": 55850 + }, + { + "epoch": 2.4829977330310706, + "grad_norm": 0.12884792685508728, + "learning_rate": 2.2897025673372412e-05, + "loss": 0.2567, + "step": 55860 + }, + { + "epoch": 2.4834422367426767, + "grad_norm": 0.15027520060539246, + "learning_rate": 2.287965952668494e-05, + "loss": 0.2597, + "step": 55870 + }, + { + "epoch": 2.483886740454283, + "grad_norm": 0.15314900875091553, + "learning_rate": 2.2862298014121873e-05, + "loss": 0.262, + "step": 55880 + }, + { + "epoch": 2.484331244165889, + "grad_norm": 0.15864701569080353, + "learning_rate": 2.28449411386499e-05, + "loss": 0.2591, + "step": 55890 + }, + { + "epoch": 2.484775747877495, + "grad_norm": 0.14662782847881317, + "learning_rate": 2.282758890323477e-05, + "loss": 0.2605, + "step": 55900 + }, + { + "epoch": 2.4852202515891006, + "grad_norm": 0.14959348738193512, + "learning_rate": 2.2810241310841528e-05, + "loss": 0.2587, + "step": 55910 + }, + { + "epoch": 2.4856647553007067, + "grad_norm": 0.15702274441719055, + "learning_rate": 2.2792898364434412e-05, + "loss": 0.2608, + "step": 55920 + }, + { + "epoch": 2.486109259012313, + "grad_norm": 0.15419720113277435, + "learning_rate": 2.2775560066976843e-05, + "loss": 0.2596, + "step": 55930 + }, + { + "epoch": 2.486553762723919, + "grad_norm": 0.15559975802898407, + "learning_rate": 2.275822642143147e-05, + "loss": 0.2571, + "step": 55940 + }, + { + "epoch": 2.4869982664355246, + "grad_norm": 0.18086759746074677, + "learning_rate": 2.274089743076014e-05, + "loss": 0.2604, + "step": 55950 + }, + { + "epoch": 2.4874427701471307, + "grad_norm": 0.13479657471179962, + "learning_rate": 2.2723573097923917e-05, + "loss": 0.2588, + "step": 55960 + }, + { + "epoch": 2.4878872738587368, + "grad_norm": 0.16053277254104614, + "learning_rate": 2.2706253425883024e-05, + "loss": 0.2576, + "step": 55970 + }, + { + "epoch": 2.488331777570343, + "grad_norm": 0.17249397933483124, + "learning_rate": 2.2688938417596933e-05, + "loss": 0.261, + "step": 55980 + }, + { + "epoch": 2.4887762812819485, + "grad_norm": 0.13642564415931702, + "learning_rate": 2.2671628076024305e-05, + "loss": 0.2588, + "step": 55990 + }, + { + "epoch": 2.4892207849935546, + "grad_norm": 0.15503531694412231, + "learning_rate": 2.2654322404123008e-05, + "loss": 0.2611, + "step": 56000 + }, + { + "epoch": 2.4896652887051607, + "grad_norm": 0.1561664193868637, + "learning_rate": 2.2637021404850105e-05, + "loss": 0.2621, + "step": 56010 + }, + { + "epoch": 2.4901097924167668, + "grad_norm": 0.14780588448047638, + "learning_rate": 2.2619725081161885e-05, + "loss": 0.2594, + "step": 56020 + }, + { + "epoch": 2.490554296128373, + "grad_norm": 0.1402932107448578, + "learning_rate": 2.260243343601376e-05, + "loss": 0.2595, + "step": 56030 + }, + { + "epoch": 2.4909987998399785, + "grad_norm": 0.14303672313690186, + "learning_rate": 2.2585146472360473e-05, + "loss": 0.2594, + "step": 56040 + }, + { + "epoch": 2.4914433035515846, + "grad_norm": 0.16216282546520233, + "learning_rate": 2.2567864193155834e-05, + "loss": 0.2587, + "step": 56050 + }, + { + "epoch": 2.4918878072631907, + "grad_norm": 0.14728349447250366, + "learning_rate": 2.255058660135294e-05, + "loss": 0.2557, + "step": 56060 + }, + { + "epoch": 2.492332310974797, + "grad_norm": 0.143804132938385, + "learning_rate": 2.253331369990407e-05, + "loss": 0.261, + "step": 56070 + }, + { + "epoch": 2.4927768146864024, + "grad_norm": 0.13778334856033325, + "learning_rate": 2.251604549176063e-05, + "loss": 0.2581, + "step": 56080 + }, + { + "epoch": 2.4932213183980085, + "grad_norm": 0.13082082569599152, + "learning_rate": 2.249878197987337e-05, + "loss": 0.2578, + "step": 56090 + }, + { + "epoch": 2.4936658221096146, + "grad_norm": 0.1541392058134079, + "learning_rate": 2.2481523167192087e-05, + "loss": 0.2586, + "step": 56100 + }, + { + "epoch": 2.4941103258212207, + "grad_norm": 0.18775199353694916, + "learning_rate": 2.2464269056665867e-05, + "loss": 0.2599, + "step": 56110 + }, + { + "epoch": 2.4945548295328264, + "grad_norm": 0.15978747606277466, + "learning_rate": 2.2447019651242958e-05, + "loss": 0.2597, + "step": 56120 + }, + { + "epoch": 2.4949993332444325, + "grad_norm": 0.16080673038959503, + "learning_rate": 2.2429774953870818e-05, + "loss": 0.2576, + "step": 56130 + }, + { + "epoch": 2.4954438369560386, + "grad_norm": 0.16404496133327484, + "learning_rate": 2.241253496749611e-05, + "loss": 0.2586, + "step": 56140 + }, + { + "epoch": 2.4958883406676446, + "grad_norm": 0.1505802720785141, + "learning_rate": 2.2395299695064614e-05, + "loss": 0.2632, + "step": 56150 + }, + { + "epoch": 2.4963328443792507, + "grad_norm": 0.14416664838790894, + "learning_rate": 2.237806913952145e-05, + "loss": 0.2574, + "step": 56160 + }, + { + "epoch": 2.4967773480908564, + "grad_norm": 0.160038024187088, + "learning_rate": 2.2360843303810798e-05, + "loss": 0.2594, + "step": 56170 + }, + { + "epoch": 2.4972218518024625, + "grad_norm": 0.16517207026481628, + "learning_rate": 2.2343622190876084e-05, + "loss": 0.2594, + "step": 56180 + }, + { + "epoch": 2.4976663555140686, + "grad_norm": 0.14321503043174744, + "learning_rate": 2.2326405803659935e-05, + "loss": 0.2579, + "step": 56190 + }, + { + "epoch": 2.4981108592256747, + "grad_norm": 0.13554392755031586, + "learning_rate": 2.230919414510416e-05, + "loss": 0.259, + "step": 56200 + }, + { + "epoch": 2.4985553629372808, + "grad_norm": 0.1578301340341568, + "learning_rate": 2.229198721814976e-05, + "loss": 0.26, + "step": 56210 + }, + { + "epoch": 2.4989998666488864, + "grad_norm": 0.14666315913200378, + "learning_rate": 2.2274785025736948e-05, + "loss": 0.2608, + "step": 56220 + }, + { + "epoch": 2.4994443703604925, + "grad_norm": 0.14957663416862488, + "learning_rate": 2.225758757080507e-05, + "loss": 0.2605, + "step": 56230 + }, + { + "epoch": 2.4998888740720986, + "grad_norm": 0.16387583315372467, + "learning_rate": 2.2240394856292723e-05, + "loss": 0.2589, + "step": 56240 + }, + { + "epoch": 2.5003333777837042, + "grad_norm": 0.13333946466445923, + "learning_rate": 2.2223206885137664e-05, + "loss": 0.2598, + "step": 56250 + }, + { + "epoch": 2.5007778814953103, + "grad_norm": 0.14775444567203522, + "learning_rate": 2.2206023660276853e-05, + "loss": 0.2592, + "step": 56260 + }, + { + "epoch": 2.5012223852069164, + "grad_norm": 0.15233947336673737, + "learning_rate": 2.218884518464645e-05, + "loss": 0.26, + "step": 56270 + }, + { + "epoch": 2.5016668889185225, + "grad_norm": 0.18171505630016327, + "learning_rate": 2.2171671461181732e-05, + "loss": 0.259, + "step": 56280 + }, + { + "epoch": 2.5021113926301286, + "grad_norm": 0.13166511058807373, + "learning_rate": 2.2154502492817292e-05, + "loss": 0.2591, + "step": 56290 + }, + { + "epoch": 2.5025558963417343, + "grad_norm": 0.1251315325498581, + "learning_rate": 2.2137338282486782e-05, + "loss": 0.2587, + "step": 56300 + }, + { + "epoch": 2.5030004000533403, + "grad_norm": 0.12873053550720215, + "learning_rate": 2.2120178833123113e-05, + "loss": 0.2587, + "step": 56310 + }, + { + "epoch": 2.5034449037649464, + "grad_norm": 0.13082453608512878, + "learning_rate": 2.210302414765838e-05, + "loss": 0.2592, + "step": 56320 + }, + { + "epoch": 2.5038894074765525, + "grad_norm": 0.1399814635515213, + "learning_rate": 2.2085874229023808e-05, + "loss": 0.2576, + "step": 56330 + }, + { + "epoch": 2.5043339111881586, + "grad_norm": 0.1648130714893341, + "learning_rate": 2.2068729080149907e-05, + "loss": 0.2605, + "step": 56340 + }, + { + "epoch": 2.5047784148997643, + "grad_norm": 0.15914295613765717, + "learning_rate": 2.205158870396625e-05, + "loss": 0.2597, + "step": 56350 + }, + { + "epoch": 2.5052229186113704, + "grad_norm": 0.13686250150203705, + "learning_rate": 2.2034453103401732e-05, + "loss": 0.2594, + "step": 56360 + }, + { + "epoch": 2.5056674223229765, + "grad_norm": 0.15366582572460175, + "learning_rate": 2.20173222813843e-05, + "loss": 0.2595, + "step": 56370 + }, + { + "epoch": 2.5061119260345825, + "grad_norm": 0.18722093105316162, + "learning_rate": 2.200019624084116e-05, + "loss": 0.2599, + "step": 56380 + }, + { + "epoch": 2.5065564297461886, + "grad_norm": 0.14378605782985687, + "learning_rate": 2.1983074984698687e-05, + "loss": 0.2605, + "step": 56390 + }, + { + "epoch": 2.5070009334577943, + "grad_norm": 0.1524839550256729, + "learning_rate": 2.1965958515882433e-05, + "loss": 0.258, + "step": 56400 + }, + { + "epoch": 2.5074454371694004, + "grad_norm": 0.12654677033424377, + "learning_rate": 2.1948846837317162e-05, + "loss": 0.2574, + "step": 56410 + }, + { + "epoch": 2.5078899408810065, + "grad_norm": 0.14380285143852234, + "learning_rate": 2.1931739951926738e-05, + "loss": 0.2605, + "step": 56420 + }, + { + "epoch": 2.508334444592612, + "grad_norm": 0.14379915595054626, + "learning_rate": 2.191463786263429e-05, + "loss": 0.2617, + "step": 56430 + }, + { + "epoch": 2.508778948304218, + "grad_norm": 0.1691800206899643, + "learning_rate": 2.1897540572362095e-05, + "loss": 0.2599, + "step": 56440 + }, + { + "epoch": 2.5092234520158243, + "grad_norm": 0.13768263161182404, + "learning_rate": 2.1880448084031614e-05, + "loss": 0.258, + "step": 56450 + }, + { + "epoch": 2.5096679557274304, + "grad_norm": 0.1418009102344513, + "learning_rate": 2.1863360400563482e-05, + "loss": 0.2579, + "step": 56460 + }, + { + "epoch": 2.5101124594390365, + "grad_norm": 0.12389552593231201, + "learning_rate": 2.184627752487754e-05, + "loss": 0.259, + "step": 56470 + }, + { + "epoch": 2.510556963150642, + "grad_norm": 0.1388818919658661, + "learning_rate": 2.1829199459892725e-05, + "loss": 0.2593, + "step": 56480 + }, + { + "epoch": 2.5110014668622482, + "grad_norm": 0.12663781642913818, + "learning_rate": 2.1812126208527282e-05, + "loss": 0.2624, + "step": 56490 + }, + { + "epoch": 2.5114459705738543, + "grad_norm": 0.1581004410982132, + "learning_rate": 2.179505777369852e-05, + "loss": 0.2594, + "step": 56500 + }, + { + "epoch": 2.5118904742854604, + "grad_norm": 0.15101972222328186, + "learning_rate": 2.1777994158322974e-05, + "loss": 0.2599, + "step": 56510 + }, + { + "epoch": 2.5123349779970665, + "grad_norm": 0.15418650209903717, + "learning_rate": 2.176093536531637e-05, + "loss": 0.2609, + "step": 56520 + }, + { + "epoch": 2.512779481708672, + "grad_norm": 0.18084241449832916, + "learning_rate": 2.1743881397593537e-05, + "loss": 0.2614, + "step": 56530 + }, + { + "epoch": 2.5132239854202783, + "grad_norm": 0.12858116626739502, + "learning_rate": 2.1726832258068595e-05, + "loss": 0.2584, + "step": 56540 + }, + { + "epoch": 2.5136684891318843, + "grad_norm": 0.1410510390996933, + "learning_rate": 2.170978794965472e-05, + "loss": 0.2571, + "step": 56550 + }, + { + "epoch": 2.51411299284349, + "grad_norm": 0.13741958141326904, + "learning_rate": 2.169274847526438e-05, + "loss": 0.2592, + "step": 56560 + }, + { + "epoch": 2.514557496555096, + "grad_norm": 0.1454247534275055, + "learning_rate": 2.1675713837809103e-05, + "loss": 0.2585, + "step": 56570 + }, + { + "epoch": 2.515002000266702, + "grad_norm": 0.13992546498775482, + "learning_rate": 2.1658684040199655e-05, + "loss": 0.2586, + "step": 56580 + }, + { + "epoch": 2.5154465039783083, + "grad_norm": 0.16317489743232727, + "learning_rate": 2.1641659085345974e-05, + "loss": 0.2567, + "step": 56590 + }, + { + "epoch": 2.5158910076899144, + "grad_norm": 0.13494713604450226, + "learning_rate": 2.1624638976157154e-05, + "loss": 0.2593, + "step": 56600 + }, + { + "epoch": 2.51633551140152, + "grad_norm": 0.18208780884742737, + "learning_rate": 2.1607623715541476e-05, + "loss": 0.2592, + "step": 56610 + }, + { + "epoch": 2.516780015113126, + "grad_norm": 0.1364254355430603, + "learning_rate": 2.159061330640636e-05, + "loss": 0.2595, + "step": 56620 + }, + { + "epoch": 2.517224518824732, + "grad_norm": 0.15330061316490173, + "learning_rate": 2.1573607751658425e-05, + "loss": 0.262, + "step": 56630 + }, + { + "epoch": 2.5176690225363383, + "grad_norm": 0.13819590210914612, + "learning_rate": 2.155660705420347e-05, + "loss": 0.2608, + "step": 56640 + }, + { + "epoch": 2.5181135262479444, + "grad_norm": 0.13402603566646576, + "learning_rate": 2.153961121694644e-05, + "loss": 0.2583, + "step": 56650 + }, + { + "epoch": 2.51855802995955, + "grad_norm": 0.13414619863033295, + "learning_rate": 2.152262024279145e-05, + "loss": 0.2596, + "step": 56660 + }, + { + "epoch": 2.519002533671156, + "grad_norm": 0.16025739908218384, + "learning_rate": 2.150563413464183e-05, + "loss": 0.2621, + "step": 56670 + }, + { + "epoch": 2.519447037382762, + "grad_norm": 0.1339605152606964, + "learning_rate": 2.1488652895399974e-05, + "loss": 0.2595, + "step": 56680 + }, + { + "epoch": 2.5198915410943683, + "grad_norm": 0.14715313911437988, + "learning_rate": 2.1471676527967587e-05, + "loss": 0.2573, + "step": 56690 + }, + { + "epoch": 2.5203360448059744, + "grad_norm": 0.16231031715869904, + "learning_rate": 2.1454705035245414e-05, + "loss": 0.2601, + "step": 56700 + }, + { + "epoch": 2.52078054851758, + "grad_norm": 0.17047250270843506, + "learning_rate": 2.143773842013343e-05, + "loss": 0.2603, + "step": 56710 + }, + { + "epoch": 2.521225052229186, + "grad_norm": 0.1622290313243866, + "learning_rate": 2.1420776685530796e-05, + "loss": 0.258, + "step": 56720 + }, + { + "epoch": 2.5216695559407922, + "grad_norm": 0.15118861198425293, + "learning_rate": 2.1403819834335742e-05, + "loss": 0.2568, + "step": 56730 + }, + { + "epoch": 2.522114059652398, + "grad_norm": 0.137502059340477, + "learning_rate": 2.1386867869445808e-05, + "loss": 0.2576, + "step": 56740 + }, + { + "epoch": 2.522558563364004, + "grad_norm": 0.1315435916185379, + "learning_rate": 2.1369920793757548e-05, + "loss": 0.2589, + "step": 56750 + }, + { + "epoch": 2.52300306707561, + "grad_norm": 0.12687820196151733, + "learning_rate": 2.1352978610166828e-05, + "loss": 0.2595, + "step": 56760 + }, + { + "epoch": 2.523447570787216, + "grad_norm": 0.1244419664144516, + "learning_rate": 2.1336041321568546e-05, + "loss": 0.2632, + "step": 56770 + }, + { + "epoch": 2.5238920744988222, + "grad_norm": 0.1411494016647339, + "learning_rate": 2.131910893085684e-05, + "loss": 0.2613, + "step": 56780 + }, + { + "epoch": 2.524336578210428, + "grad_norm": 0.14764443039894104, + "learning_rate": 2.1302181440925012e-05, + "loss": 0.2557, + "step": 56790 + }, + { + "epoch": 2.524781081922034, + "grad_norm": 0.14080263674259186, + "learning_rate": 2.128525885466546e-05, + "loss": 0.2565, + "step": 56800 + }, + { + "epoch": 2.52522558563364, + "grad_norm": 0.15979443490505219, + "learning_rate": 2.1268341174969847e-05, + "loss": 0.2578, + "step": 56810 + }, + { + "epoch": 2.525670089345246, + "grad_norm": 0.141262024641037, + "learning_rate": 2.1251428404728894e-05, + "loss": 0.2589, + "step": 56820 + }, + { + "epoch": 2.5261145930568523, + "grad_norm": 0.13829496502876282, + "learning_rate": 2.123452054683256e-05, + "loss": 0.2582, + "step": 56830 + }, + { + "epoch": 2.526559096768458, + "grad_norm": 0.11697995662689209, + "learning_rate": 2.1217617604169927e-05, + "loss": 0.2591, + "step": 56840 + }, + { + "epoch": 2.527003600480064, + "grad_norm": 0.13424240052700043, + "learning_rate": 2.120071957962924e-05, + "loss": 0.2582, + "step": 56850 + }, + { + "epoch": 2.52744810419167, + "grad_norm": 0.12432042509317398, + "learning_rate": 2.1183826476097917e-05, + "loss": 0.2579, + "step": 56860 + }, + { + "epoch": 2.5278926079032757, + "grad_norm": 0.11512116342782974, + "learning_rate": 2.116693829646254e-05, + "loss": 0.2586, + "step": 56870 + }, + { + "epoch": 2.528337111614882, + "grad_norm": 0.13463594019412994, + "learning_rate": 2.1150055043608806e-05, + "loss": 0.2568, + "step": 56880 + }, + { + "epoch": 2.528781615326488, + "grad_norm": 0.14455893635749817, + "learning_rate": 2.113317672042162e-05, + "loss": 0.2581, + "step": 56890 + }, + { + "epoch": 2.529226119038094, + "grad_norm": 0.1782844364643097, + "learning_rate": 2.1116303329785025e-05, + "loss": 0.2568, + "step": 56900 + }, + { + "epoch": 2.5296706227497, + "grad_norm": 0.15750382840633392, + "learning_rate": 2.109943487458222e-05, + "loss": 0.2601, + "step": 56910 + }, + { + "epoch": 2.5301151264613058, + "grad_norm": 0.13480708003044128, + "learning_rate": 2.1082571357695574e-05, + "loss": 0.2585, + "step": 56920 + }, + { + "epoch": 2.530559630172912, + "grad_norm": 0.1478302776813507, + "learning_rate": 2.1065712782006557e-05, + "loss": 0.2581, + "step": 56930 + }, + { + "epoch": 2.531004133884518, + "grad_norm": 0.14937308430671692, + "learning_rate": 2.104885915039591e-05, + "loss": 0.2619, + "step": 56940 + }, + { + "epoch": 2.531448637596124, + "grad_norm": 0.13735754787921906, + "learning_rate": 2.103201046574338e-05, + "loss": 0.2619, + "step": 56950 + }, + { + "epoch": 2.53189314130773, + "grad_norm": 0.15342584252357483, + "learning_rate": 2.1015166730928022e-05, + "loss": 0.2592, + "step": 56960 + }, + { + "epoch": 2.532337645019336, + "grad_norm": 0.15977726876735687, + "learning_rate": 2.0998327948827912e-05, + "loss": 0.2619, + "step": 56970 + }, + { + "epoch": 2.532782148730942, + "grad_norm": 0.14145579934120178, + "learning_rate": 2.0981494122320363e-05, + "loss": 0.2567, + "step": 56980 + }, + { + "epoch": 2.533226652442548, + "grad_norm": 0.1426016241312027, + "learning_rate": 2.0964665254281822e-05, + "loss": 0.2579, + "step": 56990 + }, + { + "epoch": 2.5336711561541536, + "grad_norm": 0.15228185057640076, + "learning_rate": 2.094784134758784e-05, + "loss": 0.2583, + "step": 57000 + }, + { + "epoch": 2.53411565986576, + "grad_norm": 0.1352381706237793, + "learning_rate": 2.0931022405113226e-05, + "loss": 0.2577, + "step": 57010 + }, + { + "epoch": 2.534560163577366, + "grad_norm": 0.15063446760177612, + "learning_rate": 2.091420842973183e-05, + "loss": 0.2595, + "step": 57020 + }, + { + "epoch": 2.535004667288972, + "grad_norm": 0.15080448985099792, + "learning_rate": 2.0897399424316715e-05, + "loss": 0.2587, + "step": 57030 + }, + { + "epoch": 2.535449171000578, + "grad_norm": 0.13412673771381378, + "learning_rate": 2.0880595391740078e-05, + "loss": 0.2596, + "step": 57040 + }, + { + "epoch": 2.5358936747121836, + "grad_norm": 0.13640007376670837, + "learning_rate": 2.0863796334873277e-05, + "loss": 0.2589, + "step": 57050 + }, + { + "epoch": 2.5363381784237897, + "grad_norm": 0.13555610179901123, + "learning_rate": 2.08470022565868e-05, + "loss": 0.2572, + "step": 57060 + }, + { + "epoch": 2.536782682135396, + "grad_norm": 0.1449539214372635, + "learning_rate": 2.0830213159750317e-05, + "loss": 0.2589, + "step": 57070 + }, + { + "epoch": 2.537227185847002, + "grad_norm": 0.14937955141067505, + "learning_rate": 2.0813429047232596e-05, + "loss": 0.2595, + "step": 57080 + }, + { + "epoch": 2.537671689558608, + "grad_norm": 0.13699959218502045, + "learning_rate": 2.0796649921901594e-05, + "loss": 0.2584, + "step": 57090 + }, + { + "epoch": 2.5381161932702136, + "grad_norm": 0.12042135745286942, + "learning_rate": 2.077987578662441e-05, + "loss": 0.2556, + "step": 57100 + }, + { + "epoch": 2.5385606969818197, + "grad_norm": 0.14034387469291687, + "learning_rate": 2.0763106644267277e-05, + "loss": 0.2605, + "step": 57110 + }, + { + "epoch": 2.539005200693426, + "grad_norm": 0.16901051998138428, + "learning_rate": 2.0746342497695607e-05, + "loss": 0.2583, + "step": 57120 + }, + { + "epoch": 2.539449704405032, + "grad_norm": 0.14233268797397614, + "learning_rate": 2.0729583349773886e-05, + "loss": 0.2602, + "step": 57130 + }, + { + "epoch": 2.539894208116638, + "grad_norm": 0.1377968192100525, + "learning_rate": 2.0712829203365853e-05, + "loss": 0.2597, + "step": 57140 + }, + { + "epoch": 2.5403387118282437, + "grad_norm": 0.15117453038692474, + "learning_rate": 2.0696080061334267e-05, + "loss": 0.2621, + "step": 57150 + }, + { + "epoch": 2.5407832155398498, + "grad_norm": 0.14923708140850067, + "learning_rate": 2.067933592654117e-05, + "loss": 0.2589, + "step": 57160 + }, + { + "epoch": 2.541227719251456, + "grad_norm": 0.12985843420028687, + "learning_rate": 2.066259680184763e-05, + "loss": 0.2611, + "step": 57170 + }, + { + "epoch": 2.5416722229630615, + "grad_norm": 0.14367114007472992, + "learning_rate": 2.0645862690113908e-05, + "loss": 0.2589, + "step": 57180 + }, + { + "epoch": 2.5421167266746676, + "grad_norm": 0.16333316266536713, + "learning_rate": 2.0629133594199436e-05, + "loss": 0.2573, + "step": 57190 + }, + { + "epoch": 2.5425612303862737, + "grad_norm": 0.15919901430606842, + "learning_rate": 2.0612409516962704e-05, + "loss": 0.2603, + "step": 57200 + }, + { + "epoch": 2.5430057340978798, + "grad_norm": 0.16201692819595337, + "learning_rate": 2.0595690461261467e-05, + "loss": 0.258, + "step": 57210 + }, + { + "epoch": 2.543450237809486, + "grad_norm": 0.1381838023662567, + "learning_rate": 2.0578976429952503e-05, + "loss": 0.2577, + "step": 57220 + }, + { + "epoch": 2.5438947415210915, + "grad_norm": 0.13592417538166046, + "learning_rate": 2.0562267425891802e-05, + "loss": 0.2584, + "step": 57230 + }, + { + "epoch": 2.5443392452326976, + "grad_norm": 0.1438094675540924, + "learning_rate": 2.0545563451934467e-05, + "loss": 0.2593, + "step": 57240 + }, + { + "epoch": 2.5447837489443037, + "grad_norm": 0.13471615314483643, + "learning_rate": 2.0528864510934764e-05, + "loss": 0.2591, + "step": 57250 + }, + { + "epoch": 2.54522825265591, + "grad_norm": 0.12808190286159515, + "learning_rate": 2.0512170605746096e-05, + "loss": 0.2599, + "step": 57260 + }, + { + "epoch": 2.545672756367516, + "grad_norm": 0.1319545954465866, + "learning_rate": 2.0495481739220963e-05, + "loss": 0.2555, + "step": 57270 + }, + { + "epoch": 2.5461172600791215, + "grad_norm": 0.17080636322498322, + "learning_rate": 2.0478797914211045e-05, + "loss": 0.2601, + "step": 57280 + }, + { + "epoch": 2.5465617637907276, + "grad_norm": 0.1362050175666809, + "learning_rate": 2.046211913356716e-05, + "loss": 0.2582, + "step": 57290 + }, + { + "epoch": 2.5470062675023337, + "grad_norm": 0.15584778785705566, + "learning_rate": 2.0445445400139247e-05, + "loss": 0.2584, + "step": 57300 + }, + { + "epoch": 2.5474507712139394, + "grad_norm": 0.13547968864440918, + "learning_rate": 2.0428776716776405e-05, + "loss": 0.2582, + "step": 57310 + }, + { + "epoch": 2.547895274925546, + "grad_norm": 0.1650974303483963, + "learning_rate": 2.0412113086326856e-05, + "loss": 0.259, + "step": 57320 + }, + { + "epoch": 2.5483397786371516, + "grad_norm": 0.13346876204013824, + "learning_rate": 2.0395454511637918e-05, + "loss": 0.2574, + "step": 57330 + }, + { + "epoch": 2.5487842823487576, + "grad_norm": 0.14327789843082428, + "learning_rate": 2.037880099555616e-05, + "loss": 0.258, + "step": 57340 + }, + { + "epoch": 2.5492287860603637, + "grad_norm": 0.14157193899154663, + "learning_rate": 2.0362152540927144e-05, + "loss": 0.2566, + "step": 57350 + }, + { + "epoch": 2.5496732897719694, + "grad_norm": 0.12432114779949188, + "learning_rate": 2.0345509150595666e-05, + "loss": 0.256, + "step": 57360 + }, + { + "epoch": 2.5501177934835755, + "grad_norm": 0.12024202197790146, + "learning_rate": 2.0328870827405617e-05, + "loss": 0.2586, + "step": 57370 + }, + { + "epoch": 2.5505622971951816, + "grad_norm": 0.15698741376399994, + "learning_rate": 2.0312237574200043e-05, + "loss": 0.2553, + "step": 57380 + }, + { + "epoch": 2.5510068009067877, + "grad_norm": 0.1404125988483429, + "learning_rate": 2.029560939382112e-05, + "loss": 0.2577, + "step": 57390 + }, + { + "epoch": 2.5514513046183938, + "grad_norm": 0.15386143326759338, + "learning_rate": 2.0278986289110097e-05, + "loss": 0.2567, + "step": 57400 + }, + { + "epoch": 2.5518958083299994, + "grad_norm": 0.1579085737466812, + "learning_rate": 2.0262368262907484e-05, + "loss": 0.2607, + "step": 57410 + }, + { + "epoch": 2.5523403120416055, + "grad_norm": 0.13231994211673737, + "learning_rate": 2.024575531805279e-05, + "loss": 0.259, + "step": 57420 + }, + { + "epoch": 2.5527848157532116, + "grad_norm": 0.14626067876815796, + "learning_rate": 2.0229147457384735e-05, + "loss": 0.26, + "step": 57430 + }, + { + "epoch": 2.5532293194648177, + "grad_norm": 0.11396131664514542, + "learning_rate": 2.0212544683741157e-05, + "loss": 0.2584, + "step": 57440 + }, + { + "epoch": 2.5536738231764238, + "grad_norm": 0.11985598504543304, + "learning_rate": 2.0195946999958976e-05, + "loss": 0.2581, + "step": 57450 + }, + { + "epoch": 2.5541183268880294, + "grad_norm": 0.15035226941108704, + "learning_rate": 2.017935440887434e-05, + "loss": 0.2564, + "step": 57460 + }, + { + "epoch": 2.5545628305996355, + "grad_norm": 0.1297147125005722, + "learning_rate": 2.0162766913322423e-05, + "loss": 0.259, + "step": 57470 + }, + { + "epoch": 2.5550073343112416, + "grad_norm": 0.14253467321395874, + "learning_rate": 2.0146184516137588e-05, + "loss": 0.2585, + "step": 57480 + }, + { + "epoch": 2.5554518380228473, + "grad_norm": 0.14578713476657867, + "learning_rate": 2.012960722015332e-05, + "loss": 0.2585, + "step": 57490 + }, + { + "epoch": 2.5558963417344533, + "grad_norm": 0.14917393028736115, + "learning_rate": 2.0113035028202214e-05, + "loss": 0.2601, + "step": 57500 + }, + { + "epoch": 2.5563408454460594, + "grad_norm": 0.1391851305961609, + "learning_rate": 2.009646794311602e-05, + "loss": 0.2618, + "step": 57510 + }, + { + "epoch": 2.5567853491576655, + "grad_norm": 0.14508801698684692, + "learning_rate": 2.007990596772559e-05, + "loss": 0.2577, + "step": 57520 + }, + { + "epoch": 2.5572298528692716, + "grad_norm": 0.1435338407754898, + "learning_rate": 2.0063349104860923e-05, + "loss": 0.2577, + "step": 57530 + }, + { + "epoch": 2.5576743565808773, + "grad_norm": 0.12670952081680298, + "learning_rate": 2.0046797357351116e-05, + "loss": 0.2595, + "step": 57540 + }, + { + "epoch": 2.5581188602924834, + "grad_norm": 0.15251223742961884, + "learning_rate": 2.0030250728024412e-05, + "loss": 0.2588, + "step": 57550 + }, + { + "epoch": 2.5585633640040895, + "grad_norm": 0.14237426221370697, + "learning_rate": 2.001370921970819e-05, + "loss": 0.2605, + "step": 57560 + }, + { + "epoch": 2.5590078677156956, + "grad_norm": 0.14682918787002563, + "learning_rate": 1.9997172835228932e-05, + "loss": 0.2578, + "step": 57570 + }, + { + "epoch": 2.5594523714273016, + "grad_norm": 0.14895698428153992, + "learning_rate": 1.9980641577412262e-05, + "loss": 0.2609, + "step": 57580 + }, + { + "epoch": 2.5598968751389073, + "grad_norm": 0.1617768555879593, + "learning_rate": 1.9964115449082925e-05, + "loss": 0.2574, + "step": 57590 + }, + { + "epoch": 2.5603413788505134, + "grad_norm": 0.1270231306552887, + "learning_rate": 1.9947594453064742e-05, + "loss": 0.2585, + "step": 57600 + }, + { + "epoch": 2.5607858825621195, + "grad_norm": 0.16494978964328766, + "learning_rate": 1.9931078592180774e-05, + "loss": 0.2559, + "step": 57610 + }, + { + "epoch": 2.561230386273725, + "grad_norm": 0.12890665233135223, + "learning_rate": 1.9914567869253065e-05, + "loss": 0.2596, + "step": 57620 + }, + { + "epoch": 2.561674889985331, + "grad_norm": 0.15740150213241577, + "learning_rate": 1.989806228710287e-05, + "loss": 0.2584, + "step": 57630 + }, + { + "epoch": 2.5621193936969373, + "grad_norm": 0.1245487630367279, + "learning_rate": 1.9881561848550555e-05, + "loss": 0.2569, + "step": 57640 + }, + { + "epoch": 2.5625638974085434, + "grad_norm": 0.14623592793941498, + "learning_rate": 1.9865066556415544e-05, + "loss": 0.2609, + "step": 57650 + }, + { + "epoch": 2.5630084011201495, + "grad_norm": 0.13908377289772034, + "learning_rate": 1.98485764135165e-05, + "loss": 0.2559, + "step": 57660 + }, + { + "epoch": 2.563452904831755, + "grad_norm": 0.1316874772310257, + "learning_rate": 1.983209142267109e-05, + "loss": 0.2577, + "step": 57670 + }, + { + "epoch": 2.5638974085433612, + "grad_norm": 0.16626329720020294, + "learning_rate": 1.9815611586696165e-05, + "loss": 0.2579, + "step": 57680 + }, + { + "epoch": 2.5643419122549673, + "grad_norm": 0.15408232808113098, + "learning_rate": 1.9799136908407667e-05, + "loss": 0.2576, + "step": 57690 + }, + { + "epoch": 2.5647864159665734, + "grad_norm": 0.1508876234292984, + "learning_rate": 1.9782667390620678e-05, + "loss": 0.2578, + "step": 57700 + }, + { + "epoch": 2.5652309196781795, + "grad_norm": 0.14518077671527863, + "learning_rate": 1.976620303614939e-05, + "loss": 0.2591, + "step": 57710 + }, + { + "epoch": 2.565675423389785, + "grad_norm": 0.13963621854782104, + "learning_rate": 1.9749743847807108e-05, + "loss": 0.2588, + "step": 57720 + }, + { + "epoch": 2.5661199271013913, + "grad_norm": 0.18021538853645325, + "learning_rate": 1.9733289828406272e-05, + "loss": 0.2613, + "step": 57730 + }, + { + "epoch": 2.5665644308129973, + "grad_norm": 0.12877391278743744, + "learning_rate": 1.9716840980758382e-05, + "loss": 0.2592, + "step": 57740 + }, + { + "epoch": 2.5670089345246034, + "grad_norm": 0.14113885164260864, + "learning_rate": 1.9700397307674134e-05, + "loss": 0.2575, + "step": 57750 + }, + { + "epoch": 2.5674534382362095, + "grad_norm": 0.13748441636562347, + "learning_rate": 1.968395881196328e-05, + "loss": 0.2606, + "step": 57760 + }, + { + "epoch": 2.567897941947815, + "grad_norm": 0.13726039230823517, + "learning_rate": 1.966752549643473e-05, + "loss": 0.2577, + "step": 57770 + }, + { + "epoch": 2.5683424456594213, + "grad_norm": 0.12363076210021973, + "learning_rate": 1.965109736389647e-05, + "loss": 0.259, + "step": 57780 + }, + { + "epoch": 2.5687869493710274, + "grad_norm": 0.14361847937107086, + "learning_rate": 1.9634674417155645e-05, + "loss": 0.2588, + "step": 57790 + }, + { + "epoch": 2.569231453082633, + "grad_norm": 0.13852134346961975, + "learning_rate": 1.9618256659018434e-05, + "loss": 0.2616, + "step": 57800 + }, + { + "epoch": 2.569675956794239, + "grad_norm": 0.14778700470924377, + "learning_rate": 1.9601844092290257e-05, + "loss": 0.2586, + "step": 57810 + }, + { + "epoch": 2.570120460505845, + "grad_norm": 0.1254405826330185, + "learning_rate": 1.9585436719775512e-05, + "loss": 0.2589, + "step": 57820 + }, + { + "epoch": 2.5705649642174513, + "grad_norm": 0.13533638417720795, + "learning_rate": 1.9569034544277793e-05, + "loss": 0.2576, + "step": 57830 + }, + { + "epoch": 2.5710094679290574, + "grad_norm": 0.15735755860805511, + "learning_rate": 1.9552637568599798e-05, + "loss": 0.2579, + "step": 57840 + }, + { + "epoch": 2.571453971640663, + "grad_norm": 0.1594971865415573, + "learning_rate": 1.953624579554327e-05, + "loss": 0.2551, + "step": 57850 + }, + { + "epoch": 2.571898475352269, + "grad_norm": 0.134843647480011, + "learning_rate": 1.951985922790918e-05, + "loss": 0.2597, + "step": 57860 + }, + { + "epoch": 2.572342979063875, + "grad_norm": 0.12639112770557404, + "learning_rate": 1.9503477868497505e-05, + "loss": 0.2586, + "step": 57870 + }, + { + "epoch": 2.5727874827754813, + "grad_norm": 0.1301959604024887, + "learning_rate": 1.9487101720107375e-05, + "loss": 0.2573, + "step": 57880 + }, + { + "epoch": 2.5732319864870874, + "grad_norm": 0.1253037303686142, + "learning_rate": 1.9470730785537032e-05, + "loss": 0.2579, + "step": 57890 + }, + { + "epoch": 2.573676490198693, + "grad_norm": 0.12327878922224045, + "learning_rate": 1.9454365067583823e-05, + "loss": 0.2583, + "step": 57900 + }, + { + "epoch": 2.574120993910299, + "grad_norm": 0.13507981598377228, + "learning_rate": 1.9438004569044215e-05, + "loss": 0.2596, + "step": 57910 + }, + { + "epoch": 2.5745654976219052, + "grad_norm": 0.1349710077047348, + "learning_rate": 1.9421649292713724e-05, + "loss": 0.2573, + "step": 57920 + }, + { + "epoch": 2.575010001333511, + "grad_norm": 0.12491349130868912, + "learning_rate": 1.9405299241387076e-05, + "loss": 0.2576, + "step": 57930 + }, + { + "epoch": 2.575454505045117, + "grad_norm": 0.12099231034517288, + "learning_rate": 1.9388954417858007e-05, + "loss": 0.2573, + "step": 57940 + }, + { + "epoch": 2.575899008756723, + "grad_norm": 0.15301240980625153, + "learning_rate": 1.9372614824919417e-05, + "loss": 0.259, + "step": 57950 + }, + { + "epoch": 2.576343512468329, + "grad_norm": 0.1448858231306076, + "learning_rate": 1.9356280465363284e-05, + "loss": 0.2577, + "step": 57960 + }, + { + "epoch": 2.5767880161799352, + "grad_norm": 0.14880208671092987, + "learning_rate": 1.9339951341980723e-05, + "loss": 0.2602, + "step": 57970 + }, + { + "epoch": 2.577232519891541, + "grad_norm": 0.14195221662521362, + "learning_rate": 1.9323627457561916e-05, + "loss": 0.2577, + "step": 57980 + }, + { + "epoch": 2.577677023603147, + "grad_norm": 0.15505331754684448, + "learning_rate": 1.9307308814896198e-05, + "loss": 0.2586, + "step": 57990 + }, + { + "epoch": 2.578121527314753, + "grad_norm": 0.1373174637556076, + "learning_rate": 1.9290995416771935e-05, + "loss": 0.2578, + "step": 58000 + }, + { + "epoch": 2.578566031026359, + "grad_norm": 0.1354530155658722, + "learning_rate": 1.9274687265976665e-05, + "loss": 0.2585, + "step": 58010 + }, + { + "epoch": 2.5790105347379653, + "grad_norm": 0.13981805741786957, + "learning_rate": 1.9258384365297e-05, + "loss": 0.2553, + "step": 58020 + }, + { + "epoch": 2.579455038449571, + "grad_norm": 0.13408997654914856, + "learning_rate": 1.924208671751866e-05, + "loss": 0.2564, + "step": 58030 + }, + { + "epoch": 2.579899542161177, + "grad_norm": 0.14605741202831268, + "learning_rate": 1.9225794325426492e-05, + "loss": 0.2605, + "step": 58040 + }, + { + "epoch": 2.580344045872783, + "grad_norm": 0.13863752782344818, + "learning_rate": 1.920950719180436e-05, + "loss": 0.2557, + "step": 58050 + }, + { + "epoch": 2.580788549584389, + "grad_norm": 0.13970504701137543, + "learning_rate": 1.919322531943536e-05, + "loss": 0.2583, + "step": 58060 + }, + { + "epoch": 2.5812330532959953, + "grad_norm": 0.17505645751953125, + "learning_rate": 1.917694871110157e-05, + "loss": 0.2591, + "step": 58070 + }, + { + "epoch": 2.581677557007601, + "grad_norm": 0.1349313110113144, + "learning_rate": 1.9160677369584234e-05, + "loss": 0.2579, + "step": 58080 + }, + { + "epoch": 2.582122060719207, + "grad_norm": 0.13921742141246796, + "learning_rate": 1.9144411297663694e-05, + "loss": 0.2583, + "step": 58090 + }, + { + "epoch": 2.582566564430813, + "grad_norm": 0.15114696323871613, + "learning_rate": 1.9128150498119328e-05, + "loss": 0.2595, + "step": 58100 + }, + { + "epoch": 2.5830110681424188, + "grad_norm": 0.1418963223695755, + "learning_rate": 1.9111894973729726e-05, + "loss": 0.2596, + "step": 58110 + }, + { + "epoch": 2.583455571854025, + "grad_norm": 0.14544376730918884, + "learning_rate": 1.9095644727272454e-05, + "loss": 0.2588, + "step": 58120 + }, + { + "epoch": 2.583900075565631, + "grad_norm": 0.14984576404094696, + "learning_rate": 1.907939976152429e-05, + "loss": 0.2615, + "step": 58130 + }, + { + "epoch": 2.584344579277237, + "grad_norm": 0.15008631348609924, + "learning_rate": 1.906316007926101e-05, + "loss": 0.2581, + "step": 58140 + }, + { + "epoch": 2.584789082988843, + "grad_norm": 0.12939275801181793, + "learning_rate": 1.904692568325755e-05, + "loss": 0.2561, + "step": 58150 + }, + { + "epoch": 2.585233586700449, + "grad_norm": 0.12248633056879044, + "learning_rate": 1.9030696576287925e-05, + "loss": 0.259, + "step": 58160 + }, + { + "epoch": 2.585678090412055, + "grad_norm": 0.14189454913139343, + "learning_rate": 1.9014472761125242e-05, + "loss": 0.2564, + "step": 58170 + }, + { + "epoch": 2.586122594123661, + "grad_norm": 0.11461863666772842, + "learning_rate": 1.899825424054172e-05, + "loss": 0.259, + "step": 58180 + }, + { + "epoch": 2.586567097835267, + "grad_norm": 0.13528230786323547, + "learning_rate": 1.898204101730863e-05, + "loss": 0.2575, + "step": 58190 + }, + { + "epoch": 2.587011601546873, + "grad_norm": 0.12133082747459412, + "learning_rate": 1.8965833094196394e-05, + "loss": 0.2572, + "step": 58200 + }, + { + "epoch": 2.587456105258479, + "grad_norm": 0.12427672743797302, + "learning_rate": 1.8949630473974495e-05, + "loss": 0.2551, + "step": 58210 + }, + { + "epoch": 2.587900608970085, + "grad_norm": 0.129355788230896, + "learning_rate": 1.8933433159411517e-05, + "loss": 0.2588, + "step": 58220 + }, + { + "epoch": 2.588345112681691, + "grad_norm": 0.13746440410614014, + "learning_rate": 1.891724115327514e-05, + "loss": 0.2587, + "step": 58230 + }, + { + "epoch": 2.5887896163932966, + "grad_norm": 0.11675186455249786, + "learning_rate": 1.8901054458332152e-05, + "loss": 0.2603, + "step": 58240 + }, + { + "epoch": 2.5892341201049027, + "grad_norm": 0.1456165611743927, + "learning_rate": 1.8884873077348364e-05, + "loss": 0.2577, + "step": 58250 + }, + { + "epoch": 2.589678623816509, + "grad_norm": 0.12416241317987442, + "learning_rate": 1.88686970130888e-05, + "loss": 0.2558, + "step": 58260 + }, + { + "epoch": 2.590123127528115, + "grad_norm": 0.13267667591571808, + "learning_rate": 1.8852526268317456e-05, + "loss": 0.2587, + "step": 58270 + }, + { + "epoch": 2.590567631239721, + "grad_norm": 0.1318807601928711, + "learning_rate": 1.883636084579749e-05, + "loss": 0.2575, + "step": 58280 + }, + { + "epoch": 2.5910121349513267, + "grad_norm": 0.14942193031311035, + "learning_rate": 1.8820200748291145e-05, + "loss": 0.2593, + "step": 58290 + }, + { + "epoch": 2.5914566386629327, + "grad_norm": 0.13394971191883087, + "learning_rate": 1.8804045978559686e-05, + "loss": 0.2564, + "step": 58300 + }, + { + "epoch": 2.591901142374539, + "grad_norm": 0.1449703574180603, + "learning_rate": 1.8787896539363594e-05, + "loss": 0.2579, + "step": 58310 + }, + { + "epoch": 2.592345646086145, + "grad_norm": 0.18392013013362885, + "learning_rate": 1.8771752433462298e-05, + "loss": 0.2589, + "step": 58320 + }, + { + "epoch": 2.592790149797751, + "grad_norm": 0.13605576753616333, + "learning_rate": 1.8755613663614447e-05, + "loss": 0.2568, + "step": 58330 + }, + { + "epoch": 2.5932346535093567, + "grad_norm": 0.13083279132843018, + "learning_rate": 1.873948023257767e-05, + "loss": 0.2601, + "step": 58340 + }, + { + "epoch": 2.5936791572209628, + "grad_norm": 0.14663748443126678, + "learning_rate": 1.8723352143108747e-05, + "loss": 0.2592, + "step": 58350 + }, + { + "epoch": 2.594123660932569, + "grad_norm": 0.13812921941280365, + "learning_rate": 1.870722939796352e-05, + "loss": 0.2597, + "step": 58360 + }, + { + "epoch": 2.5945681646441745, + "grad_norm": 0.1663462370634079, + "learning_rate": 1.869111199989693e-05, + "loss": 0.2594, + "step": 58370 + }, + { + "epoch": 2.595012668355781, + "grad_norm": 0.17797522246837616, + "learning_rate": 1.8674999951663018e-05, + "loss": 0.2598, + "step": 58380 + }, + { + "epoch": 2.5954571720673867, + "grad_norm": 0.13579201698303223, + "learning_rate": 1.8658893256014857e-05, + "loss": 0.2573, + "step": 58390 + }, + { + "epoch": 2.595901675778993, + "grad_norm": 0.15085048973560333, + "learning_rate": 1.8642791915704655e-05, + "loss": 0.2594, + "step": 58400 + }, + { + "epoch": 2.596346179490599, + "grad_norm": 0.12026708573102951, + "learning_rate": 1.8626695933483697e-05, + "loss": 0.2568, + "step": 58410 + }, + { + "epoch": 2.5967906832022045, + "grad_norm": 0.1396855115890503, + "learning_rate": 1.8610605312102335e-05, + "loss": 0.2584, + "step": 58420 + }, + { + "epoch": 2.5972351869138106, + "grad_norm": 0.14944997429847717, + "learning_rate": 1.8594520054310032e-05, + "loss": 0.2572, + "step": 58430 + }, + { + "epoch": 2.5976796906254167, + "grad_norm": 0.142926424741745, + "learning_rate": 1.8578440162855326e-05, + "loss": 0.2601, + "step": 58440 + }, + { + "epoch": 2.598124194337023, + "grad_norm": 0.14013074338436127, + "learning_rate": 1.8562365640485774e-05, + "loss": 0.2553, + "step": 58450 + }, + { + "epoch": 2.598568698048629, + "grad_norm": 0.13817253708839417, + "learning_rate": 1.854629648994815e-05, + "loss": 0.259, + "step": 58460 + }, + { + "epoch": 2.5990132017602345, + "grad_norm": 0.12501458823680878, + "learning_rate": 1.8530232713988183e-05, + "loss": 0.2546, + "step": 58470 + }, + { + "epoch": 2.5994577054718406, + "grad_norm": 0.18143609166145325, + "learning_rate": 1.851417431535074e-05, + "loss": 0.2602, + "step": 58480 + }, + { + "epoch": 2.5999022091834467, + "grad_norm": 0.1425943374633789, + "learning_rate": 1.8498121296779785e-05, + "loss": 0.2569, + "step": 58490 + }, + { + "epoch": 2.600346712895053, + "grad_norm": 0.13544179499149323, + "learning_rate": 1.848207366101829e-05, + "loss": 0.2546, + "step": 58500 + }, + { + "epoch": 2.600791216606659, + "grad_norm": 0.12061214447021484, + "learning_rate": 1.8466031410808422e-05, + "loss": 0.2578, + "step": 58510 + }, + { + "epoch": 2.6012357203182646, + "grad_norm": 0.14856742322444916, + "learning_rate": 1.844999454889129e-05, + "loss": 0.2577, + "step": 58520 + }, + { + "epoch": 2.6016802240298706, + "grad_norm": 0.14400476217269897, + "learning_rate": 1.8433963078007237e-05, + "loss": 0.2561, + "step": 58530 + }, + { + "epoch": 2.6021247277414767, + "grad_norm": 0.16355673968791962, + "learning_rate": 1.841793700089554e-05, + "loss": 0.2578, + "step": 58540 + }, + { + "epoch": 2.6025692314530824, + "grad_norm": 0.13559240102767944, + "learning_rate": 1.8401916320294644e-05, + "loss": 0.2552, + "step": 58550 + }, + { + "epoch": 2.6030137351646885, + "grad_norm": 0.16555656492710114, + "learning_rate": 1.838590103894205e-05, + "loss": 0.2609, + "step": 58560 + }, + { + "epoch": 2.6034582388762946, + "grad_norm": 0.11196870356798172, + "learning_rate": 1.8369891159574292e-05, + "loss": 0.2579, + "step": 58570 + }, + { + "epoch": 2.6039027425879007, + "grad_norm": 0.1685989499092102, + "learning_rate": 1.835388668492708e-05, + "loss": 0.2588, + "step": 58580 + }, + { + "epoch": 2.6043472462995068, + "grad_norm": 0.1291351467370987, + "learning_rate": 1.8337887617735095e-05, + "loss": 0.2557, + "step": 58590 + }, + { + "epoch": 2.6047917500111124, + "grad_norm": 0.162295401096344, + "learning_rate": 1.8321893960732157e-05, + "loss": 0.2586, + "step": 58600 + }, + { + "epoch": 2.6052362537227185, + "grad_norm": 0.1432657688856125, + "learning_rate": 1.8305905716651138e-05, + "loss": 0.2621, + "step": 58610 + }, + { + "epoch": 2.6056807574343246, + "grad_norm": 0.16648653149604797, + "learning_rate": 1.8289922888224e-05, + "loss": 0.2584, + "step": 58620 + }, + { + "epoch": 2.6061252611459307, + "grad_norm": 0.1258690357208252, + "learning_rate": 1.8273945478181765e-05, + "loss": 0.2566, + "step": 58630 + }, + { + "epoch": 2.6065697648575368, + "grad_norm": 0.1485162228345871, + "learning_rate": 1.8257973489254558e-05, + "loss": 0.2599, + "step": 58640 + }, + { + "epoch": 2.6070142685691424, + "grad_norm": 0.12320176512002945, + "learning_rate": 1.824200692417152e-05, + "loss": 0.2573, + "step": 58650 + }, + { + "epoch": 2.6074587722807485, + "grad_norm": 0.14483389258384705, + "learning_rate": 1.8226045785660912e-05, + "loss": 0.2532, + "step": 58660 + }, + { + "epoch": 2.6079032759923546, + "grad_norm": 0.1374175250530243, + "learning_rate": 1.821009007645006e-05, + "loss": 0.2581, + "step": 58670 + }, + { + "epoch": 2.6083477797039603, + "grad_norm": 0.16512495279312134, + "learning_rate": 1.8194139799265357e-05, + "loss": 0.261, + "step": 58680 + }, + { + "epoch": 2.608792283415567, + "grad_norm": 0.14202335476875305, + "learning_rate": 1.8178194956832295e-05, + "loss": 0.26, + "step": 58690 + }, + { + "epoch": 2.6092367871271724, + "grad_norm": 0.1133420392870903, + "learning_rate": 1.8162255551875346e-05, + "loss": 0.2585, + "step": 58700 + }, + { + "epoch": 2.6096812908387785, + "grad_norm": 0.1327621340751648, + "learning_rate": 1.8146321587118194e-05, + "loss": 0.2585, + "step": 58710 + }, + { + "epoch": 2.6101257945503846, + "grad_norm": 0.13379846513271332, + "learning_rate": 1.8130393065283448e-05, + "loss": 0.257, + "step": 58720 + }, + { + "epoch": 2.6105702982619903, + "grad_norm": 0.12968070805072784, + "learning_rate": 1.8114469989092925e-05, + "loss": 0.2537, + "step": 58730 + }, + { + "epoch": 2.6110148019735964, + "grad_norm": 0.1644061952829361, + "learning_rate": 1.8098552361267397e-05, + "loss": 0.2582, + "step": 58740 + }, + { + "epoch": 2.6114593056852025, + "grad_norm": 0.14513720571994781, + "learning_rate": 1.8082640184526763e-05, + "loss": 0.2574, + "step": 58750 + }, + { + "epoch": 2.6119038093968086, + "grad_norm": 0.14395186305046082, + "learning_rate": 1.806673346158999e-05, + "loss": 0.2575, + "step": 58760 + }, + { + "epoch": 2.6123483131084146, + "grad_norm": 0.12668265402317047, + "learning_rate": 1.8050832195175067e-05, + "loss": 0.255, + "step": 58770 + }, + { + "epoch": 2.6127928168200203, + "grad_norm": 0.13935600221157074, + "learning_rate": 1.8034936387999136e-05, + "loss": 0.2602, + "step": 58780 + }, + { + "epoch": 2.6132373205316264, + "grad_norm": 0.1381150484085083, + "learning_rate": 1.8019046042778315e-05, + "loss": 0.2558, + "step": 58790 + }, + { + "epoch": 2.6136818242432325, + "grad_norm": 0.15117113292217255, + "learning_rate": 1.800316116222785e-05, + "loss": 0.2555, + "step": 58800 + }, + { + "epoch": 2.6141263279548386, + "grad_norm": 0.13310515880584717, + "learning_rate": 1.7987281749062018e-05, + "loss": 0.2566, + "step": 58810 + }, + { + "epoch": 2.6145708316664447, + "grad_norm": 0.14736177027225494, + "learning_rate": 1.7971407805994195e-05, + "loss": 0.2585, + "step": 58820 + }, + { + "epoch": 2.6150153353780503, + "grad_norm": 0.1391032338142395, + "learning_rate": 1.7955539335736787e-05, + "loss": 0.258, + "step": 58830 + }, + { + "epoch": 2.6154598390896564, + "grad_norm": 0.14220532774925232, + "learning_rate": 1.7939676341001304e-05, + "loss": 0.2551, + "step": 58840 + }, + { + "epoch": 2.6159043428012625, + "grad_norm": 0.14680300652980804, + "learning_rate": 1.7923818824498275e-05, + "loss": 0.2562, + "step": 58850 + }, + { + "epoch": 2.616348846512868, + "grad_norm": 0.1800345778465271, + "learning_rate": 1.7907966788937315e-05, + "loss": 0.2615, + "step": 58860 + }, + { + "epoch": 2.6167933502244742, + "grad_norm": 0.16554823517799377, + "learning_rate": 1.7892120237027116e-05, + "loss": 0.2561, + "step": 58870 + }, + { + "epoch": 2.6172378539360803, + "grad_norm": 0.12668035924434662, + "learning_rate": 1.7876279171475413e-05, + "loss": 0.2547, + "step": 58880 + }, + { + "epoch": 2.6176823576476864, + "grad_norm": 0.1526336669921875, + "learning_rate": 1.7860443594989028e-05, + "loss": 0.2562, + "step": 58890 + }, + { + "epoch": 2.6181268613592925, + "grad_norm": 0.1524258404970169, + "learning_rate": 1.784461351027379e-05, + "loss": 0.2583, + "step": 58900 + }, + { + "epoch": 2.618571365070898, + "grad_norm": 0.12579157948493958, + "learning_rate": 1.7828788920034677e-05, + "loss": 0.2586, + "step": 58910 + }, + { + "epoch": 2.6190158687825043, + "grad_norm": 0.14846853911876678, + "learning_rate": 1.7812969826975623e-05, + "loss": 0.2596, + "step": 58920 + }, + { + "epoch": 2.6194603724941103, + "grad_norm": 0.12560024857521057, + "learning_rate": 1.7797156233799738e-05, + "loss": 0.2555, + "step": 58930 + }, + { + "epoch": 2.6199048762057164, + "grad_norm": 0.11079704761505127, + "learning_rate": 1.7781348143209094e-05, + "loss": 0.2607, + "step": 58940 + }, + { + "epoch": 2.6203493799173225, + "grad_norm": 0.14915847778320312, + "learning_rate": 1.7765545557904862e-05, + "loss": 0.2587, + "step": 58950 + }, + { + "epoch": 2.620793883628928, + "grad_norm": 0.14025302231311798, + "learning_rate": 1.7749748480587302e-05, + "loss": 0.2582, + "step": 58960 + }, + { + "epoch": 2.6212383873405343, + "grad_norm": 0.13021321594715118, + "learning_rate": 1.773395691395564e-05, + "loss": 0.2586, + "step": 58970 + }, + { + "epoch": 2.6216828910521404, + "grad_norm": 0.13828790187835693, + "learning_rate": 1.7718170860708305e-05, + "loss": 0.2594, + "step": 58980 + }, + { + "epoch": 2.622127394763746, + "grad_norm": 0.13261575996875763, + "learning_rate": 1.770239032354264e-05, + "loss": 0.2578, + "step": 58990 + }, + { + "epoch": 2.622571898475352, + "grad_norm": 0.13567514717578888, + "learning_rate": 1.7686615305155124e-05, + "loss": 0.2557, + "step": 59000 + }, + { + "epoch": 2.623016402186958, + "grad_norm": 0.12659995257854462, + "learning_rate": 1.767084580824128e-05, + "loss": 0.2552, + "step": 59010 + }, + { + "epoch": 2.6234609058985643, + "grad_norm": 0.14262735843658447, + "learning_rate": 1.765508183549569e-05, + "loss": 0.2591, + "step": 59020 + }, + { + "epoch": 2.6239054096101704, + "grad_norm": 0.1316583752632141, + "learning_rate": 1.763932338961199e-05, + "loss": 0.258, + "step": 59030 + }, + { + "epoch": 2.624349913321776, + "grad_norm": 0.12452832609415054, + "learning_rate": 1.762357047328284e-05, + "loss": 0.2584, + "step": 59040 + }, + { + "epoch": 2.624794417033382, + "grad_norm": 0.1394549012184143, + "learning_rate": 1.7607823089199997e-05, + "loss": 0.2595, + "step": 59050 + }, + { + "epoch": 2.625238920744988, + "grad_norm": 0.11332371085882187, + "learning_rate": 1.7592081240054265e-05, + "loss": 0.2575, + "step": 59060 + }, + { + "epoch": 2.6256834244565943, + "grad_norm": 0.15282361209392548, + "learning_rate": 1.7576344928535488e-05, + "loss": 0.256, + "step": 59070 + }, + { + "epoch": 2.6261279281682004, + "grad_norm": 0.13564477860927582, + "learning_rate": 1.7560614157332573e-05, + "loss": 0.2571, + "step": 59080 + }, + { + "epoch": 2.626572431879806, + "grad_norm": 0.11747431010007858, + "learning_rate": 1.7544888929133495e-05, + "loss": 0.2559, + "step": 59090 + }, + { + "epoch": 2.627016935591412, + "grad_norm": 0.15244260430335999, + "learning_rate": 1.752916924662522e-05, + "loss": 0.2588, + "step": 59100 + }, + { + "epoch": 2.6274614393030182, + "grad_norm": 0.16911368072032928, + "learning_rate": 1.751345511249387e-05, + "loss": 0.2579, + "step": 59110 + }, + { + "epoch": 2.6279059430146243, + "grad_norm": 0.12687353789806366, + "learning_rate": 1.749774652942452e-05, + "loss": 0.2563, + "step": 59120 + }, + { + "epoch": 2.6283504467262304, + "grad_norm": 0.13830314576625824, + "learning_rate": 1.748204350010135e-05, + "loss": 0.2578, + "step": 59130 + }, + { + "epoch": 2.628794950437836, + "grad_norm": 0.125711128115654, + "learning_rate": 1.7466346027207574e-05, + "loss": 0.2566, + "step": 59140 + }, + { + "epoch": 2.629239454149442, + "grad_norm": 0.1213197112083435, + "learning_rate": 1.745065411342547e-05, + "loss": 0.2579, + "step": 59150 + }, + { + "epoch": 2.6296839578610482, + "grad_norm": 0.13305696845054626, + "learning_rate": 1.7434967761436366e-05, + "loss": 0.2615, + "step": 59160 + }, + { + "epoch": 2.630128461572654, + "grad_norm": 0.1303785741329193, + "learning_rate": 1.741928697392058e-05, + "loss": 0.2588, + "step": 59170 + }, + { + "epoch": 2.63057296528426, + "grad_norm": 0.13249632716178894, + "learning_rate": 1.7403611753557597e-05, + "loss": 0.2564, + "step": 59180 + }, + { + "epoch": 2.631017468995866, + "grad_norm": 0.12789848446846008, + "learning_rate": 1.738794210302584e-05, + "loss": 0.2597, + "step": 59190 + }, + { + "epoch": 2.631461972707472, + "grad_norm": 0.12286687642335892, + "learning_rate": 1.7372278025002837e-05, + "loss": 0.2581, + "step": 59200 + }, + { + "epoch": 2.6319064764190783, + "grad_norm": 0.13338090479373932, + "learning_rate": 1.7356619522165164e-05, + "loss": 0.2566, + "step": 59210 + }, + { + "epoch": 2.632350980130684, + "grad_norm": 0.13344936072826385, + "learning_rate": 1.7340966597188378e-05, + "loss": 0.257, + "step": 59220 + }, + { + "epoch": 2.63279548384229, + "grad_norm": 0.1421816200017929, + "learning_rate": 1.732531925274722e-05, + "loss": 0.2557, + "step": 59230 + }, + { + "epoch": 2.633239987553896, + "grad_norm": 0.13602469861507416, + "learning_rate": 1.7309677491515318e-05, + "loss": 0.26, + "step": 59240 + }, + { + "epoch": 2.633684491265502, + "grad_norm": 0.12352374941110611, + "learning_rate": 1.7294041316165455e-05, + "loss": 0.2563, + "step": 59250 + }, + { + "epoch": 2.6341289949771083, + "grad_norm": 0.1648363322019577, + "learning_rate": 1.727841072936942e-05, + "loss": 0.2581, + "step": 59260 + }, + { + "epoch": 2.634573498688714, + "grad_norm": 0.17000220715999603, + "learning_rate": 1.7262785733798058e-05, + "loss": 0.2579, + "step": 59270 + }, + { + "epoch": 2.63501800240032, + "grad_norm": 0.15906067192554474, + "learning_rate": 1.7247166332121246e-05, + "loss": 0.2598, + "step": 59280 + }, + { + "epoch": 2.635462506111926, + "grad_norm": 0.13343185186386108, + "learning_rate": 1.7231552527007933e-05, + "loss": 0.258, + "step": 59290 + }, + { + "epoch": 2.6359070098235318, + "grad_norm": 0.15006427466869354, + "learning_rate": 1.721594432112606e-05, + "loss": 0.2575, + "step": 59300 + }, + { + "epoch": 2.636351513535138, + "grad_norm": 0.136173278093338, + "learning_rate": 1.7200341717142655e-05, + "loss": 0.257, + "step": 59310 + }, + { + "epoch": 2.636796017246744, + "grad_norm": 0.1647479385137558, + "learning_rate": 1.718474471772377e-05, + "loss": 0.2574, + "step": 59320 + }, + { + "epoch": 2.63724052095835, + "grad_norm": 0.14773611724376678, + "learning_rate": 1.7169153325534528e-05, + "loss": 0.2567, + "step": 59330 + }, + { + "epoch": 2.637685024669956, + "grad_norm": 0.14281630516052246, + "learning_rate": 1.7153567543239045e-05, + "loss": 0.2562, + "step": 59340 + }, + { + "epoch": 2.638129528381562, + "grad_norm": 0.14333060383796692, + "learning_rate": 1.7137987373500525e-05, + "loss": 0.2559, + "step": 59350 + }, + { + "epoch": 2.638574032093168, + "grad_norm": 0.1217353567481041, + "learning_rate": 1.7122412818981198e-05, + "loss": 0.2594, + "step": 59360 + }, + { + "epoch": 2.639018535804774, + "grad_norm": 0.15818186104297638, + "learning_rate": 1.7106843882342283e-05, + "loss": 0.2578, + "step": 59370 + }, + { + "epoch": 2.63946303951638, + "grad_norm": 0.1404886096715927, + "learning_rate": 1.709128056624415e-05, + "loss": 0.2584, + "step": 59380 + }, + { + "epoch": 2.639907543227986, + "grad_norm": 0.15723110735416412, + "learning_rate": 1.70757228733461e-05, + "loss": 0.2591, + "step": 59390 + }, + { + "epoch": 2.640352046939592, + "grad_norm": 0.13348442316055298, + "learning_rate": 1.706017080630653e-05, + "loss": 0.2558, + "step": 59400 + }, + { + "epoch": 2.640796550651198, + "grad_norm": 0.1460338681936264, + "learning_rate": 1.7044624367782873e-05, + "loss": 0.2563, + "step": 59410 + }, + { + "epoch": 2.641241054362804, + "grad_norm": 0.12011642754077911, + "learning_rate": 1.7029083560431553e-05, + "loss": 0.2566, + "step": 59420 + }, + { + "epoch": 2.64168555807441, + "grad_norm": 0.13610760867595673, + "learning_rate": 1.701354838690813e-05, + "loss": 0.2589, + "step": 59430 + }, + { + "epoch": 2.642130061786016, + "grad_norm": 0.1311655193567276, + "learning_rate": 1.6998018849867087e-05, + "loss": 0.2573, + "step": 59440 + }, + { + "epoch": 2.642574565497622, + "grad_norm": 0.17993582785129547, + "learning_rate": 1.698249495196202e-05, + "loss": 0.2565, + "step": 59450 + }, + { + "epoch": 2.643019069209228, + "grad_norm": 0.1353447288274765, + "learning_rate": 1.6966976695845528e-05, + "loss": 0.2565, + "step": 59460 + }, + { + "epoch": 2.643463572920834, + "grad_norm": 0.11764068156480789, + "learning_rate": 1.6951464084169268e-05, + "loss": 0.2556, + "step": 59470 + }, + { + "epoch": 2.6439080766324397, + "grad_norm": 0.1577184498310089, + "learning_rate": 1.6935957119583916e-05, + "loss": 0.2562, + "step": 59480 + }, + { + "epoch": 2.6443525803440457, + "grad_norm": 0.1452079564332962, + "learning_rate": 1.6920455804739205e-05, + "loss": 0.2585, + "step": 59490 + }, + { + "epoch": 2.644797084055652, + "grad_norm": 0.1519273966550827, + "learning_rate": 1.6904960142283855e-05, + "loss": 0.2567, + "step": 59500 + }, + { + "epoch": 2.645241587767258, + "grad_norm": 0.12536747753620148, + "learning_rate": 1.6889470134865666e-05, + "loss": 0.2562, + "step": 59510 + }, + { + "epoch": 2.645686091478864, + "grad_norm": 0.13497816026210785, + "learning_rate": 1.687398578513145e-05, + "loss": 0.256, + "step": 59520 + }, + { + "epoch": 2.6461305951904697, + "grad_norm": 0.12073882669210434, + "learning_rate": 1.6858507095727066e-05, + "loss": 0.2587, + "step": 59530 + }, + { + "epoch": 2.6465750989020758, + "grad_norm": 0.14459602534770966, + "learning_rate": 1.6843034069297403e-05, + "loss": 0.2576, + "step": 59540 + }, + { + "epoch": 2.647019602613682, + "grad_norm": 0.17854511737823486, + "learning_rate": 1.682756670848637e-05, + "loss": 0.2579, + "step": 59550 + }, + { + "epoch": 2.647464106325288, + "grad_norm": 0.17875435948371887, + "learning_rate": 1.6812105015936937e-05, + "loss": 0.2603, + "step": 59560 + }, + { + "epoch": 2.647908610036894, + "grad_norm": 0.1632152944803238, + "learning_rate": 1.6796648994291027e-05, + "loss": 0.2579, + "step": 59570 + }, + { + "epoch": 2.6483531137484997, + "grad_norm": 0.13888032734394073, + "learning_rate": 1.678119864618973e-05, + "loss": 0.2568, + "step": 59580 + }, + { + "epoch": 2.648797617460106, + "grad_norm": 0.14677250385284424, + "learning_rate": 1.6765753974273023e-05, + "loss": 0.2598, + "step": 59590 + }, + { + "epoch": 2.649242121171712, + "grad_norm": 0.14631807804107666, + "learning_rate": 1.675031498118001e-05, + "loss": 0.2559, + "step": 59600 + }, + { + "epoch": 2.6496866248833175, + "grad_norm": 0.12896551191806793, + "learning_rate": 1.6734881669548804e-05, + "loss": 0.2577, + "step": 59610 + }, + { + "epoch": 2.6501311285949236, + "grad_norm": 0.13180062174797058, + "learning_rate": 1.671945404201648e-05, + "loss": 0.2565, + "step": 59620 + }, + { + "epoch": 2.6505756323065297, + "grad_norm": 0.17162299156188965, + "learning_rate": 1.670403210121927e-05, + "loss": 0.2584, + "step": 59630 + }, + { + "epoch": 2.651020136018136, + "grad_norm": 0.1598387509584427, + "learning_rate": 1.6688615849792312e-05, + "loss": 0.2554, + "step": 59640 + }, + { + "epoch": 2.651464639729742, + "grad_norm": 0.13904467225074768, + "learning_rate": 1.6673205290369832e-05, + "loss": 0.258, + "step": 59650 + }, + { + "epoch": 2.6519091434413475, + "grad_norm": 0.14050361514091492, + "learning_rate": 1.6657800425585086e-05, + "loss": 0.2595, + "step": 59660 + }, + { + "epoch": 2.6523536471529536, + "grad_norm": 0.14207883179187775, + "learning_rate": 1.6642401258070327e-05, + "loss": 0.2577, + "step": 59670 + }, + { + "epoch": 2.6527981508645597, + "grad_norm": 0.1273978352546692, + "learning_rate": 1.6627007790456884e-05, + "loss": 0.256, + "step": 59680 + }, + { + "epoch": 2.653242654576166, + "grad_norm": 0.11575022339820862, + "learning_rate": 1.6611620025375037e-05, + "loss": 0.2577, + "step": 59690 + }, + { + "epoch": 2.653687158287772, + "grad_norm": 0.1258535087108612, + "learning_rate": 1.6596237965454154e-05, + "loss": 0.2579, + "step": 59700 + }, + { + "epoch": 2.6541316619993776, + "grad_norm": 0.13670282065868378, + "learning_rate": 1.6580861613322607e-05, + "loss": 0.2561, + "step": 59710 + }, + { + "epoch": 2.6545761657109836, + "grad_norm": 0.13812632858753204, + "learning_rate": 1.6565490971607796e-05, + "loss": 0.2588, + "step": 59720 + }, + { + "epoch": 2.6550206694225897, + "grad_norm": 0.15674243867397308, + "learning_rate": 1.655012604293614e-05, + "loss": 0.2601, + "step": 59730 + }, + { + "epoch": 2.655465173134196, + "grad_norm": 0.12379049509763718, + "learning_rate": 1.653476682993309e-05, + "loss": 0.2556, + "step": 59740 + }, + { + "epoch": 2.655909676845802, + "grad_norm": 0.13839396834373474, + "learning_rate": 1.651941333522311e-05, + "loss": 0.2572, + "step": 59750 + }, + { + "epoch": 2.6563541805574076, + "grad_norm": 0.12598100304603577, + "learning_rate": 1.6504065561429715e-05, + "loss": 0.258, + "step": 59760 + }, + { + "epoch": 2.6567986842690137, + "grad_norm": 0.17662228643894196, + "learning_rate": 1.6488723511175385e-05, + "loss": 0.2597, + "step": 59770 + }, + { + "epoch": 2.6572431879806198, + "grad_norm": 0.16506479680538177, + "learning_rate": 1.6473387187081668e-05, + "loss": 0.2571, + "step": 59780 + }, + { + "epoch": 2.6576876916922254, + "grad_norm": 0.11908158659934998, + "learning_rate": 1.6458056591769123e-05, + "loss": 0.2573, + "step": 59790 + }, + { + "epoch": 2.6581321954038315, + "grad_norm": 0.14317940175533295, + "learning_rate": 1.6442731727857335e-05, + "loss": 0.2558, + "step": 59800 + }, + { + "epoch": 2.6585766991154376, + "grad_norm": 0.1352396160364151, + "learning_rate": 1.642741259796492e-05, + "loss": 0.2569, + "step": 59810 + }, + { + "epoch": 2.6590212028270437, + "grad_norm": 0.12782928347587585, + "learning_rate": 1.641209920470944e-05, + "loss": 0.256, + "step": 59820 + }, + { + "epoch": 2.6594657065386498, + "grad_norm": 0.17314471304416656, + "learning_rate": 1.639679155070762e-05, + "loss": 0.2577, + "step": 59830 + }, + { + "epoch": 2.6599102102502554, + "grad_norm": 0.12714940309524536, + "learning_rate": 1.6381489638575048e-05, + "loss": 0.2571, + "step": 59840 + }, + { + "epoch": 2.6603547139618615, + "grad_norm": 0.14411866664886475, + "learning_rate": 1.636619347092643e-05, + "loss": 0.2591, + "step": 59850 + }, + { + "epoch": 2.6607992176734676, + "grad_norm": 0.14707772433757782, + "learning_rate": 1.6350903050375476e-05, + "loss": 0.2562, + "step": 59860 + }, + { + "epoch": 2.6612437213850737, + "grad_norm": 0.1435239315032959, + "learning_rate": 1.6335618379534856e-05, + "loss": 0.2576, + "step": 59870 + }, + { + "epoch": 2.66168822509668, + "grad_norm": 0.11530078202486038, + "learning_rate": 1.6320339461016364e-05, + "loss": 0.2554, + "step": 59880 + }, + { + "epoch": 2.6621327288082854, + "grad_norm": 0.1358613818883896, + "learning_rate": 1.6305066297430687e-05, + "loss": 0.2568, + "step": 59890 + }, + { + "epoch": 2.6625772325198915, + "grad_norm": 0.16357196867465973, + "learning_rate": 1.6289798891387654e-05, + "loss": 0.2583, + "step": 59900 + }, + { + "epoch": 2.6630217362314976, + "grad_norm": 0.14102022349834442, + "learning_rate": 1.6274537245495995e-05, + "loss": 0.2563, + "step": 59910 + }, + { + "epoch": 2.6634662399431033, + "grad_norm": 0.13097384572029114, + "learning_rate": 1.6259281362363527e-05, + "loss": 0.2541, + "step": 59920 + }, + { + "epoch": 2.6639107436547094, + "grad_norm": 0.11798316985368729, + "learning_rate": 1.6244031244597068e-05, + "loss": 0.2596, + "step": 59930 + }, + { + "epoch": 2.6643552473663155, + "grad_norm": 0.16280406713485718, + "learning_rate": 1.622878689480244e-05, + "loss": 0.2539, + "step": 59940 + }, + { + "epoch": 2.6647997510779216, + "grad_norm": 0.11825660616159439, + "learning_rate": 1.6213548315584498e-05, + "loss": 0.2591, + "step": 59950 + }, + { + "epoch": 2.6652442547895276, + "grad_norm": 0.143699049949646, + "learning_rate": 1.6198315509547074e-05, + "loss": 0.2572, + "step": 59960 + }, + { + "epoch": 2.6656887585011333, + "grad_norm": 0.11781725287437439, + "learning_rate": 1.6183088479293056e-05, + "loss": 0.2571, + "step": 59970 + }, + { + "epoch": 2.6661332622127394, + "grad_norm": 0.14177049696445465, + "learning_rate": 1.6167867227424316e-05, + "loss": 0.2568, + "step": 59980 + }, + { + "epoch": 2.6665777659243455, + "grad_norm": 0.1589653044939041, + "learning_rate": 1.615265175654176e-05, + "loss": 0.2582, + "step": 59990 + }, + { + "epoch": 2.6670222696359516, + "grad_norm": 0.13827750086784363, + "learning_rate": 1.613744206924529e-05, + "loss": 0.2565, + "step": 60000 + }, + { + "epoch": 2.6674667733475577, + "grad_norm": 0.1345442831516266, + "learning_rate": 1.6122238168133845e-05, + "loss": 0.255, + "step": 60010 + }, + { + "epoch": 2.6679112770591633, + "grad_norm": 0.12914320826530457, + "learning_rate": 1.6107040055805305e-05, + "loss": 0.2583, + "step": 60020 + }, + { + "epoch": 2.6683557807707694, + "grad_norm": 0.15331582725048065, + "learning_rate": 1.6091847734856673e-05, + "loss": 0.2598, + "step": 60030 + }, + { + "epoch": 2.6688002844823755, + "grad_norm": 0.1607656031847, + "learning_rate": 1.6076661207883865e-05, + "loss": 0.2569, + "step": 60040 + }, + { + "epoch": 2.669244788193981, + "grad_norm": 0.134032741189003, + "learning_rate": 1.6061480477481848e-05, + "loss": 0.2574, + "step": 60050 + }, + { + "epoch": 2.6696892919055877, + "grad_norm": 0.1411522924900055, + "learning_rate": 1.604630554624461e-05, + "loss": 0.2554, + "step": 60060 + }, + { + "epoch": 2.6701337956171933, + "grad_norm": 0.14411506056785583, + "learning_rate": 1.603113641676509e-05, + "loss": 0.2571, + "step": 60070 + }, + { + "epoch": 2.6705782993287994, + "grad_norm": 0.13434365391731262, + "learning_rate": 1.6015973091635338e-05, + "loss": 0.2583, + "step": 60080 + }, + { + "epoch": 2.6710228030404055, + "grad_norm": 0.12774096429347992, + "learning_rate": 1.6000815573446288e-05, + "loss": 0.2536, + "step": 60090 + }, + { + "epoch": 2.671467306752011, + "grad_norm": 0.1622992306947708, + "learning_rate": 1.5985663864788002e-05, + "loss": 0.2562, + "step": 60100 + }, + { + "epoch": 2.6719118104636173, + "grad_norm": 0.11792024970054626, + "learning_rate": 1.597051796824946e-05, + "loss": 0.2567, + "step": 60110 + }, + { + "epoch": 2.6723563141752233, + "grad_norm": 0.1642628312110901, + "learning_rate": 1.5955377886418682e-05, + "loss": 0.2539, + "step": 60120 + }, + { + "epoch": 2.6728008178868294, + "grad_norm": 0.13158616423606873, + "learning_rate": 1.5940243621882704e-05, + "loss": 0.2576, + "step": 60130 + }, + { + "epoch": 2.6732453215984355, + "grad_norm": 0.15622512996196747, + "learning_rate": 1.5925115177227555e-05, + "loss": 0.2614, + "step": 60140 + }, + { + "epoch": 2.673689825310041, + "grad_norm": 0.14716149866580963, + "learning_rate": 1.5909992555038288e-05, + "loss": 0.2577, + "step": 60150 + }, + { + "epoch": 2.6741343290216473, + "grad_norm": 0.15803799033164978, + "learning_rate": 1.5894875757898912e-05, + "loss": 0.2565, + "step": 60160 + }, + { + "epoch": 2.6745788327332534, + "grad_norm": 0.142480731010437, + "learning_rate": 1.58797647883925e-05, + "loss": 0.2559, + "step": 60170 + }, + { + "epoch": 2.6750233364448595, + "grad_norm": 0.1432759314775467, + "learning_rate": 1.586465964910109e-05, + "loss": 0.2573, + "step": 60180 + }, + { + "epoch": 2.6754678401564655, + "grad_norm": 0.15122513473033905, + "learning_rate": 1.5849560342605734e-05, + "loss": 0.2595, + "step": 60190 + }, + { + "epoch": 2.675912343868071, + "grad_norm": 0.13371874392032623, + "learning_rate": 1.583446687148651e-05, + "loss": 0.2579, + "step": 60200 + }, + { + "epoch": 2.6763568475796773, + "grad_norm": 0.11757712811231613, + "learning_rate": 1.581937923832248e-05, + "loss": 0.2562, + "step": 60210 + }, + { + "epoch": 2.6768013512912834, + "grad_norm": 0.13613088428974152, + "learning_rate": 1.5804297445691663e-05, + "loss": 0.2586, + "step": 60220 + }, + { + "epoch": 2.677245855002889, + "grad_norm": 0.11227578669786453, + "learning_rate": 1.578922149617119e-05, + "loss": 0.2573, + "step": 60230 + }, + { + "epoch": 2.677690358714495, + "grad_norm": 0.14532285928726196, + "learning_rate": 1.5774151392337084e-05, + "loss": 0.2578, + "step": 60240 + }, + { + "epoch": 2.678134862426101, + "grad_norm": 0.1567414253950119, + "learning_rate": 1.575908713676442e-05, + "loss": 0.259, + "step": 60250 + }, + { + "epoch": 2.6785793661377073, + "grad_norm": 0.1463613659143448, + "learning_rate": 1.5744028732027293e-05, + "loss": 0.2544, + "step": 60260 + }, + { + "epoch": 2.6790238698493134, + "grad_norm": 0.17018172144889832, + "learning_rate": 1.5728976180698723e-05, + "loss": 0.2581, + "step": 60270 + }, + { + "epoch": 2.679468373560919, + "grad_norm": 0.13328449428081512, + "learning_rate": 1.5713929485350836e-05, + "loss": 0.2574, + "step": 60280 + }, + { + "epoch": 2.679912877272525, + "grad_norm": 0.13122375309467316, + "learning_rate": 1.569888864855465e-05, + "loss": 0.256, + "step": 60290 + }, + { + "epoch": 2.6803573809841312, + "grad_norm": 0.1536010503768921, + "learning_rate": 1.568385367288028e-05, + "loss": 0.2592, + "step": 60300 + }, + { + "epoch": 2.6808018846957373, + "grad_norm": 0.1407640427350998, + "learning_rate": 1.5668824560896755e-05, + "loss": 0.2576, + "step": 60310 + }, + { + "epoch": 2.6812463884073434, + "grad_norm": 0.13612350821495056, + "learning_rate": 1.5653801315172156e-05, + "loss": 0.2569, + "step": 60320 + }, + { + "epoch": 2.681690892118949, + "grad_norm": 0.11549112945795059, + "learning_rate": 1.5638783938273554e-05, + "loss": 0.255, + "step": 60330 + }, + { + "epoch": 2.682135395830555, + "grad_norm": 0.1409757137298584, + "learning_rate": 1.5623772432766966e-05, + "loss": 0.2555, + "step": 60340 + }, + { + "epoch": 2.6825798995421613, + "grad_norm": 0.12310541421175003, + "learning_rate": 1.5608766801217507e-05, + "loss": 0.2568, + "step": 60350 + }, + { + "epoch": 2.683024403253767, + "grad_norm": 0.11596052348613739, + "learning_rate": 1.5593767046189184e-05, + "loss": 0.2559, + "step": 60360 + }, + { + "epoch": 2.6834689069653734, + "grad_norm": 0.13120505213737488, + "learning_rate": 1.5578773170245064e-05, + "loss": 0.2567, + "step": 60370 + }, + { + "epoch": 2.683913410676979, + "grad_norm": 0.11547081172466278, + "learning_rate": 1.5563785175947182e-05, + "loss": 0.2574, + "step": 60380 + }, + { + "epoch": 2.684357914388585, + "grad_norm": 0.11791194975376129, + "learning_rate": 1.5548803065856582e-05, + "loss": 0.2587, + "step": 60390 + }, + { + "epoch": 2.6848024181001913, + "grad_norm": 0.11845945566892624, + "learning_rate": 1.55338268425333e-05, + "loss": 0.2559, + "step": 60400 + }, + { + "epoch": 2.685246921811797, + "grad_norm": 0.1268535703420639, + "learning_rate": 1.5518856508536373e-05, + "loss": 0.2567, + "step": 60410 + }, + { + "epoch": 2.685691425523403, + "grad_norm": 0.1652509868144989, + "learning_rate": 1.5503892066423786e-05, + "loss": 0.2556, + "step": 60420 + }, + { + "epoch": 2.686135929235009, + "grad_norm": 0.13094298541545868, + "learning_rate": 1.548893351875258e-05, + "loss": 0.2555, + "step": 60430 + }, + { + "epoch": 2.686580432946615, + "grad_norm": 0.16100221872329712, + "learning_rate": 1.547398086807875e-05, + "loss": 0.2565, + "step": 60440 + }, + { + "epoch": 2.6870249366582213, + "grad_norm": 0.1443994790315628, + "learning_rate": 1.5459034116957305e-05, + "loss": 0.2566, + "step": 60450 + }, + { + "epoch": 2.687469440369827, + "grad_norm": 0.1535177230834961, + "learning_rate": 1.544409326794225e-05, + "loss": 0.2595, + "step": 60460 + }, + { + "epoch": 2.687913944081433, + "grad_norm": 0.13104356825351715, + "learning_rate": 1.542915832358651e-05, + "loss": 0.2583, + "step": 60470 + }, + { + "epoch": 2.688358447793039, + "grad_norm": 0.1414777934551239, + "learning_rate": 1.541422928644214e-05, + "loss": 0.254, + "step": 60480 + }, + { + "epoch": 2.688802951504645, + "grad_norm": 0.15183554589748383, + "learning_rate": 1.5399306159060024e-05, + "loss": 0.2581, + "step": 60490 + }, + { + "epoch": 2.6892474552162513, + "grad_norm": 0.15168073773384094, + "learning_rate": 1.5384388943990185e-05, + "loss": 0.2564, + "step": 60500 + }, + { + "epoch": 2.689691958927857, + "grad_norm": 0.12873488664627075, + "learning_rate": 1.5369477643781526e-05, + "loss": 0.2569, + "step": 60510 + }, + { + "epoch": 2.690136462639463, + "grad_norm": 0.13918262720108032, + "learning_rate": 1.5354572260981985e-05, + "loss": 0.2568, + "step": 60520 + }, + { + "epoch": 2.690580966351069, + "grad_norm": 0.14049924910068512, + "learning_rate": 1.533967279813851e-05, + "loss": 0.2573, + "step": 60530 + }, + { + "epoch": 2.691025470062675, + "grad_norm": 0.13386060297489166, + "learning_rate": 1.5324779257796956e-05, + "loss": 0.2592, + "step": 60540 + }, + { + "epoch": 2.691469973774281, + "grad_norm": 0.15999753773212433, + "learning_rate": 1.5309891642502293e-05, + "loss": 0.2572, + "step": 60550 + }, + { + "epoch": 2.691914477485887, + "grad_norm": 0.12733903527259827, + "learning_rate": 1.5295009954798357e-05, + "loss": 0.257, + "step": 60560 + }, + { + "epoch": 2.692358981197493, + "grad_norm": 0.1512010395526886, + "learning_rate": 1.5280134197228036e-05, + "loss": 0.2554, + "step": 60570 + }, + { + "epoch": 2.692803484909099, + "grad_norm": 0.13872788846492767, + "learning_rate": 1.526526437233319e-05, + "loss": 0.2561, + "step": 60580 + }, + { + "epoch": 2.693247988620705, + "grad_norm": 0.1410488337278366, + "learning_rate": 1.525040048265466e-05, + "loss": 0.2582, + "step": 60590 + }, + { + "epoch": 2.693692492332311, + "grad_norm": 0.14589864015579224, + "learning_rate": 1.5235542530732288e-05, + "loss": 0.2561, + "step": 60600 + }, + { + "epoch": 2.694136996043917, + "grad_norm": 0.15948902070522308, + "learning_rate": 1.5220690519104901e-05, + "loss": 0.2575, + "step": 60610 + }, + { + "epoch": 2.694581499755523, + "grad_norm": 0.1148267313838005, + "learning_rate": 1.5205844450310275e-05, + "loss": 0.2581, + "step": 60620 + }, + { + "epoch": 2.695026003467129, + "grad_norm": 0.1380411684513092, + "learning_rate": 1.519100432688521e-05, + "loss": 0.2557, + "step": 60630 + }, + { + "epoch": 2.695470507178735, + "grad_norm": 0.12766072154045105, + "learning_rate": 1.5176170151365476e-05, + "loss": 0.2576, + "step": 60640 + }, + { + "epoch": 2.695915010890341, + "grad_norm": 0.13162295520305634, + "learning_rate": 1.5161341926285833e-05, + "loss": 0.2564, + "step": 60650 + }, + { + "epoch": 2.696359514601947, + "grad_norm": 0.13482360541820526, + "learning_rate": 1.5146519654180025e-05, + "loss": 0.2587, + "step": 60660 + }, + { + "epoch": 2.6968040183135527, + "grad_norm": 0.13434645533561707, + "learning_rate": 1.5131703337580739e-05, + "loss": 0.2586, + "step": 60670 + }, + { + "epoch": 2.6972485220251587, + "grad_norm": 0.14518257975578308, + "learning_rate": 1.5116892979019731e-05, + "loss": 0.2593, + "step": 60680 + }, + { + "epoch": 2.697693025736765, + "grad_norm": 0.1390068531036377, + "learning_rate": 1.5102088581027623e-05, + "loss": 0.258, + "step": 60690 + }, + { + "epoch": 2.698137529448371, + "grad_norm": 0.13847105205059052, + "learning_rate": 1.5087290146134154e-05, + "loss": 0.2555, + "step": 60700 + }, + { + "epoch": 2.698582033159977, + "grad_norm": 0.12940570712089539, + "learning_rate": 1.5072497676867915e-05, + "loss": 0.2545, + "step": 60710 + }, + { + "epoch": 2.6990265368715827, + "grad_norm": 0.14726239442825317, + "learning_rate": 1.505771117575655e-05, + "loss": 0.2567, + "step": 60720 + }, + { + "epoch": 2.6994710405831888, + "grad_norm": 0.12047098577022552, + "learning_rate": 1.5042930645326691e-05, + "loss": 0.2566, + "step": 60730 + }, + { + "epoch": 2.699915544294795, + "grad_norm": 0.1315678358078003, + "learning_rate": 1.5028156088103879e-05, + "loss": 0.2554, + "step": 60740 + }, + { + "epoch": 2.700360048006401, + "grad_norm": 0.1495722383260727, + "learning_rate": 1.5013387506612735e-05, + "loss": 0.2584, + "step": 60750 + }, + { + "epoch": 2.700804551718007, + "grad_norm": 0.14639858901500702, + "learning_rate": 1.4998624903376767e-05, + "loss": 0.2588, + "step": 60760 + }, + { + "epoch": 2.7012490554296127, + "grad_norm": 0.14336629211902618, + "learning_rate": 1.4983868280918517e-05, + "loss": 0.2559, + "step": 60770 + }, + { + "epoch": 2.701693559141219, + "grad_norm": 0.11113694310188293, + "learning_rate": 1.4969117641759478e-05, + "loss": 0.2553, + "step": 60780 + }, + { + "epoch": 2.702138062852825, + "grad_norm": 0.12891454994678497, + "learning_rate": 1.495437298842014e-05, + "loss": 0.2558, + "step": 60790 + }, + { + "epoch": 2.702582566564431, + "grad_norm": 0.140007883310318, + "learning_rate": 1.4939634323419976e-05, + "loss": 0.2575, + "step": 60800 + }, + { + "epoch": 2.703027070276037, + "grad_norm": 0.14943484961986542, + "learning_rate": 1.4924901649277384e-05, + "loss": 0.2565, + "step": 60810 + }, + { + "epoch": 2.7034715739876427, + "grad_norm": 0.132712721824646, + "learning_rate": 1.4910174968509793e-05, + "loss": 0.2561, + "step": 60820 + }, + { + "epoch": 2.703916077699249, + "grad_norm": 0.11228329688310623, + "learning_rate": 1.4895454283633598e-05, + "loss": 0.2573, + "step": 60830 + }, + { + "epoch": 2.704360581410855, + "grad_norm": 0.13063539564609528, + "learning_rate": 1.488073959716415e-05, + "loss": 0.2556, + "step": 60840 + }, + { + "epoch": 2.7048050851224605, + "grad_norm": 0.1292748749256134, + "learning_rate": 1.4866030911615791e-05, + "loss": 0.2565, + "step": 60850 + }, + { + "epoch": 2.7052495888340666, + "grad_norm": 0.1423276662826538, + "learning_rate": 1.4851328229501849e-05, + "loss": 0.2552, + "step": 60860 + }, + { + "epoch": 2.7056940925456727, + "grad_norm": 0.1460987627506256, + "learning_rate": 1.4836631553334562e-05, + "loss": 0.257, + "step": 60870 + }, + { + "epoch": 2.706138596257279, + "grad_norm": 0.30430907011032104, + "learning_rate": 1.4821940885625251e-05, + "loss": 0.256, + "step": 60880 + }, + { + "epoch": 2.706583099968885, + "grad_norm": 0.1301928609609604, + "learning_rate": 1.4807256228884109e-05, + "loss": 0.2544, + "step": 60890 + }, + { + "epoch": 2.7070276036804906, + "grad_norm": 0.14728084206581116, + "learning_rate": 1.4792577585620353e-05, + "loss": 0.2561, + "step": 60900 + }, + { + "epoch": 2.7074721073920966, + "grad_norm": 0.13702338933944702, + "learning_rate": 1.4777904958342164e-05, + "loss": 0.2583, + "step": 60910 + }, + { + "epoch": 2.7079166111037027, + "grad_norm": 0.15431924164295197, + "learning_rate": 1.4763238349556691e-05, + "loss": 0.2561, + "step": 60920 + }, + { + "epoch": 2.708361114815309, + "grad_norm": 0.13443012535572052, + "learning_rate": 1.4748577761770072e-05, + "loss": 0.2565, + "step": 60930 + }, + { + "epoch": 2.708805618526915, + "grad_norm": 0.15381649136543274, + "learning_rate": 1.4733923197487354e-05, + "loss": 0.2593, + "step": 60940 + }, + { + "epoch": 2.7092501222385206, + "grad_norm": 0.11542849242687225, + "learning_rate": 1.4719274659212662e-05, + "loss": 0.2561, + "step": 60950 + }, + { + "epoch": 2.7096946259501267, + "grad_norm": 0.13132436573505402, + "learning_rate": 1.4704632149448983e-05, + "loss": 0.2561, + "step": 60960 + }, + { + "epoch": 2.7101391296617328, + "grad_norm": 0.12982286512851715, + "learning_rate": 1.4689995670698343e-05, + "loss": 0.2571, + "step": 60970 + }, + { + "epoch": 2.7105836333733384, + "grad_norm": 0.12465247511863708, + "learning_rate": 1.4675365225461728e-05, + "loss": 0.2539, + "step": 60980 + }, + { + "epoch": 2.7110281370849445, + "grad_norm": 0.12726780772209167, + "learning_rate": 1.4660740816239032e-05, + "loss": 0.2596, + "step": 60990 + }, + { + "epoch": 2.7114726407965506, + "grad_norm": 0.11398378014564514, + "learning_rate": 1.464612244552923e-05, + "loss": 0.2581, + "step": 61000 + }, + { + "epoch": 2.7119171445081567, + "grad_norm": 0.11912883073091507, + "learning_rate": 1.4631510115830161e-05, + "loss": 0.2549, + "step": 61010 + }, + { + "epoch": 2.7123616482197628, + "grad_norm": 0.11386843025684357, + "learning_rate": 1.4616903829638679e-05, + "loss": 0.2556, + "step": 61020 + }, + { + "epoch": 2.7128061519313684, + "grad_norm": 0.12959086894989014, + "learning_rate": 1.46023035894506e-05, + "loss": 0.2605, + "step": 61030 + }, + { + "epoch": 2.7132506556429745, + "grad_norm": 0.12432833015918732, + "learning_rate": 1.4587709397760713e-05, + "loss": 0.2538, + "step": 61040 + }, + { + "epoch": 2.7136951593545806, + "grad_norm": 0.12189079076051712, + "learning_rate": 1.4573121257062755e-05, + "loss": 0.2578, + "step": 61050 + }, + { + "epoch": 2.7141396630661867, + "grad_norm": 0.12846939265727997, + "learning_rate": 1.4558539169849472e-05, + "loss": 0.2554, + "step": 61060 + }, + { + "epoch": 2.714584166777793, + "grad_norm": 0.12360472232103348, + "learning_rate": 1.4543963138612499e-05, + "loss": 0.2569, + "step": 61070 + }, + { + "epoch": 2.7150286704893984, + "grad_norm": 0.17032790184020996, + "learning_rate": 1.4529393165842498e-05, + "loss": 0.2595, + "step": 61080 + }, + { + "epoch": 2.7154731742010045, + "grad_norm": 0.14095644652843475, + "learning_rate": 1.4514829254029084e-05, + "loss": 0.2567, + "step": 61090 + }, + { + "epoch": 2.7159176779126106, + "grad_norm": 0.15203455090522766, + "learning_rate": 1.450027140566083e-05, + "loss": 0.2565, + "step": 61100 + }, + { + "epoch": 2.7163621816242167, + "grad_norm": 0.15074752271175385, + "learning_rate": 1.4485719623225268e-05, + "loss": 0.258, + "step": 61110 + }, + { + "epoch": 2.716806685335823, + "grad_norm": 0.13842271268367767, + "learning_rate": 1.4471173909208912e-05, + "loss": 0.2575, + "step": 61120 + }, + { + "epoch": 2.7172511890474285, + "grad_norm": 0.17627084255218506, + "learning_rate": 1.4456634266097236e-05, + "loss": 0.2564, + "step": 61130 + }, + { + "epoch": 2.7176956927590346, + "grad_norm": 0.1322067528963089, + "learning_rate": 1.444210069637461e-05, + "loss": 0.2564, + "step": 61140 + }, + { + "epoch": 2.7181401964706406, + "grad_norm": 0.14267323911190033, + "learning_rate": 1.4427573202524502e-05, + "loss": 0.2589, + "step": 61150 + }, + { + "epoch": 2.7185847001822463, + "grad_norm": 0.12082046270370483, + "learning_rate": 1.4413051787029208e-05, + "loss": 0.2549, + "step": 61160 + }, + { + "epoch": 2.7190292038938524, + "grad_norm": 0.13186350464820862, + "learning_rate": 1.439853645237006e-05, + "loss": 0.2537, + "step": 61170 + }, + { + "epoch": 2.7194737076054585, + "grad_norm": 0.1620790958404541, + "learning_rate": 1.4384027201027344e-05, + "loss": 0.2557, + "step": 61180 + }, + { + "epoch": 2.7199182113170646, + "grad_norm": 0.13428102433681488, + "learning_rate": 1.4369524035480253e-05, + "loss": 0.2548, + "step": 61190 + }, + { + "epoch": 2.7203627150286707, + "grad_norm": 0.13752315938472748, + "learning_rate": 1.4355026958207035e-05, + "loss": 0.2589, + "step": 61200 + }, + { + "epoch": 2.7208072187402763, + "grad_norm": 0.13374444842338562, + "learning_rate": 1.434053597168481e-05, + "loss": 0.2574, + "step": 61210 + }, + { + "epoch": 2.7212517224518824, + "grad_norm": 0.17666833102703094, + "learning_rate": 1.432605107838969e-05, + "loss": 0.2574, + "step": 61220 + }, + { + "epoch": 2.7216962261634885, + "grad_norm": 0.14544342458248138, + "learning_rate": 1.4311572280796759e-05, + "loss": 0.2562, + "step": 61230 + }, + { + "epoch": 2.7221407298750946, + "grad_norm": 0.14844657480716705, + "learning_rate": 1.4297099581380047e-05, + "loss": 0.2577, + "step": 61240 + }, + { + "epoch": 2.7225852335867007, + "grad_norm": 0.1370159387588501, + "learning_rate": 1.4282632982612538e-05, + "loss": 0.2575, + "step": 61250 + }, + { + "epoch": 2.7230297372983063, + "grad_norm": 0.14193150401115417, + "learning_rate": 1.4268172486966197e-05, + "loss": 0.2574, + "step": 61260 + }, + { + "epoch": 2.7234742410099124, + "grad_norm": 0.13072490692138672, + "learning_rate": 1.4253718096911894e-05, + "loss": 0.257, + "step": 61270 + }, + { + "epoch": 2.7239187447215185, + "grad_norm": 0.13182029128074646, + "learning_rate": 1.4239269814919504e-05, + "loss": 0.2575, + "step": 61280 + }, + { + "epoch": 2.724363248433124, + "grad_norm": 0.14273619651794434, + "learning_rate": 1.4224827643457843e-05, + "loss": 0.2557, + "step": 61290 + }, + { + "epoch": 2.7248077521447303, + "grad_norm": 0.13776496052742004, + "learning_rate": 1.4210391584994686e-05, + "loss": 0.2547, + "step": 61300 + }, + { + "epoch": 2.7252522558563363, + "grad_norm": 0.11848807334899902, + "learning_rate": 1.4195961641996763e-05, + "loss": 0.2565, + "step": 61310 + }, + { + "epoch": 2.7256967595679424, + "grad_norm": 0.1492552012205124, + "learning_rate": 1.4181537816929751e-05, + "loss": 0.2571, + "step": 61320 + }, + { + "epoch": 2.7261412632795485, + "grad_norm": 0.13938309252262115, + "learning_rate": 1.4167120112258303e-05, + "loss": 0.2559, + "step": 61330 + }, + { + "epoch": 2.726585766991154, + "grad_norm": 0.12976138293743134, + "learning_rate": 1.4152708530445963e-05, + "loss": 0.2558, + "step": 61340 + }, + { + "epoch": 2.7270302707027603, + "grad_norm": 0.13221792876720428, + "learning_rate": 1.4138303073955344e-05, + "loss": 0.2562, + "step": 61350 + }, + { + "epoch": 2.7274747744143664, + "grad_norm": 0.12427391856908798, + "learning_rate": 1.4123903745247897e-05, + "loss": 0.2594, + "step": 61360 + }, + { + "epoch": 2.7279192781259725, + "grad_norm": 0.12594294548034668, + "learning_rate": 1.4109510546784082e-05, + "loss": 0.2547, + "step": 61370 + }, + { + "epoch": 2.7283637818375785, + "grad_norm": 0.1251390129327774, + "learning_rate": 1.4095123481023325e-05, + "loss": 0.2549, + "step": 61380 + }, + { + "epoch": 2.728808285549184, + "grad_norm": 0.14114491641521454, + "learning_rate": 1.4080742550423936e-05, + "loss": 0.2549, + "step": 61390 + }, + { + "epoch": 2.7292527892607903, + "grad_norm": 0.15681278705596924, + "learning_rate": 1.4066367757443278e-05, + "loss": 0.2559, + "step": 61400 + }, + { + "epoch": 2.7296972929723964, + "grad_norm": 0.1171446442604065, + "learning_rate": 1.4051999104537572e-05, + "loss": 0.26, + "step": 61410 + }, + { + "epoch": 2.7301417966840025, + "grad_norm": 0.1616641730070114, + "learning_rate": 1.4037636594162034e-05, + "loss": 0.2573, + "step": 61420 + }, + { + "epoch": 2.7305863003956086, + "grad_norm": 0.13337203860282898, + "learning_rate": 1.402328022877083e-05, + "loss": 0.2577, + "step": 61430 + }, + { + "epoch": 2.731030804107214, + "grad_norm": 0.12976211309432983, + "learning_rate": 1.4008930010817073e-05, + "loss": 0.2571, + "step": 61440 + }, + { + "epoch": 2.7314753078188203, + "grad_norm": 0.12876823544502258, + "learning_rate": 1.3994585942752831e-05, + "loss": 0.2569, + "step": 61450 + }, + { + "epoch": 2.7319198115304264, + "grad_norm": 0.1573069989681244, + "learning_rate": 1.398024802702909e-05, + "loss": 0.2578, + "step": 61460 + }, + { + "epoch": 2.732364315242032, + "grad_norm": 0.12917354702949524, + "learning_rate": 1.3965916266095819e-05, + "loss": 0.2565, + "step": 61470 + }, + { + "epoch": 2.732808818953638, + "grad_norm": 0.14699703454971313, + "learning_rate": 1.395159066240192e-05, + "loss": 0.2579, + "step": 61480 + }, + { + "epoch": 2.7332533226652442, + "grad_norm": 0.14317385852336884, + "learning_rate": 1.3937271218395248e-05, + "loss": 0.2591, + "step": 61490 + }, + { + "epoch": 2.7336978263768503, + "grad_norm": 0.12531894445419312, + "learning_rate": 1.3922957936522613e-05, + "loss": 0.2571, + "step": 61500 + }, + { + "epoch": 2.7341423300884564, + "grad_norm": 0.13226640224456787, + "learning_rate": 1.3908650819229758e-05, + "loss": 0.2561, + "step": 61510 + }, + { + "epoch": 2.734586833800062, + "grad_norm": 0.13573670387268066, + "learning_rate": 1.389434986896137e-05, + "loss": 0.2577, + "step": 61520 + }, + { + "epoch": 2.735031337511668, + "grad_norm": 0.13897772133350372, + "learning_rate": 1.388005508816112e-05, + "loss": 0.2556, + "step": 61530 + }, + { + "epoch": 2.7354758412232743, + "grad_norm": 0.10773730278015137, + "learning_rate": 1.3865766479271557e-05, + "loss": 0.2558, + "step": 61540 + }, + { + "epoch": 2.7359203449348803, + "grad_norm": 0.12223762273788452, + "learning_rate": 1.3851484044734225e-05, + "loss": 0.2575, + "step": 61550 + }, + { + "epoch": 2.7363648486464864, + "grad_norm": 0.13902375102043152, + "learning_rate": 1.3837207786989608e-05, + "loss": 0.2573, + "step": 61560 + }, + { + "epoch": 2.736809352358092, + "grad_norm": 0.14416465163230896, + "learning_rate": 1.382293770847713e-05, + "loss": 0.2562, + "step": 61570 + }, + { + "epoch": 2.737253856069698, + "grad_norm": 0.14912237226963043, + "learning_rate": 1.3808673811635159e-05, + "loss": 0.2568, + "step": 61580 + }, + { + "epoch": 2.7376983597813043, + "grad_norm": 0.12836286425590515, + "learning_rate": 1.3794416098900975e-05, + "loss": 0.2584, + "step": 61590 + }, + { + "epoch": 2.73814286349291, + "grad_norm": 0.15470583736896515, + "learning_rate": 1.3780164572710886e-05, + "loss": 0.2563, + "step": 61600 + }, + { + "epoch": 2.738587367204516, + "grad_norm": 0.13994742929935455, + "learning_rate": 1.3765919235500035e-05, + "loss": 0.2559, + "step": 61610 + }, + { + "epoch": 2.739031870916122, + "grad_norm": 0.1180020123720169, + "learning_rate": 1.375168008970259e-05, + "loss": 0.2548, + "step": 61620 + }, + { + "epoch": 2.739476374627728, + "grad_norm": 0.13465067744255066, + "learning_rate": 1.3737447137751635e-05, + "loss": 0.2561, + "step": 61630 + }, + { + "epoch": 2.7399208783393343, + "grad_norm": 0.15253761410713196, + "learning_rate": 1.372322038207915e-05, + "loss": 0.2567, + "step": 61640 + }, + { + "epoch": 2.74036538205094, + "grad_norm": 0.12315097451210022, + "learning_rate": 1.3708999825116159e-05, + "loss": 0.2564, + "step": 61650 + }, + { + "epoch": 2.740809885762546, + "grad_norm": 0.12754762172698975, + "learning_rate": 1.3694785469292526e-05, + "loss": 0.2557, + "step": 61660 + }, + { + "epoch": 2.741254389474152, + "grad_norm": 0.12889930605888367, + "learning_rate": 1.3680577317037101e-05, + "loss": 0.2558, + "step": 61670 + }, + { + "epoch": 2.741698893185758, + "grad_norm": 0.1474778950214386, + "learning_rate": 1.3666375370777678e-05, + "loss": 0.2552, + "step": 61680 + }, + { + "epoch": 2.7421433968973643, + "grad_norm": 0.12788896262645721, + "learning_rate": 1.3652179632940981e-05, + "loss": 0.2528, + "step": 61690 + }, + { + "epoch": 2.74258790060897, + "grad_norm": 0.12684978544712067, + "learning_rate": 1.3637990105952664e-05, + "loss": 0.2554, + "step": 61700 + }, + { + "epoch": 2.743032404320576, + "grad_norm": 0.1946885585784912, + "learning_rate": 1.3623806792237337e-05, + "loss": 0.2577, + "step": 61710 + }, + { + "epoch": 2.743476908032182, + "grad_norm": 0.1275569498538971, + "learning_rate": 1.3609629694218551e-05, + "loss": 0.2551, + "step": 61720 + }, + { + "epoch": 2.743921411743788, + "grad_norm": 0.1369779109954834, + "learning_rate": 1.3595458814318762e-05, + "loss": 0.2545, + "step": 61730 + }, + { + "epoch": 2.7443659154553943, + "grad_norm": 0.13468922674655914, + "learning_rate": 1.3581294154959389e-05, + "loss": 0.254, + "step": 61740 + }, + { + "epoch": 2.744810419167, + "grad_norm": 0.12934081256389618, + "learning_rate": 1.3567135718560792e-05, + "loss": 0.2579, + "step": 61750 + }, + { + "epoch": 2.745254922878606, + "grad_norm": 0.12646351754665375, + "learning_rate": 1.3552983507542261e-05, + "loss": 0.257, + "step": 61760 + }, + { + "epoch": 2.745699426590212, + "grad_norm": 0.16915777325630188, + "learning_rate": 1.3538837524322023e-05, + "loss": 0.2589, + "step": 61770 + }, + { + "epoch": 2.746143930301818, + "grad_norm": 0.14536193013191223, + "learning_rate": 1.3524697771317251e-05, + "loss": 0.2567, + "step": 61780 + }, + { + "epoch": 2.746588434013424, + "grad_norm": 0.1520392745733261, + "learning_rate": 1.3510564250943997e-05, + "loss": 0.2589, + "step": 61790 + }, + { + "epoch": 2.74703293772503, + "grad_norm": 0.13604718446731567, + "learning_rate": 1.3496436965617353e-05, + "loss": 0.2569, + "step": 61800 + }, + { + "epoch": 2.747477441436636, + "grad_norm": 0.12872545421123505, + "learning_rate": 1.3482315917751243e-05, + "loss": 0.2581, + "step": 61810 + }, + { + "epoch": 2.747921945148242, + "grad_norm": 0.13777968287467957, + "learning_rate": 1.3468201109758582e-05, + "loss": 0.2553, + "step": 61820 + }, + { + "epoch": 2.748366448859848, + "grad_norm": 0.13788266479969025, + "learning_rate": 1.3454092544051222e-05, + "loss": 0.2582, + "step": 61830 + }, + { + "epoch": 2.748810952571454, + "grad_norm": 0.14220863580703735, + "learning_rate": 1.3439990223039878e-05, + "loss": 0.2566, + "step": 61840 + }, + { + "epoch": 2.74925545628306, + "grad_norm": 0.13086137175559998, + "learning_rate": 1.3425894149134321e-05, + "loss": 0.2581, + "step": 61850 + }, + { + "epoch": 2.749699959994666, + "grad_norm": 0.12222402542829514, + "learning_rate": 1.3411804324743132e-05, + "loss": 0.2559, + "step": 61860 + }, + { + "epoch": 2.750144463706272, + "grad_norm": 0.14468097686767578, + "learning_rate": 1.3397720752273896e-05, + "loss": 0.2567, + "step": 61870 + }, + { + "epoch": 2.750588967417878, + "grad_norm": 0.1346142739057541, + "learning_rate": 1.3383643434133108e-05, + "loss": 0.2568, + "step": 61880 + }, + { + "epoch": 2.751033471129484, + "grad_norm": 0.14574258029460907, + "learning_rate": 1.3369572372726196e-05, + "loss": 0.2577, + "step": 61890 + }, + { + "epoch": 2.75147797484109, + "grad_norm": 0.15336553752422333, + "learning_rate": 1.3355507570457525e-05, + "loss": 0.2565, + "step": 61900 + }, + { + "epoch": 2.7519224785526957, + "grad_norm": 0.12249205261468887, + "learning_rate": 1.334144902973038e-05, + "loss": 0.2573, + "step": 61910 + }, + { + "epoch": 2.7523669822643018, + "grad_norm": 0.1579149067401886, + "learning_rate": 1.3327396752946997e-05, + "loss": 0.2568, + "step": 61920 + }, + { + "epoch": 2.752811485975908, + "grad_norm": 0.12972140312194824, + "learning_rate": 1.3313350742508496e-05, + "loss": 0.2559, + "step": 61930 + }, + { + "epoch": 2.753255989687514, + "grad_norm": 0.13285920023918152, + "learning_rate": 1.3299311000814973e-05, + "loss": 0.2563, + "step": 61940 + }, + { + "epoch": 2.75370049339912, + "grad_norm": 0.15356668829917908, + "learning_rate": 1.3285277530265432e-05, + "loss": 0.2555, + "step": 61950 + }, + { + "epoch": 2.7541449971107257, + "grad_norm": 0.13739937543869019, + "learning_rate": 1.3271250333257813e-05, + "loss": 0.2565, + "step": 61960 + }, + { + "epoch": 2.754589500822332, + "grad_norm": 0.15085454285144806, + "learning_rate": 1.3257229412188977e-05, + "loss": 0.256, + "step": 61970 + }, + { + "epoch": 2.755034004533938, + "grad_norm": 0.12314674258232117, + "learning_rate": 1.3243214769454726e-05, + "loss": 0.2574, + "step": 61980 + }, + { + "epoch": 2.755478508245544, + "grad_norm": 0.11472348868846893, + "learning_rate": 1.3229206407449751e-05, + "loss": 0.259, + "step": 61990 + }, + { + "epoch": 2.75592301195715, + "grad_norm": 0.12648098170757294, + "learning_rate": 1.3215204328567742e-05, + "loss": 0.2556, + "step": 62000 + }, + { + "epoch": 2.7563675156687557, + "grad_norm": 0.14055626094341278, + "learning_rate": 1.3201208535201232e-05, + "loss": 0.258, + "step": 62010 + }, + { + "epoch": 2.756812019380362, + "grad_norm": 0.1381070762872696, + "learning_rate": 1.3187219029741737e-05, + "loss": 0.2564, + "step": 62020 + }, + { + "epoch": 2.757256523091968, + "grad_norm": 0.11616089940071106, + "learning_rate": 1.3173235814579693e-05, + "loss": 0.2585, + "step": 62030 + }, + { + "epoch": 2.7577010268035735, + "grad_norm": 0.15122009813785553, + "learning_rate": 1.3159258892104398e-05, + "loss": 0.2529, + "step": 62040 + }, + { + "epoch": 2.75814553051518, + "grad_norm": 0.12874428927898407, + "learning_rate": 1.3145288264704198e-05, + "loss": 0.2551, + "step": 62050 + }, + { + "epoch": 2.7585900342267857, + "grad_norm": 0.13859272003173828, + "learning_rate": 1.3131323934766237e-05, + "loss": 0.258, + "step": 62060 + }, + { + "epoch": 2.759034537938392, + "grad_norm": 0.14981231093406677, + "learning_rate": 1.311736590467666e-05, + "loss": 0.2574, + "step": 62070 + }, + { + "epoch": 2.759479041649998, + "grad_norm": 0.13243532180786133, + "learning_rate": 1.3103414176820505e-05, + "loss": 0.2573, + "step": 62080 + }, + { + "epoch": 2.7599235453616036, + "grad_norm": 0.13439491391181946, + "learning_rate": 1.308946875358174e-05, + "loss": 0.2567, + "step": 62090 + }, + { + "epoch": 2.7603680490732097, + "grad_norm": 0.12156356126070023, + "learning_rate": 1.3075529637343276e-05, + "loss": 0.2567, + "step": 62100 + }, + { + "epoch": 2.7608125527848157, + "grad_norm": 0.1483202427625656, + "learning_rate": 1.3061596830486883e-05, + "loss": 0.259, + "step": 62110 + }, + { + "epoch": 2.761257056496422, + "grad_norm": 0.15883325040340424, + "learning_rate": 1.3047670335393353e-05, + "loss": 0.2558, + "step": 62120 + }, + { + "epoch": 2.761701560208028, + "grad_norm": 0.1310390830039978, + "learning_rate": 1.3033750154442298e-05, + "loss": 0.2558, + "step": 62130 + }, + { + "epoch": 2.7621460639196336, + "grad_norm": 0.13687457144260406, + "learning_rate": 1.3019836290012316e-05, + "loss": 0.2541, + "step": 62140 + }, + { + "epoch": 2.7625905676312397, + "grad_norm": 0.13186374306678772, + "learning_rate": 1.30059287444809e-05, + "loss": 0.2568, + "step": 62150 + }, + { + "epoch": 2.7630350713428458, + "grad_norm": 0.13022135198116302, + "learning_rate": 1.299202752022447e-05, + "loss": 0.2559, + "step": 62160 + }, + { + "epoch": 2.763479575054452, + "grad_norm": 0.11515762656927109, + "learning_rate": 1.2978132619618371e-05, + "loss": 0.2576, + "step": 62170 + }, + { + "epoch": 2.763924078766058, + "grad_norm": 0.11714336276054382, + "learning_rate": 1.2964244045036866e-05, + "loss": 0.2565, + "step": 62180 + }, + { + "epoch": 2.7643685824776636, + "grad_norm": 0.12482546269893646, + "learning_rate": 1.295036179885311e-05, + "loss": 0.2547, + "step": 62190 + }, + { + "epoch": 2.7648130861892697, + "grad_norm": 0.14312368631362915, + "learning_rate": 1.293648588343922e-05, + "loss": 0.2552, + "step": 62200 + }, + { + "epoch": 2.765257589900876, + "grad_norm": 0.12005997449159622, + "learning_rate": 1.2922616301166196e-05, + "loss": 0.2553, + "step": 62210 + }, + { + "epoch": 2.7657020936124814, + "grad_norm": 0.14569474756717682, + "learning_rate": 1.2908753054403976e-05, + "loss": 0.259, + "step": 62220 + }, + { + "epoch": 2.7661465973240875, + "grad_norm": 0.12386803328990936, + "learning_rate": 1.2894896145521429e-05, + "loss": 0.2566, + "step": 62230 + }, + { + "epoch": 2.7665911010356936, + "grad_norm": 0.15388554334640503, + "learning_rate": 1.2881045576886275e-05, + "loss": 0.2589, + "step": 62240 + }, + { + "epoch": 2.7670356047472997, + "grad_norm": 0.10301946848630905, + "learning_rate": 1.2867201350865254e-05, + "loss": 0.2554, + "step": 62250 + }, + { + "epoch": 2.767480108458906, + "grad_norm": 0.13266949355602264, + "learning_rate": 1.2853363469823914e-05, + "loss": 0.2561, + "step": 62260 + }, + { + "epoch": 2.7679246121705114, + "grad_norm": 0.1359679251909256, + "learning_rate": 1.283953193612682e-05, + "loss": 0.2566, + "step": 62270 + }, + { + "epoch": 2.7683691158821175, + "grad_norm": 0.12745900452136993, + "learning_rate": 1.2825706752137372e-05, + "loss": 0.2566, + "step": 62280 + }, + { + "epoch": 2.7688136195937236, + "grad_norm": 0.14045438170433044, + "learning_rate": 1.2811887920217896e-05, + "loss": 0.2579, + "step": 62290 + }, + { + "epoch": 2.7692581233053297, + "grad_norm": 0.1200268343091011, + "learning_rate": 1.2798075442729707e-05, + "loss": 0.2558, + "step": 62300 + }, + { + "epoch": 2.769702627016936, + "grad_norm": 0.14245536923408508, + "learning_rate": 1.2784269322032922e-05, + "loss": 0.2587, + "step": 62310 + }, + { + "epoch": 2.7701471307285415, + "grad_norm": 0.12501445412635803, + "learning_rate": 1.2770469560486686e-05, + "loss": 0.2565, + "step": 62320 + }, + { + "epoch": 2.7705916344401476, + "grad_norm": 0.1362815499305725, + "learning_rate": 1.2756676160448956e-05, + "loss": 0.2566, + "step": 62330 + }, + { + "epoch": 2.7710361381517536, + "grad_norm": 0.14697392284870148, + "learning_rate": 1.2742889124276663e-05, + "loss": 0.2576, + "step": 62340 + }, + { + "epoch": 2.7714806418633593, + "grad_norm": 0.13903047144412994, + "learning_rate": 1.2729108454325639e-05, + "loss": 0.2569, + "step": 62350 + }, + { + "epoch": 2.7719251455749654, + "grad_norm": 0.14109881222248077, + "learning_rate": 1.2715334152950614e-05, + "loss": 0.2551, + "step": 62360 + }, + { + "epoch": 2.7723696492865715, + "grad_norm": 0.13788242638111115, + "learning_rate": 1.2701566222505246e-05, + "loss": 0.2538, + "step": 62370 + }, + { + "epoch": 2.7728141529981776, + "grad_norm": 0.13099674880504608, + "learning_rate": 1.2687804665342107e-05, + "loss": 0.2611, + "step": 62380 + }, + { + "epoch": 2.7732586567097837, + "grad_norm": 0.12661388516426086, + "learning_rate": 1.267404948381265e-05, + "loss": 0.2551, + "step": 62390 + }, + { + "epoch": 2.7737031604213893, + "grad_norm": 0.13834023475646973, + "learning_rate": 1.2660300680267267e-05, + "loss": 0.256, + "step": 62400 + }, + { + "epoch": 2.7741476641329954, + "grad_norm": 0.13428384065628052, + "learning_rate": 1.2646558257055257e-05, + "loss": 0.2588, + "step": 62410 + }, + { + "epoch": 2.7745921678446015, + "grad_norm": 0.12811890244483948, + "learning_rate": 1.263282221652482e-05, + "loss": 0.2548, + "step": 62420 + }, + { + "epoch": 2.7750366715562076, + "grad_norm": 0.14321452379226685, + "learning_rate": 1.2619092561023088e-05, + "loss": 0.2556, + "step": 62430 + }, + { + "epoch": 2.7754811752678137, + "grad_norm": 0.14183062314987183, + "learning_rate": 1.2605369292896036e-05, + "loss": 0.2561, + "step": 62440 + }, + { + "epoch": 2.7759256789794193, + "grad_norm": 0.13705894351005554, + "learning_rate": 1.2591652414488658e-05, + "loss": 0.2549, + "step": 62450 + }, + { + "epoch": 2.7763701826910254, + "grad_norm": 0.123197540640831, + "learning_rate": 1.2577941928144732e-05, + "loss": 0.2547, + "step": 62460 + }, + { + "epoch": 2.7768146864026315, + "grad_norm": 0.13952556252479553, + "learning_rate": 1.256423783620706e-05, + "loss": 0.2547, + "step": 62470 + }, + { + "epoch": 2.7772591901142376, + "grad_norm": 0.1337570995092392, + "learning_rate": 1.2550540141017264e-05, + "loss": 0.2588, + "step": 62480 + }, + { + "epoch": 2.7777036938258437, + "grad_norm": 0.13635753095149994, + "learning_rate": 1.253684884491591e-05, + "loss": 0.2544, + "step": 62490 + }, + { + "epoch": 2.7781481975374493, + "grad_norm": 0.12022969126701355, + "learning_rate": 1.2523163950242483e-05, + "loss": 0.2564, + "step": 62500 + }, + { + "epoch": 2.7785927012490554, + "grad_norm": 0.1276351660490036, + "learning_rate": 1.2509485459335313e-05, + "loss": 0.2561, + "step": 62510 + }, + { + "epoch": 2.7790372049606615, + "grad_norm": 0.13473264873027802, + "learning_rate": 1.2495813374531739e-05, + "loss": 0.2579, + "step": 62520 + }, + { + "epoch": 2.779481708672267, + "grad_norm": 0.1398290991783142, + "learning_rate": 1.2482147698167907e-05, + "loss": 0.258, + "step": 62530 + }, + { + "epoch": 2.7799262123838733, + "grad_norm": 0.14480793476104736, + "learning_rate": 1.2468488432578911e-05, + "loss": 0.2563, + "step": 62540 + }, + { + "epoch": 2.7803707160954794, + "grad_norm": 0.1361950933933258, + "learning_rate": 1.2454835580098761e-05, + "loss": 0.2549, + "step": 62550 + }, + { + "epoch": 2.7808152198070855, + "grad_norm": 0.14168015122413635, + "learning_rate": 1.2441189143060338e-05, + "loss": 0.258, + "step": 62560 + }, + { + "epoch": 2.7812597235186916, + "grad_norm": 0.16250821948051453, + "learning_rate": 1.242754912379548e-05, + "loss": 0.2581, + "step": 62570 + }, + { + "epoch": 2.781704227230297, + "grad_norm": 0.1313403844833374, + "learning_rate": 1.2413915524634844e-05, + "loss": 0.2572, + "step": 62580 + }, + { + "epoch": 2.7821487309419033, + "grad_norm": 0.12714113295078278, + "learning_rate": 1.2400288347908073e-05, + "loss": 0.2548, + "step": 62590 + }, + { + "epoch": 2.7825932346535094, + "grad_norm": 0.1323811113834381, + "learning_rate": 1.2386667595943663e-05, + "loss": 0.2536, + "step": 62600 + }, + { + "epoch": 2.7830377383651155, + "grad_norm": 0.14456871151924133, + "learning_rate": 1.2373053271069035e-05, + "loss": 0.258, + "step": 62610 + }, + { + "epoch": 2.7834822420767216, + "grad_norm": 0.12397652119398117, + "learning_rate": 1.2359445375610501e-05, + "loss": 0.2549, + "step": 62620 + }, + { + "epoch": 2.783926745788327, + "grad_norm": 0.11689791828393936, + "learning_rate": 1.2345843911893301e-05, + "loss": 0.2567, + "step": 62630 + }, + { + "epoch": 2.7843712494999333, + "grad_norm": 0.1174652948975563, + "learning_rate": 1.2332248882241498e-05, + "loss": 0.255, + "step": 62640 + }, + { + "epoch": 2.7848157532115394, + "grad_norm": 0.13487201929092407, + "learning_rate": 1.2318660288978178e-05, + "loss": 0.2594, + "step": 62650 + }, + { + "epoch": 2.785260256923145, + "grad_norm": 0.15677624940872192, + "learning_rate": 1.2305078134425213e-05, + "loss": 0.2552, + "step": 62660 + }, + { + "epoch": 2.785704760634751, + "grad_norm": 0.13233058154582977, + "learning_rate": 1.2291502420903434e-05, + "loss": 0.2559, + "step": 62670 + }, + { + "epoch": 2.7861492643463572, + "grad_norm": 0.12954600155353546, + "learning_rate": 1.2277933150732567e-05, + "loss": 0.2563, + "step": 62680 + }, + { + "epoch": 2.7865937680579633, + "grad_norm": 0.15349194407463074, + "learning_rate": 1.2264370326231216e-05, + "loss": 0.2564, + "step": 62690 + }, + { + "epoch": 2.7870382717695694, + "grad_norm": 0.1337401121854782, + "learning_rate": 1.2250813949716927e-05, + "loss": 0.2531, + "step": 62700 + }, + { + "epoch": 2.787482775481175, + "grad_norm": 0.13146692514419556, + "learning_rate": 1.2237264023506063e-05, + "loss": 0.257, + "step": 62710 + }, + { + "epoch": 2.787927279192781, + "grad_norm": 0.11806964129209518, + "learning_rate": 1.2223720549913987e-05, + "loss": 0.256, + "step": 62720 + }, + { + "epoch": 2.7883717829043873, + "grad_norm": 0.12298361212015152, + "learning_rate": 1.221018353125487e-05, + "loss": 0.2557, + "step": 62730 + }, + { + "epoch": 2.7888162866159933, + "grad_norm": 0.1276666820049286, + "learning_rate": 1.2196652969841837e-05, + "loss": 0.2561, + "step": 62740 + }, + { + "epoch": 2.7892607903275994, + "grad_norm": 0.1167319044470787, + "learning_rate": 1.2183128867986904e-05, + "loss": 0.254, + "step": 62750 + }, + { + "epoch": 2.789705294039205, + "grad_norm": 0.14112792909145355, + "learning_rate": 1.2169611228000927e-05, + "loss": 0.2543, + "step": 62760 + }, + { + "epoch": 2.790149797750811, + "grad_norm": 0.1272856444120407, + "learning_rate": 1.2156100052193752e-05, + "loss": 0.2589, + "step": 62770 + }, + { + "epoch": 2.7905943014624173, + "grad_norm": 0.1303594559431076, + "learning_rate": 1.214259534287403e-05, + "loss": 0.2554, + "step": 62780 + }, + { + "epoch": 2.7910388051740234, + "grad_norm": 0.12946777045726776, + "learning_rate": 1.2129097102349363e-05, + "loss": 0.2583, + "step": 62790 + }, + { + "epoch": 2.7914833088856295, + "grad_norm": 0.15154621005058289, + "learning_rate": 1.2115605332926227e-05, + "loss": 0.2556, + "step": 62800 + }, + { + "epoch": 2.791927812597235, + "grad_norm": 0.12768904864788055, + "learning_rate": 1.2102120036910003e-05, + "loss": 0.2544, + "step": 62810 + }, + { + "epoch": 2.792372316308841, + "grad_norm": 0.16188263893127441, + "learning_rate": 1.2088641216604956e-05, + "loss": 0.2588, + "step": 62820 + }, + { + "epoch": 2.7928168200204473, + "grad_norm": 0.14702987670898438, + "learning_rate": 1.2075168874314264e-05, + "loss": 0.2556, + "step": 62830 + }, + { + "epoch": 2.793261323732053, + "grad_norm": 0.11122041940689087, + "learning_rate": 1.2061703012339942e-05, + "loss": 0.2569, + "step": 62840 + }, + { + "epoch": 2.793705827443659, + "grad_norm": 0.1171547994017601, + "learning_rate": 1.204824363298297e-05, + "loss": 0.2566, + "step": 62850 + }, + { + "epoch": 2.794150331155265, + "grad_norm": 0.14295341074466705, + "learning_rate": 1.2034790738543173e-05, + "loss": 0.2594, + "step": 62860 + }, + { + "epoch": 2.794594834866871, + "grad_norm": 0.13201269507408142, + "learning_rate": 1.2021344331319284e-05, + "loss": 0.2522, + "step": 62870 + }, + { + "epoch": 2.7950393385784773, + "grad_norm": 0.1252588927745819, + "learning_rate": 1.200790441360894e-05, + "loss": 0.2548, + "step": 62880 + }, + { + "epoch": 2.795483842290083, + "grad_norm": 0.15031534433364868, + "learning_rate": 1.199447098770864e-05, + "loss": 0.2573, + "step": 62890 + }, + { + "epoch": 2.795928346001689, + "grad_norm": 0.14988720417022705, + "learning_rate": 1.198104405591381e-05, + "loss": 0.2564, + "step": 62900 + }, + { + "epoch": 2.796372849713295, + "grad_norm": 0.12672708928585052, + "learning_rate": 1.1967623620518697e-05, + "loss": 0.2548, + "step": 62910 + }, + { + "epoch": 2.7968173534249012, + "grad_norm": 0.12861424684524536, + "learning_rate": 1.1954209683816554e-05, + "loss": 0.2546, + "step": 62920 + }, + { + "epoch": 2.7972618571365073, + "grad_norm": 0.1351245790719986, + "learning_rate": 1.1940802248099402e-05, + "loss": 0.2574, + "step": 62930 + }, + { + "epoch": 2.797706360848113, + "grad_norm": 0.150813490152359, + "learning_rate": 1.1927401315658232e-05, + "loss": 0.2566, + "step": 62940 + }, + { + "epoch": 2.798150864559719, + "grad_norm": 0.13067643344402313, + "learning_rate": 1.1914006888782898e-05, + "loss": 0.2555, + "step": 62950 + }, + { + "epoch": 2.798595368271325, + "grad_norm": 0.11259137839078903, + "learning_rate": 1.1900618969762106e-05, + "loss": 0.2565, + "step": 62960 + }, + { + "epoch": 2.799039871982931, + "grad_norm": 0.14159703254699707, + "learning_rate": 1.188723756088354e-05, + "loss": 0.2557, + "step": 62970 + }, + { + "epoch": 2.799484375694537, + "grad_norm": 0.15320423245429993, + "learning_rate": 1.1873862664433672e-05, + "loss": 0.2536, + "step": 62980 + }, + { + "epoch": 2.799928879406143, + "grad_norm": 0.1333671659231186, + "learning_rate": 1.1860494282697927e-05, + "loss": 0.2539, + "step": 62990 + }, + { + "epoch": 2.800373383117749, + "grad_norm": 0.1333533376455307, + "learning_rate": 1.1847132417960588e-05, + "loss": 0.2555, + "step": 63000 + }, + { + "epoch": 2.800817886829355, + "grad_norm": 0.12730073928833008, + "learning_rate": 1.1833777072504832e-05, + "loss": 0.2569, + "step": 63010 + }, + { + "epoch": 2.801262390540961, + "grad_norm": 0.17193251848220825, + "learning_rate": 1.1820428248612731e-05, + "loss": 0.2566, + "step": 63020 + }, + { + "epoch": 2.801706894252567, + "grad_norm": 0.1579066962003708, + "learning_rate": 1.1807085948565245e-05, + "loss": 0.2572, + "step": 63030 + }, + { + "epoch": 2.802151397964173, + "grad_norm": 0.15019047260284424, + "learning_rate": 1.1793750174642172e-05, + "loss": 0.2548, + "step": 63040 + }, + { + "epoch": 2.802595901675779, + "grad_norm": 0.136814683675766, + "learning_rate": 1.1780420929122254e-05, + "loss": 0.2547, + "step": 63050 + }, + { + "epoch": 2.803040405387385, + "grad_norm": 0.1588737964630127, + "learning_rate": 1.176709821428309e-05, + "loss": 0.256, + "step": 63060 + }, + { + "epoch": 2.803484909098991, + "grad_norm": 0.1353418081998825, + "learning_rate": 1.1753782032401173e-05, + "loss": 0.256, + "step": 63070 + }, + { + "epoch": 2.803929412810597, + "grad_norm": 0.1253369301557541, + "learning_rate": 1.1740472385751866e-05, + "loss": 0.2544, + "step": 63080 + }, + { + "epoch": 2.804373916522203, + "grad_norm": 0.1431816816329956, + "learning_rate": 1.172716927660943e-05, + "loss": 0.2569, + "step": 63090 + }, + { + "epoch": 2.804818420233809, + "grad_norm": 0.13966424763202667, + "learning_rate": 1.1713872707247015e-05, + "loss": 0.2571, + "step": 63100 + }, + { + "epoch": 2.805262923945415, + "grad_norm": 0.12568022310733795, + "learning_rate": 1.1700582679936595e-05, + "loss": 0.2563, + "step": 63110 + }, + { + "epoch": 2.805707427657021, + "grad_norm": 0.1322663575410843, + "learning_rate": 1.1687299196949136e-05, + "loss": 0.2553, + "step": 63120 + }, + { + "epoch": 2.806151931368627, + "grad_norm": 0.14530900120735168, + "learning_rate": 1.1674022260554374e-05, + "loss": 0.2565, + "step": 63130 + }, + { + "epoch": 2.806596435080233, + "grad_norm": 0.13145340979099274, + "learning_rate": 1.1660751873020987e-05, + "loss": 0.2555, + "step": 63140 + }, + { + "epoch": 2.8070409387918387, + "grad_norm": 0.16346955299377441, + "learning_rate": 1.1647488036616538e-05, + "loss": 0.2564, + "step": 63150 + }, + { + "epoch": 2.807485442503445, + "grad_norm": 0.12269958853721619, + "learning_rate": 1.1634230753607417e-05, + "loss": 0.2537, + "step": 63160 + }, + { + "epoch": 2.807929946215051, + "grad_norm": 0.1454920470714569, + "learning_rate": 1.1620980026258982e-05, + "loss": 0.258, + "step": 63170 + }, + { + "epoch": 2.808374449926657, + "grad_norm": 0.11636282503604889, + "learning_rate": 1.1607735856835373e-05, + "loss": 0.2571, + "step": 63180 + }, + { + "epoch": 2.808818953638263, + "grad_norm": 0.15203791856765747, + "learning_rate": 1.1594498247599677e-05, + "loss": 0.2553, + "step": 63190 + }, + { + "epoch": 2.8092634573498687, + "grad_norm": 0.15028367936611176, + "learning_rate": 1.158126720081384e-05, + "loss": 0.2551, + "step": 63200 + }, + { + "epoch": 2.809707961061475, + "grad_norm": 0.13243798911571503, + "learning_rate": 1.156804271873868e-05, + "loss": 0.2539, + "step": 63210 + }, + { + "epoch": 2.810152464773081, + "grad_norm": 0.1458691656589508, + "learning_rate": 1.1554824803633924e-05, + "loss": 0.2585, + "step": 63220 + }, + { + "epoch": 2.810596968484687, + "grad_norm": 0.14995956420898438, + "learning_rate": 1.154161345775811e-05, + "loss": 0.2566, + "step": 63230 + }, + { + "epoch": 2.811041472196293, + "grad_norm": 0.15080170333385468, + "learning_rate": 1.1528408683368724e-05, + "loss": 0.253, + "step": 63240 + }, + { + "epoch": 2.8114859759078987, + "grad_norm": 0.1334092915058136, + "learning_rate": 1.1515210482722088e-05, + "loss": 0.2576, + "step": 63250 + }, + { + "epoch": 2.811930479619505, + "grad_norm": 0.1409965604543686, + "learning_rate": 1.150201885807342e-05, + "loss": 0.2584, + "step": 63260 + }, + { + "epoch": 2.812374983331111, + "grad_norm": 0.1522947996854782, + "learning_rate": 1.1488833811676807e-05, + "loss": 0.2563, + "step": 63270 + }, + { + "epoch": 2.8128194870427166, + "grad_norm": 0.16050484776496887, + "learning_rate": 1.1475655345785213e-05, + "loss": 0.256, + "step": 63280 + }, + { + "epoch": 2.8132639907543227, + "grad_norm": 0.11865438520908356, + "learning_rate": 1.1462483462650481e-05, + "loss": 0.2559, + "step": 63290 + }, + { + "epoch": 2.8137084944659287, + "grad_norm": 0.1424335092306137, + "learning_rate": 1.1449318164523331e-05, + "loss": 0.2554, + "step": 63300 + }, + { + "epoch": 2.814152998177535, + "grad_norm": 0.13947327435016632, + "learning_rate": 1.1436159453653334e-05, + "loss": 0.2544, + "step": 63310 + }, + { + "epoch": 2.814597501889141, + "grad_norm": 0.12343110889196396, + "learning_rate": 1.1423007332288955e-05, + "loss": 0.256, + "step": 63320 + }, + { + "epoch": 2.8150420056007466, + "grad_norm": 0.13110966980457306, + "learning_rate": 1.1409861802677546e-05, + "loss": 0.2541, + "step": 63330 + }, + { + "epoch": 2.8154865093123527, + "grad_norm": 0.12823829054832458, + "learning_rate": 1.1396722867065313e-05, + "loss": 0.2577, + "step": 63340 + }, + { + "epoch": 2.8159310130239588, + "grad_norm": 0.13070672750473022, + "learning_rate": 1.1383590527697352e-05, + "loss": 0.2592, + "step": 63350 + }, + { + "epoch": 2.816375516735565, + "grad_norm": 0.12779347598552704, + "learning_rate": 1.137046478681758e-05, + "loss": 0.2558, + "step": 63360 + }, + { + "epoch": 2.816820020447171, + "grad_norm": 0.10901453346014023, + "learning_rate": 1.1357345646668888e-05, + "loss": 0.255, + "step": 63370 + }, + { + "epoch": 2.8172645241587766, + "grad_norm": 0.15220139920711517, + "learning_rate": 1.1344233109492924e-05, + "loss": 0.2538, + "step": 63380 + }, + { + "epoch": 2.8177090278703827, + "grad_norm": 0.12609589099884033, + "learning_rate": 1.1331127177530292e-05, + "loss": 0.2541, + "step": 63390 + }, + { + "epoch": 2.818153531581989, + "grad_norm": 0.1397913545370102, + "learning_rate": 1.1318027853020441e-05, + "loss": 0.2554, + "step": 63400 + }, + { + "epoch": 2.8185980352935944, + "grad_norm": 0.11576990783214569, + "learning_rate": 1.1304935138201645e-05, + "loss": 0.2563, + "step": 63410 + }, + { + "epoch": 2.819042539005201, + "grad_norm": 0.12555550038814545, + "learning_rate": 1.1291849035311153e-05, + "loss": 0.2556, + "step": 63420 + }, + { + "epoch": 2.8194870427168066, + "grad_norm": 0.12562263011932373, + "learning_rate": 1.1278769546584972e-05, + "loss": 0.2547, + "step": 63430 + }, + { + "epoch": 2.8199315464284127, + "grad_norm": 0.14232808351516724, + "learning_rate": 1.1265696674258052e-05, + "loss": 0.2558, + "step": 63440 + }, + { + "epoch": 2.820376050140019, + "grad_norm": 0.1477234810590744, + "learning_rate": 1.1252630420564186e-05, + "loss": 0.2546, + "step": 63450 + }, + { + "epoch": 2.8208205538516244, + "grad_norm": 0.13321831822395325, + "learning_rate": 1.1239570787736036e-05, + "loss": 0.2543, + "step": 63460 + }, + { + "epoch": 2.8212650575632305, + "grad_norm": 0.12979312241077423, + "learning_rate": 1.1226517778005135e-05, + "loss": 0.2555, + "step": 63470 + }, + { + "epoch": 2.8217095612748366, + "grad_norm": 0.13257771730422974, + "learning_rate": 1.1213471393601893e-05, + "loss": 0.2541, + "step": 63480 + }, + { + "epoch": 2.8221540649864427, + "grad_norm": 0.12471724301576614, + "learning_rate": 1.1200431636755587e-05, + "loss": 0.2536, + "step": 63490 + }, + { + "epoch": 2.822598568698049, + "grad_norm": 0.15038685500621796, + "learning_rate": 1.1187398509694336e-05, + "loss": 0.2586, + "step": 63500 + }, + { + "epoch": 2.8230430724096545, + "grad_norm": 0.11786221712827682, + "learning_rate": 1.1174372014645146e-05, + "loss": 0.2531, + "step": 63510 + }, + { + "epoch": 2.8234875761212606, + "grad_norm": 0.14386963844299316, + "learning_rate": 1.1161352153833899e-05, + "loss": 0.2542, + "step": 63520 + }, + { + "epoch": 2.8239320798328666, + "grad_norm": 0.1337093710899353, + "learning_rate": 1.1148338929485325e-05, + "loss": 0.2554, + "step": 63530 + }, + { + "epoch": 2.8243765835444727, + "grad_norm": 0.1333010047674179, + "learning_rate": 1.113533234382304e-05, + "loss": 0.2534, + "step": 63540 + }, + { + "epoch": 2.824821087256079, + "grad_norm": 0.13655030727386475, + "learning_rate": 1.1122332399069513e-05, + "loss": 0.2551, + "step": 63550 + }, + { + "epoch": 2.8252655909676845, + "grad_norm": 0.12807141244411469, + "learning_rate": 1.1109339097446047e-05, + "loss": 0.254, + "step": 63560 + }, + { + "epoch": 2.8257100946792906, + "grad_norm": 0.13458430767059326, + "learning_rate": 1.1096352441172897e-05, + "loss": 0.2576, + "step": 63570 + }, + { + "epoch": 2.8261545983908967, + "grad_norm": 0.12475736439228058, + "learning_rate": 1.1083372432469086e-05, + "loss": 0.258, + "step": 63580 + }, + { + "epoch": 2.8265991021025023, + "grad_norm": 0.1287587434053421, + "learning_rate": 1.107039907355255e-05, + "loss": 0.2579, + "step": 63590 + }, + { + "epoch": 2.8270436058141084, + "grad_norm": 0.12165447324514389, + "learning_rate": 1.1057432366640103e-05, + "loss": 0.2525, + "step": 63600 + }, + { + "epoch": 2.8274881095257145, + "grad_norm": 0.1550668627023697, + "learning_rate": 1.1044472313947352e-05, + "loss": 0.2569, + "step": 63610 + }, + { + "epoch": 2.8279326132373206, + "grad_norm": 0.13330033421516418, + "learning_rate": 1.1031518917688877e-05, + "loss": 0.258, + "step": 63620 + }, + { + "epoch": 2.8283771169489267, + "grad_norm": 0.13412053883075714, + "learning_rate": 1.1018572180078007e-05, + "loss": 0.2567, + "step": 63630 + }, + { + "epoch": 2.8288216206605323, + "grad_norm": 0.11880525201559067, + "learning_rate": 1.1005632103327018e-05, + "loss": 0.2575, + "step": 63640 + }, + { + "epoch": 2.8292661243721384, + "grad_norm": 0.11774944514036179, + "learning_rate": 1.0992698689646996e-05, + "loss": 0.256, + "step": 63650 + }, + { + "epoch": 2.8297106280837445, + "grad_norm": 0.1153167188167572, + "learning_rate": 1.0979771941247919e-05, + "loss": 0.2548, + "step": 63660 + }, + { + "epoch": 2.8301551317953506, + "grad_norm": 0.130568265914917, + "learning_rate": 1.0966851860338611e-05, + "loss": 0.2569, + "step": 63670 + }, + { + "epoch": 2.8305996355069567, + "grad_norm": 0.11942778527736664, + "learning_rate": 1.0953938449126766e-05, + "loss": 0.2569, + "step": 63680 + }, + { + "epoch": 2.8310441392185623, + "grad_norm": 0.1369045227766037, + "learning_rate": 1.0941031709818933e-05, + "loss": 0.255, + "step": 63690 + }, + { + "epoch": 2.8314886429301684, + "grad_norm": 0.13938406109809875, + "learning_rate": 1.0928131644620509e-05, + "loss": 0.258, + "step": 63700 + }, + { + "epoch": 2.8319331466417745, + "grad_norm": 0.10413464903831482, + "learning_rate": 1.0915238255735766e-05, + "loss": 0.2573, + "step": 63710 + }, + { + "epoch": 2.83237765035338, + "grad_norm": 0.11461620777845383, + "learning_rate": 1.0902351545367833e-05, + "loss": 0.2585, + "step": 63720 + }, + { + "epoch": 2.8328221540649867, + "grad_norm": 0.14079731702804565, + "learning_rate": 1.0889471515718702e-05, + "loss": 0.2552, + "step": 63730 + }, + { + "epoch": 2.8332666577765924, + "grad_norm": 0.14217177033424377, + "learning_rate": 1.087659816898921e-05, + "loss": 0.2548, + "step": 63740 + }, + { + "epoch": 2.8337111614881985, + "grad_norm": 0.13470888137817383, + "learning_rate": 1.0863731507379082e-05, + "loss": 0.2557, + "step": 63750 + }, + { + "epoch": 2.8341556651998046, + "grad_norm": 0.13697253167629242, + "learning_rate": 1.0850871533086827e-05, + "loss": 0.2568, + "step": 63760 + }, + { + "epoch": 2.83460016891141, + "grad_norm": 0.12988504767417908, + "learning_rate": 1.0838018248309927e-05, + "loss": 0.2572, + "step": 63770 + }, + { + "epoch": 2.8350446726230163, + "grad_norm": 0.12620776891708374, + "learning_rate": 1.0825171655244615e-05, + "loss": 0.2559, + "step": 63780 + }, + { + "epoch": 2.8354891763346224, + "grad_norm": 0.12546566128730774, + "learning_rate": 1.0812331756086025e-05, + "loss": 0.2584, + "step": 63790 + }, + { + "epoch": 2.8359336800462285, + "grad_norm": 0.11935106664896011, + "learning_rate": 1.079949855302817e-05, + "loss": 0.2537, + "step": 63800 + }, + { + "epoch": 2.8363781837578346, + "grad_norm": 0.11679573357105255, + "learning_rate": 1.0786672048263852e-05, + "loss": 0.2539, + "step": 63810 + }, + { + "epoch": 2.83682268746944, + "grad_norm": 0.12475090473890305, + "learning_rate": 1.0773852243984817e-05, + "loss": 0.2585, + "step": 63820 + }, + { + "epoch": 2.8372671911810463, + "grad_norm": 0.12941418588161469, + "learning_rate": 1.0761039142381586e-05, + "loss": 0.2553, + "step": 63830 + }, + { + "epoch": 2.8377116948926524, + "grad_norm": 0.1375601589679718, + "learning_rate": 1.0748232745643577e-05, + "loss": 0.2538, + "step": 63840 + }, + { + "epoch": 2.8381561986042585, + "grad_norm": 0.13288824260234833, + "learning_rate": 1.0735433055959055e-05, + "loss": 0.2567, + "step": 63850 + }, + { + "epoch": 2.8386007023158646, + "grad_norm": 0.10630451887845993, + "learning_rate": 1.0722640075515133e-05, + "loss": 0.2539, + "step": 63860 + }, + { + "epoch": 2.8390452060274702, + "grad_norm": 0.1227797195315361, + "learning_rate": 1.0709853806497795e-05, + "loss": 0.2556, + "step": 63870 + }, + { + "epoch": 2.8394897097390763, + "grad_norm": 0.1346018761396408, + "learning_rate": 1.0697074251091831e-05, + "loss": 0.257, + "step": 63880 + }, + { + "epoch": 2.8399342134506824, + "grad_norm": 0.12933467328548431, + "learning_rate": 1.0684301411480962e-05, + "loss": 0.2561, + "step": 63890 + }, + { + "epoch": 2.840378717162288, + "grad_norm": 0.12182633578777313, + "learning_rate": 1.067153528984769e-05, + "loss": 0.2563, + "step": 63900 + }, + { + "epoch": 2.840823220873894, + "grad_norm": 0.12310411036014557, + "learning_rate": 1.0658775888373395e-05, + "loss": 0.2552, + "step": 63910 + }, + { + "epoch": 2.8412677245855003, + "grad_norm": 0.11798466742038727, + "learning_rate": 1.0646023209238314e-05, + "loss": 0.2548, + "step": 63920 + }, + { + "epoch": 2.8417122282971063, + "grad_norm": 0.13202199339866638, + "learning_rate": 1.0633277254621537e-05, + "loss": 0.2547, + "step": 63930 + }, + { + "epoch": 2.8421567320087124, + "grad_norm": 0.14631988108158112, + "learning_rate": 1.0620538026700994e-05, + "loss": 0.2571, + "step": 63940 + }, + { + "epoch": 2.842601235720318, + "grad_norm": 0.4040316641330719, + "learning_rate": 1.0607805527653486e-05, + "loss": 0.2582, + "step": 63950 + }, + { + "epoch": 2.843045739431924, + "grad_norm": 0.1613142490386963, + "learning_rate": 1.059507975965462e-05, + "loss": 0.2547, + "step": 63960 + }, + { + "epoch": 2.8434902431435303, + "grad_norm": 0.15043915808200836, + "learning_rate": 1.0582360724878898e-05, + "loss": 0.2568, + "step": 63970 + }, + { + "epoch": 2.8439347468551364, + "grad_norm": 0.1090584397315979, + "learning_rate": 1.0569648425499651e-05, + "loss": 0.2558, + "step": 63980 + }, + { + "epoch": 2.8443792505667425, + "grad_norm": 0.127759650349617, + "learning_rate": 1.0556942863689063e-05, + "loss": 0.2554, + "step": 63990 + }, + { + "epoch": 2.844823754278348, + "grad_norm": 0.11831694096326828, + "learning_rate": 1.054424404161819e-05, + "loss": 0.255, + "step": 64000 + }, + { + "epoch": 2.845268257989954, + "grad_norm": 0.1408146321773529, + "learning_rate": 1.053155196145686e-05, + "loss": 0.2598, + "step": 64010 + }, + { + "epoch": 2.8457127617015603, + "grad_norm": 0.1453629583120346, + "learning_rate": 1.0518866625373863e-05, + "loss": 0.2565, + "step": 64020 + }, + { + "epoch": 2.846157265413166, + "grad_norm": 0.11263525485992432, + "learning_rate": 1.0506188035536735e-05, + "loss": 0.2552, + "step": 64030 + }, + { + "epoch": 2.846601769124772, + "grad_norm": 0.1272595375776291, + "learning_rate": 1.0493516194111919e-05, + "loss": 0.2569, + "step": 64040 + }, + { + "epoch": 2.847046272836378, + "grad_norm": 0.14213202893733978, + "learning_rate": 1.0480851103264688e-05, + "loss": 0.2541, + "step": 64050 + }, + { + "epoch": 2.847490776547984, + "grad_norm": 0.14021563529968262, + "learning_rate": 1.046819276515913e-05, + "loss": 0.2562, + "step": 64060 + }, + { + "epoch": 2.8479352802595903, + "grad_norm": 0.1346338540315628, + "learning_rate": 1.0455541181958256e-05, + "loss": 0.2579, + "step": 64070 + }, + { + "epoch": 2.848379783971196, + "grad_norm": 0.13135847449302673, + "learning_rate": 1.0442896355823822e-05, + "loss": 0.2564, + "step": 64080 + }, + { + "epoch": 2.848824287682802, + "grad_norm": 0.1352503001689911, + "learning_rate": 1.0430258288916539e-05, + "loss": 0.2529, + "step": 64090 + }, + { + "epoch": 2.849268791394408, + "grad_norm": 0.14660795032978058, + "learning_rate": 1.0417626983395868e-05, + "loss": 0.255, + "step": 64100 + }, + { + "epoch": 2.8497132951060142, + "grad_norm": 0.13681839406490326, + "learning_rate": 1.0405002441420165e-05, + "loss": 0.2533, + "step": 64110 + }, + { + "epoch": 2.8501577988176203, + "grad_norm": 0.13104751706123352, + "learning_rate": 1.039238466514662e-05, + "loss": 0.2572, + "step": 64120 + }, + { + "epoch": 2.850602302529226, + "grad_norm": 0.14106346666812897, + "learning_rate": 1.0379773656731262e-05, + "loss": 0.2566, + "step": 64130 + }, + { + "epoch": 2.851046806240832, + "grad_norm": 0.12333682179450989, + "learning_rate": 1.0367169418328986e-05, + "loss": 0.2572, + "step": 64140 + }, + { + "epoch": 2.851491309952438, + "grad_norm": 0.136954203248024, + "learning_rate": 1.0354571952093484e-05, + "loss": 0.2563, + "step": 64150 + }, + { + "epoch": 2.8519358136640443, + "grad_norm": 0.12138081341981888, + "learning_rate": 1.034198126017733e-05, + "loss": 0.2555, + "step": 64160 + }, + { + "epoch": 2.8523803173756503, + "grad_norm": 0.13808055222034454, + "learning_rate": 1.0329397344731928e-05, + "loss": 0.2552, + "step": 64170 + }, + { + "epoch": 2.852824821087256, + "grad_norm": 0.14751094579696655, + "learning_rate": 1.0316820207907524e-05, + "loss": 0.2566, + "step": 64180 + }, + { + "epoch": 2.853269324798862, + "grad_norm": 0.14520004391670227, + "learning_rate": 1.0304249851853215e-05, + "loss": 0.2565, + "step": 64190 + }, + { + "epoch": 2.853713828510468, + "grad_norm": 0.13006874918937683, + "learning_rate": 1.0291686278716933e-05, + "loss": 0.2572, + "step": 64200 + }, + { + "epoch": 2.854158332222074, + "grad_norm": 0.12765175104141235, + "learning_rate": 1.0279129490645418e-05, + "loss": 0.2533, + "step": 64210 + }, + { + "epoch": 2.85460283593368, + "grad_norm": 0.1255061775445938, + "learning_rate": 1.0266579489784328e-05, + "loss": 0.2565, + "step": 64220 + }, + { + "epoch": 2.855047339645286, + "grad_norm": 0.13368889689445496, + "learning_rate": 1.0254036278278084e-05, + "loss": 0.2552, + "step": 64230 + }, + { + "epoch": 2.855491843356892, + "grad_norm": 0.12289457768201828, + "learning_rate": 1.0241499858269982e-05, + "loss": 0.2569, + "step": 64240 + }, + { + "epoch": 2.855936347068498, + "grad_norm": 0.13574466109275818, + "learning_rate": 1.0228970231902169e-05, + "loss": 0.2569, + "step": 64250 + }, + { + "epoch": 2.856380850780104, + "grad_norm": 0.13716816902160645, + "learning_rate": 1.0216447401315582e-05, + "loss": 0.2558, + "step": 64260 + }, + { + "epoch": 2.85682535449171, + "grad_norm": 0.11387615650892258, + "learning_rate": 1.020393136865007e-05, + "loss": 0.254, + "step": 64270 + }, + { + "epoch": 2.857269858203316, + "grad_norm": 0.1516907960176468, + "learning_rate": 1.0191422136044242e-05, + "loss": 0.2585, + "step": 64280 + }, + { + "epoch": 2.857714361914922, + "grad_norm": 0.12930981814861298, + "learning_rate": 1.017891970563563e-05, + "loss": 0.257, + "step": 64290 + }, + { + "epoch": 2.858158865626528, + "grad_norm": 0.1972431242465973, + "learning_rate": 1.0166424079560516e-05, + "loss": 0.2546, + "step": 64300 + }, + { + "epoch": 2.858603369338134, + "grad_norm": 0.14386942982673645, + "learning_rate": 1.0153935259954078e-05, + "loss": 0.2558, + "step": 64310 + }, + { + "epoch": 2.85904787304974, + "grad_norm": 0.24442940950393677, + "learning_rate": 1.0141453248950311e-05, + "loss": 0.2555, + "step": 64320 + }, + { + "epoch": 2.859492376761346, + "grad_norm": 0.12646692991256714, + "learning_rate": 1.0128978048682054e-05, + "loss": 0.253, + "step": 64330 + }, + { + "epoch": 2.8599368804729517, + "grad_norm": 0.14000754058361053, + "learning_rate": 1.0116509661280982e-05, + "loss": 0.2563, + "step": 64340 + }, + { + "epoch": 2.860381384184558, + "grad_norm": 0.11448907107114792, + "learning_rate": 1.0104048088877576e-05, + "loss": 0.2528, + "step": 64350 + }, + { + "epoch": 2.860825887896164, + "grad_norm": 0.14500665664672852, + "learning_rate": 1.0091593333601201e-05, + "loss": 0.2547, + "step": 64360 + }, + { + "epoch": 2.86127039160777, + "grad_norm": 0.12474451959133148, + "learning_rate": 1.0079145397580031e-05, + "loss": 0.2537, + "step": 64370 + }, + { + "epoch": 2.861714895319376, + "grad_norm": 0.12112493067979813, + "learning_rate": 1.006670428294107e-05, + "loss": 0.2565, + "step": 64380 + }, + { + "epoch": 2.8621593990309817, + "grad_norm": 0.122945636510849, + "learning_rate": 1.0054269991810166e-05, + "loss": 0.2571, + "step": 64390 + }, + { + "epoch": 2.862603902742588, + "grad_norm": 0.13343513011932373, + "learning_rate": 1.0041842526312024e-05, + "loss": 0.2581, + "step": 64400 + }, + { + "epoch": 2.863048406454194, + "grad_norm": 0.16124536097049713, + "learning_rate": 1.0029421888570101e-05, + "loss": 0.2571, + "step": 64410 + }, + { + "epoch": 2.8634929101658, + "grad_norm": 0.11592745780944824, + "learning_rate": 1.0017008080706813e-05, + "loss": 0.2537, + "step": 64420 + }, + { + "epoch": 2.863937413877406, + "grad_norm": 0.12351148575544357, + "learning_rate": 1.0004601104843287e-05, + "loss": 0.2587, + "step": 64430 + }, + { + "epoch": 2.8643819175890117, + "grad_norm": 0.12221788614988327, + "learning_rate": 9.992200963099562e-06, + "loss": 0.255, + "step": 64440 + }, + { + "epoch": 2.864826421300618, + "grad_norm": 0.12837062776088715, + "learning_rate": 9.979807657594486e-06, + "loss": 0.254, + "step": 64450 + }, + { + "epoch": 2.865270925012224, + "grad_norm": 0.1525469273328781, + "learning_rate": 9.967421190445703e-06, + "loss": 0.2577, + "step": 64460 + }, + { + "epoch": 2.86571542872383, + "grad_norm": 0.15151654183864594, + "learning_rate": 9.955041563769769e-06, + "loss": 0.2563, + "step": 64470 + }, + { + "epoch": 2.866159932435436, + "grad_norm": 0.1449752002954483, + "learning_rate": 9.942668779681974e-06, + "loss": 0.2557, + "step": 64480 + }, + { + "epoch": 2.8666044361470417, + "grad_norm": 0.12228480726480484, + "learning_rate": 9.930302840296541e-06, + "loss": 0.2532, + "step": 64490 + }, + { + "epoch": 2.867048939858648, + "grad_norm": 0.11693688482046127, + "learning_rate": 9.917943747726426e-06, + "loss": 0.255, + "step": 64500 + }, + { + "epoch": 2.867493443570254, + "grad_norm": 0.11584945023059845, + "learning_rate": 9.905591504083484e-06, + "loss": 0.2556, + "step": 64510 + }, + { + "epoch": 2.8679379472818596, + "grad_norm": 0.130958691239357, + "learning_rate": 9.893246111478382e-06, + "loss": 0.2532, + "step": 64520 + }, + { + "epoch": 2.8683824509934657, + "grad_norm": 0.13076837360858917, + "learning_rate": 9.88090757202057e-06, + "loss": 0.2576, + "step": 64530 + }, + { + "epoch": 2.8688269547050718, + "grad_norm": 0.11855817586183548, + "learning_rate": 9.868575887818421e-06, + "loss": 0.2559, + "step": 64540 + }, + { + "epoch": 2.869271458416678, + "grad_norm": 0.13542991876602173, + "learning_rate": 9.856251060979044e-06, + "loss": 0.2539, + "step": 64550 + }, + { + "epoch": 2.869715962128284, + "grad_norm": 0.139699324965477, + "learning_rate": 9.843933093608426e-06, + "loss": 0.2548, + "step": 64560 + }, + { + "epoch": 2.8701604658398896, + "grad_norm": 0.17216801643371582, + "learning_rate": 9.831621987811368e-06, + "loss": 0.2551, + "step": 64570 + }, + { + "epoch": 2.8706049695514957, + "grad_norm": 0.13656838238239288, + "learning_rate": 9.819317745691509e-06, + "loss": 0.2575, + "step": 64580 + }, + { + "epoch": 2.871049473263102, + "grad_norm": 0.12030531466007233, + "learning_rate": 9.8070203693513e-06, + "loss": 0.2546, + "step": 64590 + }, + { + "epoch": 2.871493976974708, + "grad_norm": 0.11601200699806213, + "learning_rate": 9.794729860892048e-06, + "loss": 0.2578, + "step": 64600 + }, + { + "epoch": 2.871938480686314, + "grad_norm": 0.13619042932987213, + "learning_rate": 9.782446222413827e-06, + "loss": 0.2539, + "step": 64610 + }, + { + "epoch": 2.8723829843979196, + "grad_norm": 0.12248896062374115, + "learning_rate": 9.770169456015598e-06, + "loss": 0.2558, + "step": 64620 + }, + { + "epoch": 2.8728274881095257, + "grad_norm": 0.11764942109584808, + "learning_rate": 9.75789956379512e-06, + "loss": 0.2553, + "step": 64630 + }, + { + "epoch": 2.873271991821132, + "grad_norm": 0.1205664649605751, + "learning_rate": 9.74563654784898e-06, + "loss": 0.2563, + "step": 64640 + }, + { + "epoch": 2.8737164955327374, + "grad_norm": 0.10287076234817505, + "learning_rate": 9.733380410272596e-06, + "loss": 0.254, + "step": 64650 + }, + { + "epoch": 2.8741609992443435, + "grad_norm": 0.12996535003185272, + "learning_rate": 9.721131153160207e-06, + "loss": 0.2564, + "step": 64660 + }, + { + "epoch": 2.8746055029559496, + "grad_norm": 0.14775128662586212, + "learning_rate": 9.708888778604886e-06, + "loss": 0.2558, + "step": 64670 + }, + { + "epoch": 2.8750500066675557, + "grad_norm": 0.14043028652668, + "learning_rate": 9.696653288698488e-06, + "loss": 0.256, + "step": 64680 + }, + { + "epoch": 2.875494510379162, + "grad_norm": 0.1451343595981598, + "learning_rate": 9.684424685531763e-06, + "loss": 0.2567, + "step": 64690 + }, + { + "epoch": 2.8759390140907675, + "grad_norm": 0.138384148478508, + "learning_rate": 9.672202971194216e-06, + "loss": 0.2562, + "step": 64700 + }, + { + "epoch": 2.8763835178023736, + "grad_norm": 0.10997457802295685, + "learning_rate": 9.659988147774213e-06, + "loss": 0.2551, + "step": 64710 + }, + { + "epoch": 2.8768280215139796, + "grad_norm": 0.12886185944080353, + "learning_rate": 9.647780217358942e-06, + "loss": 0.2558, + "step": 64720 + }, + { + "epoch": 2.8772725252255857, + "grad_norm": 0.11559990793466568, + "learning_rate": 9.635579182034376e-06, + "loss": 0.2557, + "step": 64730 + }, + { + "epoch": 2.877717028937192, + "grad_norm": 0.10695246607065201, + "learning_rate": 9.623385043885386e-06, + "loss": 0.2559, + "step": 64740 + }, + { + "epoch": 2.8781615326487975, + "grad_norm": 0.12435571104288101, + "learning_rate": 9.61119780499557e-06, + "loss": 0.2561, + "step": 64750 + }, + { + "epoch": 2.8786060363604036, + "grad_norm": 0.13456600904464722, + "learning_rate": 9.599017467447418e-06, + "loss": 0.2548, + "step": 64760 + }, + { + "epoch": 2.8790505400720097, + "grad_norm": 0.12548375129699707, + "learning_rate": 9.586844033322206e-06, + "loss": 0.2557, + "step": 64770 + }, + { + "epoch": 2.8794950437836153, + "grad_norm": 0.14965835213661194, + "learning_rate": 9.574677504700052e-06, + "loss": 0.2558, + "step": 64780 + }, + { + "epoch": 2.879939547495222, + "grad_norm": 0.1301298439502716, + "learning_rate": 9.562517883659877e-06, + "loss": 0.2555, + "step": 64790 + }, + { + "epoch": 2.8803840512068275, + "grad_norm": 0.1499919891357422, + "learning_rate": 9.55036517227944e-06, + "loss": 0.2561, + "step": 64800 + }, + { + "epoch": 2.8808285549184336, + "grad_norm": 0.13274483382701874, + "learning_rate": 9.538219372635282e-06, + "loss": 0.2549, + "step": 64810 + }, + { + "epoch": 2.8812730586300397, + "grad_norm": 0.12167077511548996, + "learning_rate": 9.526080486802802e-06, + "loss": 0.2563, + "step": 64820 + }, + { + "epoch": 2.8817175623416453, + "grad_norm": 0.10629270225763321, + "learning_rate": 9.513948516856203e-06, + "loss": 0.2585, + "step": 64830 + }, + { + "epoch": 2.8821620660532514, + "grad_norm": 0.137064129114151, + "learning_rate": 9.501823464868504e-06, + "loss": 0.2585, + "step": 64840 + }, + { + "epoch": 2.8826065697648575, + "grad_norm": 0.12207967787981033, + "learning_rate": 9.489705332911547e-06, + "loss": 0.2564, + "step": 64850 + }, + { + "epoch": 2.8830510734764636, + "grad_norm": 0.12429552525281906, + "learning_rate": 9.477594123055994e-06, + "loss": 0.2539, + "step": 64860 + }, + { + "epoch": 2.8834955771880697, + "grad_norm": 0.14226336777210236, + "learning_rate": 9.465489837371321e-06, + "loss": 0.2565, + "step": 64870 + }, + { + "epoch": 2.8839400808996754, + "grad_norm": 0.12746192514896393, + "learning_rate": 9.453392477925794e-06, + "loss": 0.2572, + "step": 64880 + }, + { + "epoch": 2.8843845846112814, + "grad_norm": 0.13395285606384277, + "learning_rate": 9.441302046786566e-06, + "loss": 0.2537, + "step": 64890 + }, + { + "epoch": 2.8848290883228875, + "grad_norm": 0.14432312548160553, + "learning_rate": 9.429218546019519e-06, + "loss": 0.2557, + "step": 64900 + }, + { + "epoch": 2.8852735920344936, + "grad_norm": 0.15101635456085205, + "learning_rate": 9.41714197768941e-06, + "loss": 0.2528, + "step": 64910 + }, + { + "epoch": 2.8857180957460997, + "grad_norm": 0.12672428786754608, + "learning_rate": 9.405072343859805e-06, + "loss": 0.2538, + "step": 64920 + }, + { + "epoch": 2.8861625994577054, + "grad_norm": 0.12335944920778275, + "learning_rate": 9.393009646593043e-06, + "loss": 0.2559, + "step": 64930 + }, + { + "epoch": 2.8866071031693115, + "grad_norm": 0.13146507740020752, + "learning_rate": 9.38095388795035e-06, + "loss": 0.2558, + "step": 64940 + }, + { + "epoch": 2.8870516068809176, + "grad_norm": 0.11474163085222244, + "learning_rate": 9.36890506999169e-06, + "loss": 0.2554, + "step": 64950 + }, + { + "epoch": 2.887496110592523, + "grad_norm": 0.12634514272212982, + "learning_rate": 9.356863194775894e-06, + "loss": 0.2559, + "step": 64960 + }, + { + "epoch": 2.8879406143041293, + "grad_norm": 0.13746023178100586, + "learning_rate": 9.344828264360583e-06, + "loss": 0.2531, + "step": 64970 + }, + { + "epoch": 2.8883851180157354, + "grad_norm": 0.13368189334869385, + "learning_rate": 9.332800280802201e-06, + "loss": 0.2569, + "step": 64980 + }, + { + "epoch": 2.8888296217273415, + "grad_norm": 0.11957535892724991, + "learning_rate": 9.32077924615602e-06, + "loss": 0.2577, + "step": 64990 + }, + { + "epoch": 2.8892741254389476, + "grad_norm": 0.14305680990219116, + "learning_rate": 9.308765162476063e-06, + "loss": 0.2565, + "step": 65000 + }, + { + "epoch": 2.889718629150553, + "grad_norm": 0.13457539677619934, + "learning_rate": 9.296758031815239e-06, + "loss": 0.2549, + "step": 65010 + }, + { + "epoch": 2.8901631328621593, + "grad_norm": 0.1349479854106903, + "learning_rate": 9.284757856225229e-06, + "loss": 0.2539, + "step": 65020 + }, + { + "epoch": 2.8906076365737654, + "grad_norm": 0.1208711788058281, + "learning_rate": 9.272764637756538e-06, + "loss": 0.2567, + "step": 65030 + }, + { + "epoch": 2.8910521402853715, + "grad_norm": 0.13656242191791534, + "learning_rate": 9.260778378458479e-06, + "loss": 0.255, + "step": 65040 + }, + { + "epoch": 2.8914966439969776, + "grad_norm": 0.1085580438375473, + "learning_rate": 9.248799080379172e-06, + "loss": 0.2527, + "step": 65050 + }, + { + "epoch": 2.8919411477085832, + "grad_norm": 0.15797553956508636, + "learning_rate": 9.236826745565558e-06, + "loss": 0.2551, + "step": 65060 + }, + { + "epoch": 2.8923856514201893, + "grad_norm": 0.13825850188732147, + "learning_rate": 9.224861376063388e-06, + "loss": 0.2543, + "step": 65070 + }, + { + "epoch": 2.8928301551317954, + "grad_norm": 0.12305935472249985, + "learning_rate": 9.212902973917192e-06, + "loss": 0.2536, + "step": 65080 + }, + { + "epoch": 2.893274658843401, + "grad_norm": 0.15814650058746338, + "learning_rate": 9.20095154117035e-06, + "loss": 0.2554, + "step": 65090 + }, + { + "epoch": 2.8937191625550076, + "grad_norm": 0.14222799241542816, + "learning_rate": 9.189007079865036e-06, + "loss": 0.2539, + "step": 65100 + }, + { + "epoch": 2.8941636662666133, + "grad_norm": 0.1415189802646637, + "learning_rate": 9.177069592042226e-06, + "loss": 0.2553, + "step": 65110 + }, + { + "epoch": 2.8946081699782193, + "grad_norm": 0.144038125872612, + "learning_rate": 9.165139079741724e-06, + "loss": 0.2562, + "step": 65120 + }, + { + "epoch": 2.8950526736898254, + "grad_norm": 0.11915759742259979, + "learning_rate": 9.153215545002098e-06, + "loss": 0.2545, + "step": 65130 + }, + { + "epoch": 2.895497177401431, + "grad_norm": 0.1385526806116104, + "learning_rate": 9.141298989860798e-06, + "loss": 0.2564, + "step": 65140 + }, + { + "epoch": 2.895941681113037, + "grad_norm": 0.1521984040737152, + "learning_rate": 9.129389416353994e-06, + "loss": 0.2589, + "step": 65150 + }, + { + "epoch": 2.8963861848246433, + "grad_norm": 0.13599540293216705, + "learning_rate": 9.11748682651673e-06, + "loss": 0.258, + "step": 65160 + }, + { + "epoch": 2.8968306885362494, + "grad_norm": 0.21325697004795074, + "learning_rate": 9.105591222382837e-06, + "loss": 0.2574, + "step": 65170 + }, + { + "epoch": 2.8972751922478555, + "grad_norm": 0.11341888457536697, + "learning_rate": 9.093702605984915e-06, + "loss": 0.2525, + "step": 65180 + }, + { + "epoch": 2.897719695959461, + "grad_norm": 0.1475938856601715, + "learning_rate": 9.081820979354455e-06, + "loss": 0.2546, + "step": 65190 + }, + { + "epoch": 2.898164199671067, + "grad_norm": 0.1219915971159935, + "learning_rate": 9.069946344521663e-06, + "loss": 0.2526, + "step": 65200 + }, + { + "epoch": 2.8986087033826733, + "grad_norm": 0.1399773806333542, + "learning_rate": 9.058078703515598e-06, + "loss": 0.2609, + "step": 65210 + }, + { + "epoch": 2.8990532070942794, + "grad_norm": 0.12832684814929962, + "learning_rate": 9.046218058364125e-06, + "loss": 0.2562, + "step": 65220 + }, + { + "epoch": 2.8994977108058855, + "grad_norm": 0.1354331523180008, + "learning_rate": 9.034364411093893e-06, + "loss": 0.2556, + "step": 65230 + }, + { + "epoch": 2.899942214517491, + "grad_norm": 0.1385434865951538, + "learning_rate": 9.022517763730371e-06, + "loss": 0.254, + "step": 65240 + }, + { + "epoch": 2.900386718229097, + "grad_norm": 0.1294080764055252, + "learning_rate": 9.010678118297827e-06, + "loss": 0.254, + "step": 65250 + }, + { + "epoch": 2.9008312219407033, + "grad_norm": 0.11572139710187912, + "learning_rate": 8.998845476819345e-06, + "loss": 0.2554, + "step": 65260 + }, + { + "epoch": 2.901275725652309, + "grad_norm": 0.12105385214090347, + "learning_rate": 8.987019841316773e-06, + "loss": 0.255, + "step": 65270 + }, + { + "epoch": 2.901720229363915, + "grad_norm": 0.12836311757564545, + "learning_rate": 8.975201213810802e-06, + "loss": 0.2555, + "step": 65280 + }, + { + "epoch": 2.902164733075521, + "grad_norm": 0.11660671979188919, + "learning_rate": 8.963389596320915e-06, + "loss": 0.2539, + "step": 65290 + }, + { + "epoch": 2.9026092367871272, + "grad_norm": 0.11098209768533707, + "learning_rate": 8.951584990865391e-06, + "loss": 0.2571, + "step": 65300 + }, + { + "epoch": 2.9030537404987333, + "grad_norm": 0.1433139592409134, + "learning_rate": 8.939787399461319e-06, + "loss": 0.2537, + "step": 65310 + }, + { + "epoch": 2.903498244210339, + "grad_norm": 0.10594891756772995, + "learning_rate": 8.927996824124591e-06, + "loss": 0.2547, + "step": 65320 + }, + { + "epoch": 2.903942747921945, + "grad_norm": 0.11232805252075195, + "learning_rate": 8.916213266869854e-06, + "loss": 0.2542, + "step": 65330 + }, + { + "epoch": 2.904387251633551, + "grad_norm": 0.14360898733139038, + "learning_rate": 8.904436729710658e-06, + "loss": 0.2545, + "step": 65340 + }, + { + "epoch": 2.9048317553451573, + "grad_norm": 0.1579005867242813, + "learning_rate": 8.892667214659245e-06, + "loss": 0.2563, + "step": 65350 + }, + { + "epoch": 2.9052762590567633, + "grad_norm": 0.11543988436460495, + "learning_rate": 8.880904723726713e-06, + "loss": 0.2554, + "step": 65360 + }, + { + "epoch": 2.905720762768369, + "grad_norm": 0.1063874214887619, + "learning_rate": 8.869149258922971e-06, + "loss": 0.2562, + "step": 65370 + }, + { + "epoch": 2.906165266479975, + "grad_norm": 0.11995133012533188, + "learning_rate": 8.857400822256662e-06, + "loss": 0.2567, + "step": 65380 + }, + { + "epoch": 2.906609770191581, + "grad_norm": 0.13027019798755646, + "learning_rate": 8.845659415735324e-06, + "loss": 0.2534, + "step": 65390 + }, + { + "epoch": 2.907054273903187, + "grad_norm": 0.13626354932785034, + "learning_rate": 8.83392504136521e-06, + "loss": 0.2554, + "step": 65400 + }, + { + "epoch": 2.907498777614793, + "grad_norm": 0.12238729000091553, + "learning_rate": 8.822197701151407e-06, + "loss": 0.2537, + "step": 65410 + }, + { + "epoch": 2.907943281326399, + "grad_norm": 0.1267772912979126, + "learning_rate": 8.810477397097804e-06, + "loss": 0.2545, + "step": 65420 + }, + { + "epoch": 2.908387785038005, + "grad_norm": 0.13642458617687225, + "learning_rate": 8.798764131207077e-06, + "loss": 0.2543, + "step": 65430 + }, + { + "epoch": 2.908832288749611, + "grad_norm": 0.12769323587417603, + "learning_rate": 8.787057905480706e-06, + "loss": 0.2554, + "step": 65440 + }, + { + "epoch": 2.909276792461217, + "grad_norm": 0.13594792783260345, + "learning_rate": 8.775358721918958e-06, + "loss": 0.2542, + "step": 65450 + }, + { + "epoch": 2.909721296172823, + "grad_norm": 0.1318831443786621, + "learning_rate": 8.763666582520923e-06, + "loss": 0.2544, + "step": 65460 + }, + { + "epoch": 2.910165799884429, + "grad_norm": 0.13133619725704193, + "learning_rate": 8.751981489284445e-06, + "loss": 0.2563, + "step": 65470 + }, + { + "epoch": 2.910610303596035, + "grad_norm": 0.12629421055316925, + "learning_rate": 8.740303444206188e-06, + "loss": 0.2548, + "step": 65480 + }, + { + "epoch": 2.911054807307641, + "grad_norm": 0.11131679266691208, + "learning_rate": 8.72863244928162e-06, + "loss": 0.2579, + "step": 65490 + }, + { + "epoch": 2.911499311019247, + "grad_norm": 0.29068121314048767, + "learning_rate": 8.716968506504991e-06, + "loss": 0.2568, + "step": 65500 + }, + { + "epoch": 2.911943814730853, + "grad_norm": 0.14686283469200134, + "learning_rate": 8.70531161786935e-06, + "loss": 0.2565, + "step": 65510 + }, + { + "epoch": 2.912388318442459, + "grad_norm": 0.1359931230545044, + "learning_rate": 8.693661785366558e-06, + "loss": 0.2548, + "step": 65520 + }, + { + "epoch": 2.912832822154065, + "grad_norm": 0.15048865973949432, + "learning_rate": 8.682019010987208e-06, + "loss": 0.2579, + "step": 65530 + }, + { + "epoch": 2.9132773258656712, + "grad_norm": 0.1650935709476471, + "learning_rate": 8.670383296720786e-06, + "loss": 0.2589, + "step": 65540 + }, + { + "epoch": 2.913721829577277, + "grad_norm": 0.1306859403848648, + "learning_rate": 8.658754644555478e-06, + "loss": 0.2538, + "step": 65550 + }, + { + "epoch": 2.914166333288883, + "grad_norm": 0.11886272579431534, + "learning_rate": 8.647133056478313e-06, + "loss": 0.2517, + "step": 65560 + }, + { + "epoch": 2.914610837000489, + "grad_norm": 0.12502621114253998, + "learning_rate": 8.635518534475123e-06, + "loss": 0.256, + "step": 65570 + }, + { + "epoch": 2.9150553407120947, + "grad_norm": 0.12658698856830597, + "learning_rate": 8.623911080530467e-06, + "loss": 0.2575, + "step": 65580 + }, + { + "epoch": 2.915499844423701, + "grad_norm": 0.12086085975170135, + "learning_rate": 8.61231069662779e-06, + "loss": 0.2561, + "step": 65590 + }, + { + "epoch": 2.915944348135307, + "grad_norm": 0.12812243402004242, + "learning_rate": 8.600717384749252e-06, + "loss": 0.2565, + "step": 65600 + }, + { + "epoch": 2.916388851846913, + "grad_norm": 0.14209912717342377, + "learning_rate": 8.58913114687584e-06, + "loss": 0.2563, + "step": 65610 + }, + { + "epoch": 2.916833355558519, + "grad_norm": 0.14739947021007538, + "learning_rate": 8.57755198498732e-06, + "loss": 0.2556, + "step": 65620 + }, + { + "epoch": 2.9172778592701247, + "grad_norm": 0.1626635044813156, + "learning_rate": 8.565979901062265e-06, + "loss": 0.2565, + "step": 65630 + }, + { + "epoch": 2.917722362981731, + "grad_norm": 0.14075128734111786, + "learning_rate": 8.554414897078033e-06, + "loss": 0.2533, + "step": 65640 + }, + { + "epoch": 2.918166866693337, + "grad_norm": 0.13574865460395813, + "learning_rate": 8.542856975010727e-06, + "loss": 0.2536, + "step": 65650 + }, + { + "epoch": 2.918611370404943, + "grad_norm": 0.12016942352056503, + "learning_rate": 8.531306136835337e-06, + "loss": 0.2547, + "step": 65660 + }, + { + "epoch": 2.919055874116549, + "grad_norm": 0.1165909469127655, + "learning_rate": 8.519762384525548e-06, + "loss": 0.2556, + "step": 65670 + }, + { + "epoch": 2.9195003778281547, + "grad_norm": 0.1124756932258606, + "learning_rate": 8.508225720053875e-06, + "loss": 0.2543, + "step": 65680 + }, + { + "epoch": 2.919944881539761, + "grad_norm": 0.14473558962345123, + "learning_rate": 8.496696145391625e-06, + "loss": 0.2532, + "step": 65690 + }, + { + "epoch": 2.920389385251367, + "grad_norm": 0.12637020647525787, + "learning_rate": 8.485173662508889e-06, + "loss": 0.2581, + "step": 65700 + }, + { + "epoch": 2.9208338889629726, + "grad_norm": 0.12277226150035858, + "learning_rate": 8.473658273374536e-06, + "loss": 0.2527, + "step": 65710 + }, + { + "epoch": 2.9212783926745787, + "grad_norm": 0.12479597330093384, + "learning_rate": 8.462149979956253e-06, + "loss": 0.2549, + "step": 65720 + }, + { + "epoch": 2.9217228963861848, + "grad_norm": 0.13191115856170654, + "learning_rate": 8.450648784220461e-06, + "loss": 0.2553, + "step": 65730 + }, + { + "epoch": 2.922167400097791, + "grad_norm": 0.12646447122097015, + "learning_rate": 8.439154688132417e-06, + "loss": 0.2547, + "step": 65740 + }, + { + "epoch": 2.922611903809397, + "grad_norm": 0.1147458627820015, + "learning_rate": 8.427667693656143e-06, + "loss": 0.257, + "step": 65750 + }, + { + "epoch": 2.9230564075210026, + "grad_norm": 0.1267818957567215, + "learning_rate": 8.416187802754454e-06, + "loss": 0.257, + "step": 65760 + }, + { + "epoch": 2.9235009112326087, + "grad_norm": 0.14576652646064758, + "learning_rate": 8.404715017388965e-06, + "loss": 0.2575, + "step": 65770 + }, + { + "epoch": 2.923945414944215, + "grad_norm": 0.12492913007736206, + "learning_rate": 8.393249339520015e-06, + "loss": 0.2554, + "step": 65780 + }, + { + "epoch": 2.924389918655821, + "grad_norm": 0.1118214800953865, + "learning_rate": 8.381790771106834e-06, + "loss": 0.2581, + "step": 65790 + }, + { + "epoch": 2.924834422367427, + "grad_norm": 0.12588588893413544, + "learning_rate": 8.370339314107339e-06, + "loss": 0.2545, + "step": 65800 + }, + { + "epoch": 2.9252789260790326, + "grad_norm": 0.1227419450879097, + "learning_rate": 8.358894970478281e-06, + "loss": 0.2554, + "step": 65810 + }, + { + "epoch": 2.9257234297906387, + "grad_norm": 0.11504572629928589, + "learning_rate": 8.347457742175196e-06, + "loss": 0.2556, + "step": 65820 + }, + { + "epoch": 2.926167933502245, + "grad_norm": 0.12418263405561447, + "learning_rate": 8.33602763115236e-06, + "loss": 0.2566, + "step": 65830 + }, + { + "epoch": 2.926612437213851, + "grad_norm": 0.13952277600765228, + "learning_rate": 8.324604639362916e-06, + "loss": 0.2547, + "step": 65840 + }, + { + "epoch": 2.927056940925457, + "grad_norm": 0.10669867694377899, + "learning_rate": 8.31318876875869e-06, + "loss": 0.2534, + "step": 65850 + }, + { + "epoch": 2.9275014446370626, + "grad_norm": 0.122939832508564, + "learning_rate": 8.30178002129039e-06, + "loss": 0.2542, + "step": 65860 + }, + { + "epoch": 2.9279459483486687, + "grad_norm": 0.10983947664499283, + "learning_rate": 8.290378398907423e-06, + "loss": 0.2547, + "step": 65870 + }, + { + "epoch": 2.928390452060275, + "grad_norm": 0.11875589936971664, + "learning_rate": 8.278983903558029e-06, + "loss": 0.2528, + "step": 65880 + }, + { + "epoch": 2.9288349557718805, + "grad_norm": 0.12858641147613525, + "learning_rate": 8.26759653718921e-06, + "loss": 0.2557, + "step": 65890 + }, + { + "epoch": 2.9292794594834866, + "grad_norm": 0.1316404938697815, + "learning_rate": 8.25621630174676e-06, + "loss": 0.2549, + "step": 65900 + }, + { + "epoch": 2.9297239631950927, + "grad_norm": 0.12947162985801697, + "learning_rate": 8.244843199175261e-06, + "loss": 0.2547, + "step": 65910 + }, + { + "epoch": 2.9301684669066987, + "grad_norm": 0.13515686988830566, + "learning_rate": 8.233477231418046e-06, + "loss": 0.2557, + "step": 65920 + }, + { + "epoch": 2.930612970618305, + "grad_norm": 0.11511726677417755, + "learning_rate": 8.22211840041725e-06, + "loss": 0.2562, + "step": 65930 + }, + { + "epoch": 2.9310574743299105, + "grad_norm": 0.132174551486969, + "learning_rate": 8.210766708113792e-06, + "loss": 0.2564, + "step": 65940 + }, + { + "epoch": 2.9315019780415166, + "grad_norm": 0.1300167292356491, + "learning_rate": 8.199422156447367e-06, + "loss": 0.2553, + "step": 65950 + }, + { + "epoch": 2.9319464817531227, + "grad_norm": 0.1129041463136673, + "learning_rate": 8.188084747356451e-06, + "loss": 0.2547, + "step": 65960 + }, + { + "epoch": 2.9323909854647288, + "grad_norm": 0.13543981313705444, + "learning_rate": 8.176754482778299e-06, + "loss": 0.2553, + "step": 65970 + }, + { + "epoch": 2.932835489176335, + "grad_norm": 0.12381604313850403, + "learning_rate": 8.165431364648918e-06, + "loss": 0.2547, + "step": 65980 + }, + { + "epoch": 2.9332799928879405, + "grad_norm": 0.13258571922779083, + "learning_rate": 8.154115394903162e-06, + "loss": 0.2548, + "step": 65990 + }, + { + "epoch": 2.9337244965995466, + "grad_norm": 0.125703826546669, + "learning_rate": 8.142806575474582e-06, + "loss": 0.2551, + "step": 66000 + }, + { + "epoch": 2.9341690003111527, + "grad_norm": 0.15147016942501068, + "learning_rate": 8.131504908295562e-06, + "loss": 0.2519, + "step": 66010 + }, + { + "epoch": 2.9346135040227583, + "grad_norm": 0.15233825147151947, + "learning_rate": 8.120210395297262e-06, + "loss": 0.2559, + "step": 66020 + }, + { + "epoch": 2.9350580077343644, + "grad_norm": 0.13593322038650513, + "learning_rate": 8.108923038409565e-06, + "loss": 0.2569, + "step": 66030 + }, + { + "epoch": 2.9355025114459705, + "grad_norm": 0.1286202073097229, + "learning_rate": 8.097642839561226e-06, + "loss": 0.2566, + "step": 66040 + }, + { + "epoch": 2.9359470151575766, + "grad_norm": 0.15086202323436737, + "learning_rate": 8.08636980067966e-06, + "loss": 0.2543, + "step": 66050 + }, + { + "epoch": 2.9363915188691827, + "grad_norm": 0.1175115704536438, + "learning_rate": 8.075103923691186e-06, + "loss": 0.2537, + "step": 66060 + }, + { + "epoch": 2.9368360225807884, + "grad_norm": 0.14103546738624573, + "learning_rate": 8.063845210520793e-06, + "loss": 0.2539, + "step": 66070 + }, + { + "epoch": 2.9372805262923944, + "grad_norm": 0.1257592737674713, + "learning_rate": 8.052593663092295e-06, + "loss": 0.2533, + "step": 66080 + }, + { + "epoch": 2.9377250300040005, + "grad_norm": 0.10489121824502945, + "learning_rate": 8.04134928332827e-06, + "loss": 0.2544, + "step": 66090 + }, + { + "epoch": 2.9381695337156066, + "grad_norm": 0.12265139818191528, + "learning_rate": 8.030112073150086e-06, + "loss": 0.2524, + "step": 66100 + }, + { + "epoch": 2.9386140374272127, + "grad_norm": 0.12186766415834427, + "learning_rate": 8.018882034477881e-06, + "loss": 0.2571, + "step": 66110 + }, + { + "epoch": 2.9390585411388184, + "grad_norm": 0.12499325722455978, + "learning_rate": 8.007659169230541e-06, + "loss": 0.254, + "step": 66120 + }, + { + "epoch": 2.9395030448504245, + "grad_norm": 0.1406761258840561, + "learning_rate": 7.996443479325755e-06, + "loss": 0.2528, + "step": 66130 + }, + { + "epoch": 2.9399475485620306, + "grad_norm": 0.10748114436864853, + "learning_rate": 7.985234966679977e-06, + "loss": 0.2529, + "step": 66140 + }, + { + "epoch": 2.9403920522736366, + "grad_norm": 0.12449635565280914, + "learning_rate": 7.974033633208438e-06, + "loss": 0.2545, + "step": 66150 + }, + { + "epoch": 2.9408365559852427, + "grad_norm": 0.1273946464061737, + "learning_rate": 7.962839480825135e-06, + "loss": 0.2567, + "step": 66160 + }, + { + "epoch": 2.9412810596968484, + "grad_norm": 0.12698641419410706, + "learning_rate": 7.951652511442858e-06, + "loss": 0.2535, + "step": 66170 + }, + { + "epoch": 2.9417255634084545, + "grad_norm": 0.13077110052108765, + "learning_rate": 7.940472726973125e-06, + "loss": 0.2544, + "step": 66180 + }, + { + "epoch": 2.9421700671200606, + "grad_norm": 0.11015108227729797, + "learning_rate": 7.929300129326289e-06, + "loss": 0.2553, + "step": 66190 + }, + { + "epoch": 2.942614570831666, + "grad_norm": 0.14561660587787628, + "learning_rate": 7.91813472041142e-06, + "loss": 0.2531, + "step": 66200 + }, + { + "epoch": 2.9430590745432723, + "grad_norm": 0.107585109770298, + "learning_rate": 7.906976502136376e-06, + "loss": 0.2532, + "step": 66210 + }, + { + "epoch": 2.9435035782548784, + "grad_norm": 0.123592309653759, + "learning_rate": 7.89582547640782e-06, + "loss": 0.2527, + "step": 66220 + }, + { + "epoch": 2.9439480819664845, + "grad_norm": 0.14683303236961365, + "learning_rate": 7.884681645131115e-06, + "loss": 0.2556, + "step": 66230 + }, + { + "epoch": 2.9443925856780906, + "grad_norm": 0.12136325240135193, + "learning_rate": 7.87354501021048e-06, + "loss": 0.255, + "step": 66240 + }, + { + "epoch": 2.9448370893896962, + "grad_norm": 0.1343410760164261, + "learning_rate": 7.86241557354882e-06, + "loss": 0.2546, + "step": 66250 + }, + { + "epoch": 2.9452815931013023, + "grad_norm": 0.11763359606266022, + "learning_rate": 7.8512933370479e-06, + "loss": 0.2558, + "step": 66260 + }, + { + "epoch": 2.9457260968129084, + "grad_norm": 0.11935857683420181, + "learning_rate": 7.840178302608158e-06, + "loss": 0.255, + "step": 66270 + }, + { + "epoch": 2.9461706005245145, + "grad_norm": 0.13838264346122742, + "learning_rate": 7.82907047212888e-06, + "loss": 0.2544, + "step": 66280 + }, + { + "epoch": 2.9466151042361206, + "grad_norm": 0.11149413138628006, + "learning_rate": 7.81796984750809e-06, + "loss": 0.2552, + "step": 66290 + }, + { + "epoch": 2.9470596079477263, + "grad_norm": 0.11278802901506424, + "learning_rate": 7.806876430642546e-06, + "loss": 0.2541, + "step": 66300 + }, + { + "epoch": 2.9475041116593323, + "grad_norm": 0.11885835230350494, + "learning_rate": 7.795790223427862e-06, + "loss": 0.2561, + "step": 66310 + }, + { + "epoch": 2.9479486153709384, + "grad_norm": 0.11506672948598862, + "learning_rate": 7.784711227758324e-06, + "loss": 0.2542, + "step": 66320 + }, + { + "epoch": 2.948393119082544, + "grad_norm": 0.1284295916557312, + "learning_rate": 7.773639445527053e-06, + "loss": 0.2522, + "step": 66330 + }, + { + "epoch": 2.94883762279415, + "grad_norm": 0.13058990240097046, + "learning_rate": 7.762574878625905e-06, + "loss": 0.2536, + "step": 66340 + }, + { + "epoch": 2.9492821265057563, + "grad_norm": 0.1364789754152298, + "learning_rate": 7.751517528945513e-06, + "loss": 0.2536, + "step": 66350 + }, + { + "epoch": 2.9497266302173624, + "grad_norm": 0.1406782865524292, + "learning_rate": 7.740467398375278e-06, + "loss": 0.2535, + "step": 66360 + }, + { + "epoch": 2.9501711339289685, + "grad_norm": 0.12830166518688202, + "learning_rate": 7.729424488803378e-06, + "loss": 0.2535, + "step": 66370 + }, + { + "epoch": 2.950615637640574, + "grad_norm": 0.11736436188220978, + "learning_rate": 7.71838880211671e-06, + "loss": 0.2534, + "step": 66380 + }, + { + "epoch": 2.95106014135218, + "grad_norm": 0.12134657800197601, + "learning_rate": 7.707360340200997e-06, + "loss": 0.2563, + "step": 66390 + }, + { + "epoch": 2.9515046450637863, + "grad_norm": 0.11388030648231506, + "learning_rate": 7.696339104940697e-06, + "loss": 0.252, + "step": 66400 + }, + { + "epoch": 2.9519491487753924, + "grad_norm": 0.11965347081422806, + "learning_rate": 7.685325098219038e-06, + "loss": 0.257, + "step": 66410 + }, + { + "epoch": 2.9523936524869985, + "grad_norm": 0.11152241379022598, + "learning_rate": 7.674318321918017e-06, + "loss": 0.2541, + "step": 66420 + }, + { + "epoch": 2.952838156198604, + "grad_norm": 0.11419903486967087, + "learning_rate": 7.663318777918366e-06, + "loss": 0.2541, + "step": 66430 + }, + { + "epoch": 2.95328265991021, + "grad_norm": 0.1244615763425827, + "learning_rate": 7.652326468099647e-06, + "loss": 0.2543, + "step": 66440 + }, + { + "epoch": 2.9537271636218163, + "grad_norm": 0.16405951976776123, + "learning_rate": 7.641341394340096e-06, + "loss": 0.2575, + "step": 66450 + }, + { + "epoch": 2.954171667333422, + "grad_norm": 0.12912924587726593, + "learning_rate": 7.630363558516818e-06, + "loss": 0.2549, + "step": 66460 + }, + { + "epoch": 2.9546161710450285, + "grad_norm": 0.1427520513534546, + "learning_rate": 7.619392962505578e-06, + "loss": 0.2545, + "step": 66470 + }, + { + "epoch": 2.955060674756634, + "grad_norm": 0.1480235904455185, + "learning_rate": 7.6084296081809725e-06, + "loss": 0.2564, + "step": 66480 + }, + { + "epoch": 2.9555051784682402, + "grad_norm": 0.11319751292467117, + "learning_rate": 7.597473497416347e-06, + "loss": 0.2551, + "step": 66490 + }, + { + "epoch": 2.9559496821798463, + "grad_norm": 0.14091086387634277, + "learning_rate": 7.586524632083764e-06, + "loss": 0.2531, + "step": 66500 + }, + { + "epoch": 2.956394185891452, + "grad_norm": 0.12506866455078125, + "learning_rate": 7.5755830140541326e-06, + "loss": 0.2571, + "step": 66510 + }, + { + "epoch": 2.956838689603058, + "grad_norm": 0.14413699507713318, + "learning_rate": 7.5646486451970475e-06, + "loss": 0.2541, + "step": 66520 + }, + { + "epoch": 2.957283193314664, + "grad_norm": 0.11437880992889404, + "learning_rate": 7.553721527380897e-06, + "loss": 0.2536, + "step": 66530 + }, + { + "epoch": 2.9577276970262703, + "grad_norm": 0.12260180711746216, + "learning_rate": 7.542801662472826e-06, + "loss": 0.2541, + "step": 66540 + }, + { + "epoch": 2.9581722007378763, + "grad_norm": 0.11290666460990906, + "learning_rate": 7.5318890523387475e-06, + "loss": 0.253, + "step": 66550 + }, + { + "epoch": 2.958616704449482, + "grad_norm": 0.1372634321451187, + "learning_rate": 7.52098369884332e-06, + "loss": 0.2554, + "step": 66560 + }, + { + "epoch": 2.959061208161088, + "grad_norm": 0.13005247712135315, + "learning_rate": 7.510085603849992e-06, + "loss": 0.2528, + "step": 66570 + }, + { + "epoch": 2.959505711872694, + "grad_norm": 0.14337925612926483, + "learning_rate": 7.499194769220918e-06, + "loss": 0.2568, + "step": 66580 + }, + { + "epoch": 2.9599502155843003, + "grad_norm": 0.12800821661949158, + "learning_rate": 7.48831119681706e-06, + "loss": 0.2507, + "step": 66590 + }, + { + "epoch": 2.9603947192959064, + "grad_norm": 0.13945363461971283, + "learning_rate": 7.477434888498119e-06, + "loss": 0.2547, + "step": 66600 + }, + { + "epoch": 2.960839223007512, + "grad_norm": 0.12036070972681046, + "learning_rate": 7.466565846122564e-06, + "loss": 0.2549, + "step": 66610 + }, + { + "epoch": 2.961283726719118, + "grad_norm": 0.12913164496421814, + "learning_rate": 7.455704071547626e-06, + "loss": 0.258, + "step": 66620 + }, + { + "epoch": 2.961728230430724, + "grad_norm": 0.12361360341310501, + "learning_rate": 7.444849566629247e-06, + "loss": 0.2573, + "step": 66630 + }, + { + "epoch": 2.96217273414233, + "grad_norm": 0.1224798932671547, + "learning_rate": 7.434002333222212e-06, + "loss": 0.2538, + "step": 66640 + }, + { + "epoch": 2.962617237853936, + "grad_norm": 0.1302109956741333, + "learning_rate": 7.423162373179976e-06, + "loss": 0.2558, + "step": 66650 + }, + { + "epoch": 2.963061741565542, + "grad_norm": 0.11478939652442932, + "learning_rate": 7.412329688354835e-06, + "loss": 0.255, + "step": 66660 + }, + { + "epoch": 2.963506245277148, + "grad_norm": 0.12079174071550369, + "learning_rate": 7.40150428059776e-06, + "loss": 0.254, + "step": 66670 + }, + { + "epoch": 2.963950748988754, + "grad_norm": 0.12906880676746368, + "learning_rate": 7.3906861517585354e-06, + "loss": 0.2576, + "step": 66680 + }, + { + "epoch": 2.96439525270036, + "grad_norm": 0.12887616455554962, + "learning_rate": 7.37987530368569e-06, + "loss": 0.2562, + "step": 66690 + }, + { + "epoch": 2.964839756411966, + "grad_norm": 0.13222381472587585, + "learning_rate": 7.369071738226474e-06, + "loss": 0.2532, + "step": 66700 + }, + { + "epoch": 2.965284260123572, + "grad_norm": 0.13705749809741974, + "learning_rate": 7.358275457226954e-06, + "loss": 0.254, + "step": 66710 + }, + { + "epoch": 2.965728763835178, + "grad_norm": 0.13452938199043274, + "learning_rate": 7.347486462531899e-06, + "loss": 0.254, + "step": 66720 + }, + { + "epoch": 2.9661732675467842, + "grad_norm": 0.11057136207818985, + "learning_rate": 7.336704755984858e-06, + "loss": 0.2553, + "step": 66730 + }, + { + "epoch": 2.96661777125839, + "grad_norm": 0.14263343811035156, + "learning_rate": 7.325930339428133e-06, + "loss": 0.2546, + "step": 66740 + }, + { + "epoch": 2.967062274969996, + "grad_norm": 0.12662076950073242, + "learning_rate": 7.315163214702769e-06, + "loss": 0.2527, + "step": 66750 + }, + { + "epoch": 2.967506778681602, + "grad_norm": 0.11936774849891663, + "learning_rate": 7.304403383648595e-06, + "loss": 0.2555, + "step": 66760 + }, + { + "epoch": 2.9679512823932077, + "grad_norm": 0.164521262049675, + "learning_rate": 7.293650848104139e-06, + "loss": 0.2546, + "step": 66770 + }, + { + "epoch": 2.9683957861048142, + "grad_norm": 0.11191209405660629, + "learning_rate": 7.2829056099067374e-06, + "loss": 0.2536, + "step": 66780 + }, + { + "epoch": 2.96884028981642, + "grad_norm": 0.1286129653453827, + "learning_rate": 7.2721676708924494e-06, + "loss": 0.2534, + "step": 66790 + }, + { + "epoch": 2.969284793528026, + "grad_norm": 0.12009326368570328, + "learning_rate": 7.261437032896096e-06, + "loss": 0.2551, + "step": 66800 + }, + { + "epoch": 2.969729297239632, + "grad_norm": 0.1433972418308258, + "learning_rate": 7.250713697751255e-06, + "loss": 0.2549, + "step": 66810 + }, + { + "epoch": 2.9701738009512377, + "grad_norm": 0.1374596804380417, + "learning_rate": 7.239997667290255e-06, + "loss": 0.2537, + "step": 66820 + }, + { + "epoch": 2.970618304662844, + "grad_norm": 0.13723629713058472, + "learning_rate": 7.2292889433441425e-06, + "loss": 0.2516, + "step": 66830 + }, + { + "epoch": 2.97106280837445, + "grad_norm": 0.13434067368507385, + "learning_rate": 7.218587527742793e-06, + "loss": 0.2544, + "step": 66840 + }, + { + "epoch": 2.971507312086056, + "grad_norm": 0.11125201731920242, + "learning_rate": 7.207893422314749e-06, + "loss": 0.2539, + "step": 66850 + }, + { + "epoch": 2.971951815797662, + "grad_norm": 0.13448093831539154, + "learning_rate": 7.1972066288873545e-06, + "loss": 0.257, + "step": 66860 + }, + { + "epoch": 2.9723963195092677, + "grad_norm": 0.13289141654968262, + "learning_rate": 7.186527149286687e-06, + "loss": 0.2552, + "step": 66870 + }, + { + "epoch": 2.972840823220874, + "grad_norm": 0.13031752407550812, + "learning_rate": 7.175854985337576e-06, + "loss": 0.2564, + "step": 66880 + }, + { + "epoch": 2.97328532693248, + "grad_norm": 0.1439138799905777, + "learning_rate": 7.16519013886362e-06, + "loss": 0.2533, + "step": 66890 + }, + { + "epoch": 2.973729830644086, + "grad_norm": 0.13129499554634094, + "learning_rate": 7.154532611687109e-06, + "loss": 0.2559, + "step": 66900 + }, + { + "epoch": 2.974174334355692, + "grad_norm": 0.11132361739873886, + "learning_rate": 7.143882405629177e-06, + "loss": 0.2557, + "step": 66910 + }, + { + "epoch": 2.9746188380672978, + "grad_norm": 0.13519886136054993, + "learning_rate": 7.133239522509605e-06, + "loss": 0.2532, + "step": 66920 + }, + { + "epoch": 2.975063341778904, + "grad_norm": 0.12194839864969254, + "learning_rate": 7.1226039641469956e-06, + "loss": 0.2565, + "step": 66930 + }, + { + "epoch": 2.97550784549051, + "grad_norm": 0.12855865061283112, + "learning_rate": 7.111975732358678e-06, + "loss": 0.2543, + "step": 66940 + }, + { + "epoch": 2.9759523492021156, + "grad_norm": 0.11373861879110336, + "learning_rate": 7.101354828960693e-06, + "loss": 0.2559, + "step": 66950 + }, + { + "epoch": 2.9763968529137217, + "grad_norm": 0.13297098875045776, + "learning_rate": 7.090741255767918e-06, + "loss": 0.2535, + "step": 66960 + }, + { + "epoch": 2.976841356625328, + "grad_norm": 0.11600934714078903, + "learning_rate": 7.080135014593875e-06, + "loss": 0.2535, + "step": 66970 + }, + { + "epoch": 2.977285860336934, + "grad_norm": 0.1347757875919342, + "learning_rate": 7.069536107250896e-06, + "loss": 0.2544, + "step": 66980 + }, + { + "epoch": 2.97773036404854, + "grad_norm": 0.1313946396112442, + "learning_rate": 7.058944535550049e-06, + "loss": 0.255, + "step": 66990 + }, + { + "epoch": 2.9781748677601456, + "grad_norm": 0.12537269294261932, + "learning_rate": 7.048360301301138e-06, + "loss": 0.2535, + "step": 67000 + }, + { + "epoch": 2.9786193714717517, + "grad_norm": 0.12216829508543015, + "learning_rate": 7.03778340631272e-06, + "loss": 0.2556, + "step": 67010 + }, + { + "epoch": 2.979063875183358, + "grad_norm": 0.12145327031612396, + "learning_rate": 7.0272138523921e-06, + "loss": 0.2549, + "step": 67020 + }, + { + "epoch": 2.979508378894964, + "grad_norm": 0.12886770069599152, + "learning_rate": 7.016651641345334e-06, + "loss": 0.2554, + "step": 67030 + }, + { + "epoch": 2.97995288260657, + "grad_norm": 0.12534232437610626, + "learning_rate": 7.0060967749772e-06, + "loss": 0.256, + "step": 67040 + }, + { + "epoch": 2.9803973863181756, + "grad_norm": 0.12466501444578171, + "learning_rate": 6.995549255091238e-06, + "loss": 0.2557, + "step": 67050 + }, + { + "epoch": 2.9808418900297817, + "grad_norm": 0.1438024640083313, + "learning_rate": 6.98500908348973e-06, + "loss": 0.2538, + "step": 67060 + }, + { + "epoch": 2.981286393741388, + "grad_norm": 0.11621192842721939, + "learning_rate": 6.974476261973711e-06, + "loss": 0.2553, + "step": 67070 + }, + { + "epoch": 2.9817308974529935, + "grad_norm": 0.10840819031000137, + "learning_rate": 6.963950792342949e-06, + "loss": 0.2533, + "step": 67080 + }, + { + "epoch": 2.9821754011645996, + "grad_norm": 0.13115933537483215, + "learning_rate": 6.9534326763959715e-06, + "loss": 0.2527, + "step": 67090 + }, + { + "epoch": 2.9826199048762057, + "grad_norm": 0.11881382763385773, + "learning_rate": 6.942921915929995e-06, + "loss": 0.2564, + "step": 67100 + }, + { + "epoch": 2.9830644085878117, + "grad_norm": 0.13605481386184692, + "learning_rate": 6.9324185127410735e-06, + "loss": 0.2519, + "step": 67110 + }, + { + "epoch": 2.983508912299418, + "grad_norm": 0.1293807327747345, + "learning_rate": 6.921922468623921e-06, + "loss": 0.2586, + "step": 67120 + }, + { + "epoch": 2.9839534160110235, + "grad_norm": 0.10642734169960022, + "learning_rate": 6.911433785372023e-06, + "loss": 0.2566, + "step": 67130 + }, + { + "epoch": 2.9843979197226296, + "grad_norm": 0.11783964931964874, + "learning_rate": 6.900952464777632e-06, + "loss": 0.2531, + "step": 67140 + }, + { + "epoch": 2.9848424234342357, + "grad_norm": 0.10917425900697708, + "learning_rate": 6.8904785086316815e-06, + "loss": 0.2545, + "step": 67150 + }, + { + "epoch": 2.9852869271458418, + "grad_norm": 0.13425055146217346, + "learning_rate": 6.880011918723927e-06, + "loss": 0.2579, + "step": 67160 + }, + { + "epoch": 2.985731430857448, + "grad_norm": 0.12652823328971863, + "learning_rate": 6.86955269684279e-06, + "loss": 0.2547, + "step": 67170 + }, + { + "epoch": 2.9861759345690535, + "grad_norm": 0.11734902113676071, + "learning_rate": 6.859100844775473e-06, + "loss": 0.2558, + "step": 67180 + }, + { + "epoch": 2.9866204382806596, + "grad_norm": 0.1265009641647339, + "learning_rate": 6.84865636430792e-06, + "loss": 0.2551, + "step": 67190 + }, + { + "epoch": 2.9870649419922657, + "grad_norm": 0.11433050781488419, + "learning_rate": 6.838219257224804e-06, + "loss": 0.2544, + "step": 67200 + }, + { + "epoch": 2.987509445703872, + "grad_norm": 0.10924949496984482, + "learning_rate": 6.827789525309536e-06, + "loss": 0.2527, + "step": 67210 + }, + { + "epoch": 2.987953949415478, + "grad_norm": 0.13508202135562897, + "learning_rate": 6.81736717034428e-06, + "loss": 0.2549, + "step": 67220 + }, + { + "epoch": 2.9883984531270835, + "grad_norm": 0.11418396979570389, + "learning_rate": 6.806952194109933e-06, + "loss": 0.2557, + "step": 67230 + }, + { + "epoch": 2.9888429568386896, + "grad_norm": 0.10869921743869781, + "learning_rate": 6.796544598386112e-06, + "loss": 0.2574, + "step": 67240 + }, + { + "epoch": 2.9892874605502957, + "grad_norm": 0.12390830367803574, + "learning_rate": 6.786144384951204e-06, + "loss": 0.2594, + "step": 67250 + }, + { + "epoch": 2.9897319642619014, + "grad_norm": 0.11977105587720871, + "learning_rate": 6.775751555582322e-06, + "loss": 0.2547, + "step": 67260 + }, + { + "epoch": 2.9901764679735074, + "grad_norm": 0.13446494936943054, + "learning_rate": 6.76536611205531e-06, + "loss": 0.254, + "step": 67270 + }, + { + "epoch": 2.9906209716851135, + "grad_norm": 0.13224396109580994, + "learning_rate": 6.754988056144762e-06, + "loss": 0.254, + "step": 67280 + }, + { + "epoch": 2.9910654753967196, + "grad_norm": 0.12224855273962021, + "learning_rate": 6.744617389624014e-06, + "loss": 0.2564, + "step": 67290 + }, + { + "epoch": 2.9915099791083257, + "grad_norm": 0.13363175094127655, + "learning_rate": 6.734254114265087e-06, + "loss": 0.2547, + "step": 67300 + }, + { + "epoch": 2.9919544828199314, + "grad_norm": 0.13687363266944885, + "learning_rate": 6.723898231838843e-06, + "loss": 0.254, + "step": 67310 + }, + { + "epoch": 2.9923989865315375, + "grad_norm": 0.13875459134578705, + "learning_rate": 6.713549744114766e-06, + "loss": 0.2549, + "step": 67320 + }, + { + "epoch": 2.9928434902431436, + "grad_norm": 0.1416965276002884, + "learning_rate": 6.703208652861159e-06, + "loss": 0.2555, + "step": 67330 + }, + { + "epoch": 2.9932879939547496, + "grad_norm": 0.1377587616443634, + "learning_rate": 6.69287495984503e-06, + "loss": 0.2541, + "step": 67340 + }, + { + "epoch": 2.9937324976663557, + "grad_norm": 0.1192009374499321, + "learning_rate": 6.6825486668321e-06, + "loss": 0.2534, + "step": 67350 + }, + { + "epoch": 2.9941770013779614, + "grad_norm": 0.1379418820142746, + "learning_rate": 6.672229775586886e-06, + "loss": 0.2548, + "step": 67360 + }, + { + "epoch": 2.9946215050895675, + "grad_norm": 0.1381494551897049, + "learning_rate": 6.661918287872576e-06, + "loss": 0.2552, + "step": 67370 + }, + { + "epoch": 2.9950660088011736, + "grad_norm": 0.12685216963291168, + "learning_rate": 6.6516142054511346e-06, + "loss": 0.2541, + "step": 67380 + }, + { + "epoch": 2.9955105125127792, + "grad_norm": 0.13351422548294067, + "learning_rate": 6.641317530083241e-06, + "loss": 0.2567, + "step": 67390 + }, + { + "epoch": 2.9959550162243853, + "grad_norm": 0.12326706200838089, + "learning_rate": 6.631028263528322e-06, + "loss": 0.2596, + "step": 67400 + }, + { + "epoch": 2.9963995199359914, + "grad_norm": 0.12171271443367004, + "learning_rate": 6.620746407544537e-06, + "loss": 0.2551, + "step": 67410 + }, + { + "epoch": 2.9968440236475975, + "grad_norm": 0.1210649386048317, + "learning_rate": 6.610471963888742e-06, + "loss": 0.2534, + "step": 67420 + }, + { + "epoch": 2.9972885273592036, + "grad_norm": 0.13069969415664673, + "learning_rate": 6.600204934316606e-06, + "loss": 0.2566, + "step": 67430 + }, + { + "epoch": 2.9977330310708092, + "grad_norm": 0.11453137546777725, + "learning_rate": 6.589945320582452e-06, + "loss": 0.2571, + "step": 67440 + }, + { + "epoch": 2.9981775347824153, + "grad_norm": 0.1335763782262802, + "learning_rate": 6.579693124439374e-06, + "loss": 0.2547, + "step": 67450 + }, + { + "epoch": 2.9986220384940214, + "grad_norm": 0.13784538209438324, + "learning_rate": 6.56944834763919e-06, + "loss": 0.2553, + "step": 67460 + }, + { + "epoch": 2.9990665422056275, + "grad_norm": 0.11695612967014313, + "learning_rate": 6.5592109919324575e-06, + "loss": 0.2546, + "step": 67470 + }, + { + "epoch": 2.9995110459172336, + "grad_norm": 0.1324315220117569, + "learning_rate": 6.54898105906846e-06, + "loss": 0.2564, + "step": 67480 + }, + { + "epoch": 2.9999555496288393, + "grad_norm": 0.1236993595957756, + "learning_rate": 6.5387585507952155e-06, + "loss": 0.2571, + "step": 67490 + }, + { + "epoch": 3.0004000533404453, + "grad_norm": 0.1386001855134964, + "learning_rate": 6.528543468859461e-06, + "loss": 0.253, + "step": 67500 + }, + { + "epoch": 3.0008445570520514, + "grad_norm": 0.13031339645385742, + "learning_rate": 6.518335815006682e-06, + "loss": 0.2539, + "step": 67510 + }, + { + "epoch": 3.0012890607636575, + "grad_norm": 0.14157716929912567, + "learning_rate": 6.5081355909810845e-06, + "loss": 0.2541, + "step": 67520 + }, + { + "epoch": 3.001733564475263, + "grad_norm": 0.13019245862960815, + "learning_rate": 6.497942798525608e-06, + "loss": 0.2537, + "step": 67530 + }, + { + "epoch": 3.0021780681868693, + "grad_norm": 0.13318435847759247, + "learning_rate": 6.487757439381936e-06, + "loss": 0.2521, + "step": 67540 + }, + { + "epoch": 3.0026225718984754, + "grad_norm": 0.13355615735054016, + "learning_rate": 6.477579515290433e-06, + "loss": 0.2536, + "step": 67550 + }, + { + "epoch": 3.0030670756100815, + "grad_norm": 0.11758948117494583, + "learning_rate": 6.467409027990273e-06, + "loss": 0.2522, + "step": 67560 + }, + { + "epoch": 3.0035115793216876, + "grad_norm": 0.13700301945209503, + "learning_rate": 6.457245979219279e-06, + "loss": 0.2548, + "step": 67570 + }, + { + "epoch": 3.003956083033293, + "grad_norm": 0.10595180839300156, + "learning_rate": 6.447090370714054e-06, + "loss": 0.2554, + "step": 67580 + }, + { + "epoch": 3.0044005867448993, + "grad_norm": 0.15505893528461456, + "learning_rate": 6.436942204209917e-06, + "loss": 0.2539, + "step": 67590 + }, + { + "epoch": 3.0048450904565054, + "grad_norm": 0.13128472864627838, + "learning_rate": 6.4268014814408804e-06, + "loss": 0.2546, + "step": 67600 + }, + { + "epoch": 3.0052895941681115, + "grad_norm": 0.14727279543876648, + "learning_rate": 6.41666820413977e-06, + "loss": 0.2541, + "step": 67610 + }, + { + "epoch": 3.005734097879717, + "grad_norm": 0.11496095359325409, + "learning_rate": 6.406542374038033e-06, + "loss": 0.2546, + "step": 67620 + }, + { + "epoch": 3.006178601591323, + "grad_norm": 0.10409387201070786, + "learning_rate": 6.396423992865935e-06, + "loss": 0.2545, + "step": 67630 + }, + { + "epoch": 3.0066231053029293, + "grad_norm": 0.12473950535058975, + "learning_rate": 6.386313062352412e-06, + "loss": 0.2562, + "step": 67640 + }, + { + "epoch": 3.0070676090145354, + "grad_norm": 0.10233323276042938, + "learning_rate": 6.376209584225152e-06, + "loss": 0.2559, + "step": 67650 + }, + { + "epoch": 3.007512112726141, + "grad_norm": 0.11382661759853363, + "learning_rate": 6.366113560210557e-06, + "loss": 0.2544, + "step": 67660 + }, + { + "epoch": 3.007956616437747, + "grad_norm": 0.1304425150156021, + "learning_rate": 6.356024992033766e-06, + "loss": 0.2553, + "step": 67670 + }, + { + "epoch": 3.0084011201493532, + "grad_norm": 0.12395476549863815, + "learning_rate": 6.345943881418648e-06, + "loss": 0.2553, + "step": 67680 + }, + { + "epoch": 3.0088456238609593, + "grad_norm": 0.15424656867980957, + "learning_rate": 6.33587023008777e-06, + "loss": 0.2555, + "step": 67690 + }, + { + "epoch": 3.0092901275725654, + "grad_norm": 0.1137647032737732, + "learning_rate": 6.32580403976245e-06, + "loss": 0.2536, + "step": 67700 + }, + { + "epoch": 3.009734631284171, + "grad_norm": 0.15759536623954773, + "learning_rate": 6.31574531216273e-06, + "loss": 0.2575, + "step": 67710 + }, + { + "epoch": 3.010179134995777, + "grad_norm": 0.11910150945186615, + "learning_rate": 6.305694049007371e-06, + "loss": 0.2515, + "step": 67720 + }, + { + "epoch": 3.0106236387073833, + "grad_norm": 0.12117782235145569, + "learning_rate": 6.2956502520138575e-06, + "loss": 0.258, + "step": 67730 + }, + { + "epoch": 3.0110681424189893, + "grad_norm": 0.10570075362920761, + "learning_rate": 6.285613922898409e-06, + "loss": 0.2572, + "step": 67740 + }, + { + "epoch": 3.011512646130595, + "grad_norm": 0.1209951713681221, + "learning_rate": 6.275585063375927e-06, + "loss": 0.2567, + "step": 67750 + }, + { + "epoch": 3.011957149842201, + "grad_norm": 0.11013645678758621, + "learning_rate": 6.265563675160113e-06, + "loss": 0.2548, + "step": 67760 + }, + { + "epoch": 3.012401653553807, + "grad_norm": 0.12408974021673203, + "learning_rate": 6.25554975996332e-06, + "loss": 0.2559, + "step": 67770 + }, + { + "epoch": 3.0128461572654133, + "grad_norm": 0.11693951487541199, + "learning_rate": 6.245543319496661e-06, + "loss": 0.2536, + "step": 67780 + }, + { + "epoch": 3.0132906609770194, + "grad_norm": 0.1250099092721939, + "learning_rate": 6.2355443554699685e-06, + "loss": 0.2577, + "step": 67790 + }, + { + "epoch": 3.013735164688625, + "grad_norm": 0.13812732696533203, + "learning_rate": 6.225552869591766e-06, + "loss": 0.2557, + "step": 67800 + }, + { + "epoch": 3.014179668400231, + "grad_norm": 0.10973779857158661, + "learning_rate": 6.215568863569365e-06, + "loss": 0.2521, + "step": 67810 + }, + { + "epoch": 3.014624172111837, + "grad_norm": 0.1051807552576065, + "learning_rate": 6.205592339108712e-06, + "loss": 0.2537, + "step": 67820 + }, + { + "epoch": 3.0150686758234433, + "grad_norm": 0.10924773663282394, + "learning_rate": 6.195623297914577e-06, + "loss": 0.256, + "step": 67830 + }, + { + "epoch": 3.015513179535049, + "grad_norm": 0.12231670320034027, + "learning_rate": 6.185661741690357e-06, + "loss": 0.2538, + "step": 67840 + }, + { + "epoch": 3.015957683246655, + "grad_norm": 0.12577685713768005, + "learning_rate": 6.1757076721382145e-06, + "loss": 0.2557, + "step": 67850 + }, + { + "epoch": 3.016402186958261, + "grad_norm": 0.1235257163643837, + "learning_rate": 6.165761090959038e-06, + "loss": 0.2547, + "step": 67860 + }, + { + "epoch": 3.016846690669867, + "grad_norm": 0.13727454841136932, + "learning_rate": 6.155821999852424e-06, + "loss": 0.2545, + "step": 67870 + }, + { + "epoch": 3.017291194381473, + "grad_norm": 0.12024060636758804, + "learning_rate": 6.145890400516696e-06, + "loss": 0.2566, + "step": 67880 + }, + { + "epoch": 3.017735698093079, + "grad_norm": 0.1271916627883911, + "learning_rate": 6.1359662946488816e-06, + "loss": 0.2543, + "step": 67890 + }, + { + "epoch": 3.018180201804685, + "grad_norm": 0.12858454883098602, + "learning_rate": 6.12604968394474e-06, + "loss": 0.2518, + "step": 67900 + }, + { + "epoch": 3.018624705516291, + "grad_norm": 0.15874728560447693, + "learning_rate": 6.11614057009875e-06, + "loss": 0.2528, + "step": 67910 + }, + { + "epoch": 3.0190692092278972, + "grad_norm": 0.14538078010082245, + "learning_rate": 6.106238954804111e-06, + "loss": 0.2542, + "step": 67920 + }, + { + "epoch": 3.019513712939503, + "grad_norm": 0.1335713118314743, + "learning_rate": 6.096344839752738e-06, + "loss": 0.2546, + "step": 67930 + }, + { + "epoch": 3.019958216651109, + "grad_norm": 0.11980660259723663, + "learning_rate": 6.086458226635278e-06, + "loss": 0.2544, + "step": 67940 + }, + { + "epoch": 3.020402720362715, + "grad_norm": 0.12411519140005112, + "learning_rate": 6.076579117141046e-06, + "loss": 0.2586, + "step": 67950 + }, + { + "epoch": 3.020847224074321, + "grad_norm": 0.13661377131938934, + "learning_rate": 6.066707512958153e-06, + "loss": 0.2512, + "step": 67960 + }, + { + "epoch": 3.021291727785927, + "grad_norm": 0.10260142385959625, + "learning_rate": 6.056843415773361e-06, + "loss": 0.2541, + "step": 67970 + }, + { + "epoch": 3.021736231497533, + "grad_norm": 0.12286768108606339, + "learning_rate": 6.0469868272721776e-06, + "loss": 0.2534, + "step": 67980 + }, + { + "epoch": 3.022180735209139, + "grad_norm": 0.11114879697561264, + "learning_rate": 6.037137749138844e-06, + "loss": 0.2542, + "step": 67990 + }, + { + "epoch": 3.022625238920745, + "grad_norm": 0.11141318827867508, + "learning_rate": 6.027296183056252e-06, + "loss": 0.2548, + "step": 68000 + }, + { + "epoch": 3.023069742632351, + "grad_norm": 0.14460279047489166, + "learning_rate": 6.017462130706114e-06, + "loss": 0.2548, + "step": 68010 + }, + { + "epoch": 3.023514246343957, + "grad_norm": 0.13449715077877045, + "learning_rate": 6.0076355937687516e-06, + "loss": 0.2544, + "step": 68020 + }, + { + "epoch": 3.023958750055563, + "grad_norm": 0.123336061835289, + "learning_rate": 5.9978165739232925e-06, + "loss": 0.2532, + "step": 68030 + }, + { + "epoch": 3.024403253767169, + "grad_norm": 0.10975094884634018, + "learning_rate": 5.988005072847508e-06, + "loss": 0.2538, + "step": 68040 + }, + { + "epoch": 3.024847757478775, + "grad_norm": 0.11825256049633026, + "learning_rate": 5.978201092217928e-06, + "loss": 0.2536, + "step": 68050 + }, + { + "epoch": 3.0252922611903807, + "grad_norm": 0.12112952768802643, + "learning_rate": 5.9684046337097895e-06, + "loss": 0.2565, + "step": 68060 + }, + { + "epoch": 3.025736764901987, + "grad_norm": 0.1272943913936615, + "learning_rate": 5.958615698997016e-06, + "loss": 0.2534, + "step": 68070 + }, + { + "epoch": 3.026181268613593, + "grad_norm": 0.12165531516075134, + "learning_rate": 5.948834289752303e-06, + "loss": 0.2537, + "step": 68080 + }, + { + "epoch": 3.026625772325199, + "grad_norm": 0.11432919651269913, + "learning_rate": 5.939060407646996e-06, + "loss": 0.2564, + "step": 68090 + }, + { + "epoch": 3.027070276036805, + "grad_norm": 0.13523675501346588, + "learning_rate": 5.929294054351198e-06, + "loss": 0.2533, + "step": 68100 + }, + { + "epoch": 3.0275147797484108, + "grad_norm": 0.13651251792907715, + "learning_rate": 5.919535231533707e-06, + "loss": 0.2547, + "step": 68110 + }, + { + "epoch": 3.027959283460017, + "grad_norm": 0.13350242376327515, + "learning_rate": 5.909783940862046e-06, + "loss": 0.2539, + "step": 68120 + }, + { + "epoch": 3.028403787171623, + "grad_norm": 0.12641067802906036, + "learning_rate": 5.900040184002436e-06, + "loss": 0.2526, + "step": 68130 + }, + { + "epoch": 3.028848290883229, + "grad_norm": 0.11384553462266922, + "learning_rate": 5.890303962619831e-06, + "loss": 0.2558, + "step": 68140 + }, + { + "epoch": 3.0292927945948347, + "grad_norm": 0.13708429038524628, + "learning_rate": 5.88057527837787e-06, + "loss": 0.2554, + "step": 68150 + }, + { + "epoch": 3.029737298306441, + "grad_norm": 0.14219090342521667, + "learning_rate": 5.870854132938924e-06, + "loss": 0.2527, + "step": 68160 + }, + { + "epoch": 3.030181802018047, + "grad_norm": 0.1258738934993744, + "learning_rate": 5.861140527964071e-06, + "loss": 0.2582, + "step": 68170 + }, + { + "epoch": 3.030626305729653, + "grad_norm": 0.1194017231464386, + "learning_rate": 5.851434465113098e-06, + "loss": 0.254, + "step": 68180 + }, + { + "epoch": 3.0310708094412586, + "grad_norm": 0.12514592707157135, + "learning_rate": 5.841735946044524e-06, + "loss": 0.2539, + "step": 68190 + }, + { + "epoch": 3.0315153131528647, + "grad_norm": 0.12399369478225708, + "learning_rate": 5.832044972415523e-06, + "loss": 0.2556, + "step": 68200 + }, + { + "epoch": 3.031959816864471, + "grad_norm": 0.11718958616256714, + "learning_rate": 5.8223615458820655e-06, + "loss": 0.2566, + "step": 68210 + }, + { + "epoch": 3.032404320576077, + "grad_norm": 0.11544522643089294, + "learning_rate": 5.812685668098733e-06, + "loss": 0.2541, + "step": 68220 + }, + { + "epoch": 3.032848824287683, + "grad_norm": 0.11524829268455505, + "learning_rate": 5.803017340718913e-06, + "loss": 0.2541, + "step": 68230 + }, + { + "epoch": 3.0332933279992886, + "grad_norm": 0.12145092338323593, + "learning_rate": 5.793356565394636e-06, + "loss": 0.2558, + "step": 68240 + }, + { + "epoch": 3.0337378317108947, + "grad_norm": 0.1275947540998459, + "learning_rate": 5.783703343776658e-06, + "loss": 0.2531, + "step": 68250 + }, + { + "epoch": 3.034182335422501, + "grad_norm": 0.14321933686733246, + "learning_rate": 5.774057677514477e-06, + "loss": 0.2529, + "step": 68260 + }, + { + "epoch": 3.034626839134107, + "grad_norm": 0.12606537342071533, + "learning_rate": 5.764419568256235e-06, + "loss": 0.2549, + "step": 68270 + }, + { + "epoch": 3.0350713428457126, + "grad_norm": 0.13860313594341278, + "learning_rate": 5.754789017648865e-06, + "loss": 0.2555, + "step": 68280 + }, + { + "epoch": 3.0355158465573187, + "grad_norm": 0.1286064088344574, + "learning_rate": 5.745166027337934e-06, + "loss": 0.2544, + "step": 68290 + }, + { + "epoch": 3.0359603502689247, + "grad_norm": 0.14917311072349548, + "learning_rate": 5.735550598967754e-06, + "loss": 0.2545, + "step": 68300 + }, + { + "epoch": 3.036404853980531, + "grad_norm": 0.11290983855724335, + "learning_rate": 5.725942734181339e-06, + "loss": 0.2561, + "step": 68310 + }, + { + "epoch": 3.036849357692137, + "grad_norm": 0.12528955936431885, + "learning_rate": 5.7163424346204146e-06, + "loss": 0.255, + "step": 68320 + }, + { + "epoch": 3.0372938614037426, + "grad_norm": 0.12841160595417023, + "learning_rate": 5.706749701925407e-06, + "loss": 0.2513, + "step": 68330 + }, + { + "epoch": 3.0377383651153487, + "grad_norm": 0.13654181361198425, + "learning_rate": 5.697164537735461e-06, + "loss": 0.2522, + "step": 68340 + }, + { + "epoch": 3.0381828688269548, + "grad_norm": 0.10944376140832901, + "learning_rate": 5.687586943688406e-06, + "loss": 0.2548, + "step": 68350 + }, + { + "epoch": 3.038627372538561, + "grad_norm": 0.13103432953357697, + "learning_rate": 5.678016921420792e-06, + "loss": 0.2539, + "step": 68360 + }, + { + "epoch": 3.0390718762501665, + "grad_norm": 0.11067929118871689, + "learning_rate": 5.668454472567875e-06, + "loss": 0.2532, + "step": 68370 + }, + { + "epoch": 3.0395163799617726, + "grad_norm": 0.11521153151988983, + "learning_rate": 5.658899598763617e-06, + "loss": 0.2524, + "step": 68380 + }, + { + "epoch": 3.0399608836733787, + "grad_norm": 0.10502781718969345, + "learning_rate": 5.6493523016407035e-06, + "loss": 0.2559, + "step": 68390 + }, + { + "epoch": 3.040405387384985, + "grad_norm": 0.11869766563177109, + "learning_rate": 5.6398125828304615e-06, + "loss": 0.2569, + "step": 68400 + }, + { + "epoch": 3.040849891096591, + "grad_norm": 0.1421334445476532, + "learning_rate": 5.630280443963015e-06, + "loss": 0.2551, + "step": 68410 + }, + { + "epoch": 3.0412943948081965, + "grad_norm": 0.12869477272033691, + "learning_rate": 5.620755886667112e-06, + "loss": 0.2536, + "step": 68420 + }, + { + "epoch": 3.0417388985198026, + "grad_norm": 0.14360997080802917, + "learning_rate": 5.611238912570271e-06, + "loss": 0.2574, + "step": 68430 + }, + { + "epoch": 3.0421834022314087, + "grad_norm": 0.13024641573429108, + "learning_rate": 5.601729523298649e-06, + "loss": 0.2547, + "step": 68440 + }, + { + "epoch": 3.042627905943015, + "grad_norm": 0.12204614281654358, + "learning_rate": 5.592227720477161e-06, + "loss": 0.2557, + "step": 68450 + }, + { + "epoch": 3.0430724096546204, + "grad_norm": 0.11751150339841843, + "learning_rate": 5.582733505729415e-06, + "loss": 0.253, + "step": 68460 + }, + { + "epoch": 3.0435169133662265, + "grad_norm": 0.12649480998516083, + "learning_rate": 5.573246880677668e-06, + "loss": 0.2542, + "step": 68470 + }, + { + "epoch": 3.0439614170778326, + "grad_norm": 0.11360074579715729, + "learning_rate": 5.56376784694298e-06, + "loss": 0.2528, + "step": 68480 + }, + { + "epoch": 3.0444059207894387, + "grad_norm": 0.1390717476606369, + "learning_rate": 5.554296406145027e-06, + "loss": 0.2537, + "step": 68490 + }, + { + "epoch": 3.0448504245010444, + "grad_norm": 0.15178535878658295, + "learning_rate": 5.544832559902219e-06, + "loss": 0.2529, + "step": 68500 + }, + { + "epoch": 3.0452949282126505, + "grad_norm": 0.13321582973003387, + "learning_rate": 5.53537630983168e-06, + "loss": 0.2545, + "step": 68510 + }, + { + "epoch": 3.0457394319242566, + "grad_norm": 0.11041420698165894, + "learning_rate": 5.5259276575492125e-06, + "loss": 0.2564, + "step": 68520 + }, + { + "epoch": 3.0461839356358626, + "grad_norm": 0.11689812690019608, + "learning_rate": 5.516486604669357e-06, + "loss": 0.2514, + "step": 68530 + }, + { + "epoch": 3.0466284393474687, + "grad_norm": 0.1372915804386139, + "learning_rate": 5.507053152805303e-06, + "loss": 0.2549, + "step": 68540 + }, + { + "epoch": 3.0470729430590744, + "grad_norm": 0.1427307277917862, + "learning_rate": 5.497627303568975e-06, + "loss": 0.2522, + "step": 68550 + }, + { + "epoch": 3.0475174467706805, + "grad_norm": 0.1358245313167572, + "learning_rate": 5.488209058571003e-06, + "loss": 0.2578, + "step": 68560 + }, + { + "epoch": 3.0479619504822866, + "grad_norm": 0.12169840186834335, + "learning_rate": 5.4787984194207054e-06, + "loss": 0.2553, + "step": 68570 + }, + { + "epoch": 3.0484064541938927, + "grad_norm": 0.13093341886997223, + "learning_rate": 5.469395387726095e-06, + "loss": 0.2564, + "step": 68580 + }, + { + "epoch": 3.0488509579054983, + "grad_norm": 0.13596756756305695, + "learning_rate": 5.45999996509391e-06, + "loss": 0.2533, + "step": 68590 + }, + { + "epoch": 3.0492954616171044, + "grad_norm": 0.13595233857631683, + "learning_rate": 5.450612153129536e-06, + "loss": 0.2536, + "step": 68600 + }, + { + "epoch": 3.0497399653287105, + "grad_norm": 0.13770240545272827, + "learning_rate": 5.4412319534371426e-06, + "loss": 0.2551, + "step": 68610 + }, + { + "epoch": 3.0501844690403166, + "grad_norm": 0.14640064537525177, + "learning_rate": 5.431859367619513e-06, + "loss": 0.255, + "step": 68620 + }, + { + "epoch": 3.0506289727519227, + "grad_norm": 0.1468803584575653, + "learning_rate": 5.422494397278172e-06, + "loss": 0.2579, + "step": 68630 + }, + { + "epoch": 3.0510734764635283, + "grad_norm": 0.1334802657365799, + "learning_rate": 5.413137044013344e-06, + "loss": 0.2534, + "step": 68640 + }, + { + "epoch": 3.0515179801751344, + "grad_norm": 0.14440099895000458, + "learning_rate": 5.403787309423941e-06, + "loss": 0.2538, + "step": 68650 + }, + { + "epoch": 3.0519624838867405, + "grad_norm": 0.1387137472629547, + "learning_rate": 5.3944451951075835e-06, + "loss": 0.2565, + "step": 68660 + }, + { + "epoch": 3.0524069875983466, + "grad_norm": 0.11641117185354233, + "learning_rate": 5.385110702660562e-06, + "loss": 0.2583, + "step": 68670 + }, + { + "epoch": 3.0528514913099523, + "grad_norm": 0.13525116443634033, + "learning_rate": 5.375783833677922e-06, + "loss": 0.2553, + "step": 68680 + }, + { + "epoch": 3.0532959950215584, + "grad_norm": 0.12484904378652573, + "learning_rate": 5.36646458975334e-06, + "loss": 0.2537, + "step": 68690 + }, + { + "epoch": 3.0537404987331644, + "grad_norm": 0.13681326806545258, + "learning_rate": 5.3571529724792294e-06, + "loss": 0.2553, + "step": 68700 + }, + { + "epoch": 3.0541850024447705, + "grad_norm": 0.1251208484172821, + "learning_rate": 5.347848983446702e-06, + "loss": 0.2557, + "step": 68710 + }, + { + "epoch": 3.0546295061563766, + "grad_norm": 0.11395544558763504, + "learning_rate": 5.3385526242455185e-06, + "loss": 0.2556, + "step": 68720 + }, + { + "epoch": 3.0550740098679823, + "grad_norm": 0.12004545331001282, + "learning_rate": 5.329263896464226e-06, + "loss": 0.2551, + "step": 68730 + }, + { + "epoch": 3.0555185135795884, + "grad_norm": 0.10626354068517685, + "learning_rate": 5.3199828016899715e-06, + "loss": 0.2544, + "step": 68740 + }, + { + "epoch": 3.0559630172911945, + "grad_norm": 0.10578214377164841, + "learning_rate": 5.310709341508657e-06, + "loss": 0.2536, + "step": 68750 + }, + { + "epoch": 3.0564075210028006, + "grad_norm": 0.11834462732076645, + "learning_rate": 5.301443517504861e-06, + "loss": 0.2546, + "step": 68760 + }, + { + "epoch": 3.056852024714406, + "grad_norm": 0.15614627301692963, + "learning_rate": 5.292185331261862e-06, + "loss": 0.2561, + "step": 68770 + }, + { + "epoch": 3.0572965284260123, + "grad_norm": 0.11654035747051239, + "learning_rate": 5.28293478436162e-06, + "loss": 0.2539, + "step": 68780 + }, + { + "epoch": 3.0577410321376184, + "grad_norm": 0.1269507259130478, + "learning_rate": 5.273691878384829e-06, + "loss": 0.2534, + "step": 68790 + }, + { + "epoch": 3.0581855358492245, + "grad_norm": 0.11430239677429199, + "learning_rate": 5.264456614910812e-06, + "loss": 0.2524, + "step": 68800 + }, + { + "epoch": 3.05863003956083, + "grad_norm": 0.12189708650112152, + "learning_rate": 5.255228995517647e-06, + "loss": 0.2539, + "step": 68810 + }, + { + "epoch": 3.059074543272436, + "grad_norm": 0.1211109608411789, + "learning_rate": 5.246009021782067e-06, + "loss": 0.2543, + "step": 68820 + }, + { + "epoch": 3.0595190469840423, + "grad_norm": 0.12450090795755386, + "learning_rate": 5.2367966952795225e-06, + "loss": 0.2568, + "step": 68830 + }, + { + "epoch": 3.0599635506956484, + "grad_norm": 0.12633183598518372, + "learning_rate": 5.2275920175841485e-06, + "loss": 0.2549, + "step": 68840 + }, + { + "epoch": 3.0604080544072545, + "grad_norm": 0.10599402338266373, + "learning_rate": 5.218394990268771e-06, + "loss": 0.2525, + "step": 68850 + }, + { + "epoch": 3.06085255811886, + "grad_norm": 0.1110086441040039, + "learning_rate": 5.209205614904916e-06, + "loss": 0.2553, + "step": 68860 + }, + { + "epoch": 3.0612970618304662, + "grad_norm": 0.11508075892925262, + "learning_rate": 5.200023893062777e-06, + "loss": 0.2544, + "step": 68870 + }, + { + "epoch": 3.0617415655420723, + "grad_norm": 0.14267507195472717, + "learning_rate": 5.190849826311289e-06, + "loss": 0.2557, + "step": 68880 + }, + { + "epoch": 3.0621860692536784, + "grad_norm": 0.10998307168483734, + "learning_rate": 5.181683416218025e-06, + "loss": 0.2552, + "step": 68890 + }, + { + "epoch": 3.062630572965284, + "grad_norm": 0.1412537544965744, + "learning_rate": 5.172524664349276e-06, + "loss": 0.2535, + "step": 68900 + }, + { + "epoch": 3.06307507667689, + "grad_norm": 0.1269318163394928, + "learning_rate": 5.1633735722700416e-06, + "loss": 0.254, + "step": 68910 + }, + { + "epoch": 3.0635195803884963, + "grad_norm": 0.12693005800247192, + "learning_rate": 5.154230141543958e-06, + "loss": 0.2523, + "step": 68920 + }, + { + "epoch": 3.0639640841001023, + "grad_norm": 0.1247042566537857, + "learning_rate": 5.145094373733433e-06, + "loss": 0.2545, + "step": 68930 + }, + { + "epoch": 3.0644085878117084, + "grad_norm": 0.12481244653463364, + "learning_rate": 5.135966270399478e-06, + "loss": 0.2547, + "step": 68940 + }, + { + "epoch": 3.064853091523314, + "grad_norm": 0.11727206408977509, + "learning_rate": 5.126845833101857e-06, + "loss": 0.2535, + "step": 68950 + }, + { + "epoch": 3.06529759523492, + "grad_norm": 0.11365552246570587, + "learning_rate": 5.1177330633989994e-06, + "loss": 0.2521, + "step": 68960 + }, + { + "epoch": 3.0657420989465263, + "grad_norm": 0.11100772768259048, + "learning_rate": 5.108627962848033e-06, + "loss": 0.2542, + "step": 68970 + }, + { + "epoch": 3.0661866026581324, + "grad_norm": 0.1152615174651146, + "learning_rate": 5.099530533004759e-06, + "loss": 0.2532, + "step": 68980 + }, + { + "epoch": 3.066631106369738, + "grad_norm": 0.10698670893907547, + "learning_rate": 5.090440775423699e-06, + "loss": 0.2566, + "step": 68990 + }, + { + "epoch": 3.067075610081344, + "grad_norm": 0.18573172390460968, + "learning_rate": 5.081358691658022e-06, + "loss": 0.2556, + "step": 69000 + }, + { + "epoch": 3.06752011379295, + "grad_norm": 0.11165861040353775, + "learning_rate": 5.072284283259621e-06, + "loss": 0.2555, + "step": 69010 + }, + { + "epoch": 3.0679646175045563, + "grad_norm": 0.1056896224617958, + "learning_rate": 5.063217551779053e-06, + "loss": 0.2526, + "step": 69020 + }, + { + "epoch": 3.0684091212161624, + "grad_norm": 0.11527332663536072, + "learning_rate": 5.054158498765588e-06, + "loss": 0.2538, + "step": 69030 + }, + { + "epoch": 3.068853624927768, + "grad_norm": 0.10775171965360641, + "learning_rate": 5.0451071257671625e-06, + "loss": 0.2514, + "step": 69040 + }, + { + "epoch": 3.069298128639374, + "grad_norm": 0.13973909616470337, + "learning_rate": 5.036063434330407e-06, + "loss": 0.2549, + "step": 69050 + }, + { + "epoch": 3.06974263235098, + "grad_norm": 0.12845873832702637, + "learning_rate": 5.027027426000652e-06, + "loss": 0.2551, + "step": 69060 + }, + { + "epoch": 3.0701871360625863, + "grad_norm": 0.12234854698181152, + "learning_rate": 5.017999102321886e-06, + "loss": 0.2552, + "step": 69070 + }, + { + "epoch": 3.070631639774192, + "grad_norm": 0.13045363128185272, + "learning_rate": 5.0089784648368224e-06, + "loss": 0.2532, + "step": 69080 + }, + { + "epoch": 3.071076143485798, + "grad_norm": 0.11202948540449142, + "learning_rate": 4.999965515086829e-06, + "loss": 0.2543, + "step": 69090 + }, + { + "epoch": 3.071520647197404, + "grad_norm": 0.12264956533908844, + "learning_rate": 4.990960254611976e-06, + "loss": 0.256, + "step": 69100 + }, + { + "epoch": 3.0719651509090102, + "grad_norm": 0.14114272594451904, + "learning_rate": 4.9819626849510194e-06, + "loss": 0.2551, + "step": 69110 + }, + { + "epoch": 3.072409654620616, + "grad_norm": 0.1355346292257309, + "learning_rate": 4.97297280764138e-06, + "loss": 0.254, + "step": 69120 + }, + { + "epoch": 3.072854158332222, + "grad_norm": 0.11621516942977905, + "learning_rate": 4.963990624219211e-06, + "loss": 0.2529, + "step": 69130 + }, + { + "epoch": 3.073298662043828, + "grad_norm": 0.11309037357568741, + "learning_rate": 4.955016136219298e-06, + "loss": 0.2547, + "step": 69140 + }, + { + "epoch": 3.073743165755434, + "grad_norm": 0.13991643488407135, + "learning_rate": 4.946049345175146e-06, + "loss": 0.2548, + "step": 69150 + }, + { + "epoch": 3.0741876694670403, + "grad_norm": 0.11729130148887634, + "learning_rate": 4.937090252618937e-06, + "loss": 0.254, + "step": 69160 + }, + { + "epoch": 3.074632173178646, + "grad_norm": 0.11989068239927292, + "learning_rate": 4.928138860081521e-06, + "loss": 0.2559, + "step": 69170 + }, + { + "epoch": 3.075076676890252, + "grad_norm": 0.11082381755113602, + "learning_rate": 4.919195169092472e-06, + "loss": 0.2552, + "step": 69180 + }, + { + "epoch": 3.075521180601858, + "grad_norm": 0.11312561482191086, + "learning_rate": 4.910259181179994e-06, + "loss": 0.2527, + "step": 69190 + }, + { + "epoch": 3.075965684313464, + "grad_norm": 0.11061035841703415, + "learning_rate": 4.901330897871015e-06, + "loss": 0.255, + "step": 69200 + }, + { + "epoch": 3.07641018802507, + "grad_norm": 0.11086498200893402, + "learning_rate": 4.8924103206911375e-06, + "loss": 0.254, + "step": 69210 + }, + { + "epoch": 3.076854691736676, + "grad_norm": 0.12975868582725525, + "learning_rate": 4.883497451164637e-06, + "loss": 0.2543, + "step": 69220 + }, + { + "epoch": 3.077299195448282, + "grad_norm": 0.116034597158432, + "learning_rate": 4.874592290814484e-06, + "loss": 0.2581, + "step": 69230 + }, + { + "epoch": 3.077743699159888, + "grad_norm": 0.11910267174243927, + "learning_rate": 4.865694841162327e-06, + "loss": 0.2528, + "step": 69240 + }, + { + "epoch": 3.0781882028714938, + "grad_norm": 0.12618060410022736, + "learning_rate": 4.856805103728496e-06, + "loss": 0.2532, + "step": 69250 + }, + { + "epoch": 3.0786327065831, + "grad_norm": 0.11401408910751343, + "learning_rate": 4.847923080032007e-06, + "loss": 0.2545, + "step": 69260 + }, + { + "epoch": 3.079077210294706, + "grad_norm": 0.13498826324939728, + "learning_rate": 4.8390487715905486e-06, + "loss": 0.2538, + "step": 69270 + }, + { + "epoch": 3.079521714006312, + "grad_norm": 0.12328755855560303, + "learning_rate": 4.8301821799205e-06, + "loss": 0.254, + "step": 69280 + }, + { + "epoch": 3.079966217717918, + "grad_norm": 0.11593641340732574, + "learning_rate": 4.821323306536918e-06, + "loss": 0.2533, + "step": 69290 + }, + { + "epoch": 3.0804107214295238, + "grad_norm": 0.10533329844474792, + "learning_rate": 4.8124721529535455e-06, + "loss": 0.2558, + "step": 69300 + }, + { + "epoch": 3.08085522514113, + "grad_norm": 0.1241772398352623, + "learning_rate": 4.803628720682807e-06, + "loss": 0.2514, + "step": 69310 + }, + { + "epoch": 3.081299728852736, + "grad_norm": 0.1210736557841301, + "learning_rate": 4.794793011235776e-06, + "loss": 0.253, + "step": 69320 + }, + { + "epoch": 3.081744232564342, + "grad_norm": 0.10716171562671661, + "learning_rate": 4.78596502612228e-06, + "loss": 0.2548, + "step": 69330 + }, + { + "epoch": 3.0821887362759477, + "grad_norm": 0.13636119663715363, + "learning_rate": 4.777144766850738e-06, + "loss": 0.2565, + "step": 69340 + }, + { + "epoch": 3.082633239987554, + "grad_norm": 0.11405032873153687, + "learning_rate": 4.768332234928313e-06, + "loss": 0.2561, + "step": 69350 + }, + { + "epoch": 3.08307774369916, + "grad_norm": 0.11843154579401016, + "learning_rate": 4.759527431860828e-06, + "loss": 0.2553, + "step": 69360 + }, + { + "epoch": 3.083522247410766, + "grad_norm": 0.11815988272428513, + "learning_rate": 4.750730359152755e-06, + "loss": 0.2541, + "step": 69370 + }, + { + "epoch": 3.083966751122372, + "grad_norm": 0.09603841602802277, + "learning_rate": 4.741941018307311e-06, + "loss": 0.2525, + "step": 69380 + }, + { + "epoch": 3.0844112548339777, + "grad_norm": 0.1058725044131279, + "learning_rate": 4.733159410826321e-06, + "loss": 0.2523, + "step": 69390 + }, + { + "epoch": 3.084855758545584, + "grad_norm": 0.1347385197877884, + "learning_rate": 4.724385538210357e-06, + "loss": 0.2537, + "step": 69400 + }, + { + "epoch": 3.08530026225719, + "grad_norm": 0.11411992460489273, + "learning_rate": 4.715619401958599e-06, + "loss": 0.2525, + "step": 69410 + }, + { + "epoch": 3.085744765968796, + "grad_norm": 0.11663329601287842, + "learning_rate": 4.706861003568958e-06, + "loss": 0.2543, + "step": 69420 + }, + { + "epoch": 3.0861892696804016, + "grad_norm": 0.1307532638311386, + "learning_rate": 4.698110344538003e-06, + "loss": 0.2549, + "step": 69430 + }, + { + "epoch": 3.0866337733920077, + "grad_norm": 0.11215682327747345, + "learning_rate": 4.689367426360975e-06, + "loss": 0.2536, + "step": 69440 + }, + { + "epoch": 3.087078277103614, + "grad_norm": 0.11003029346466064, + "learning_rate": 4.680632250531819e-06, + "loss": 0.2527, + "step": 69450 + }, + { + "epoch": 3.08752278081522, + "grad_norm": 0.11172330379486084, + "learning_rate": 4.671904818543115e-06, + "loss": 0.2506, + "step": 69460 + }, + { + "epoch": 3.087967284526826, + "grad_norm": 0.13441485166549683, + "learning_rate": 4.66318513188615e-06, + "loss": 0.254, + "step": 69470 + }, + { + "epoch": 3.0884117882384317, + "grad_norm": 0.11407431960105896, + "learning_rate": 4.654473192050884e-06, + "loss": 0.2533, + "step": 69480 + }, + { + "epoch": 3.0888562919500377, + "grad_norm": 0.12349465489387512, + "learning_rate": 4.64576900052594e-06, + "loss": 0.2534, + "step": 69490 + }, + { + "epoch": 3.089300795661644, + "grad_norm": 0.14133323729038239, + "learning_rate": 4.637072558798638e-06, + "loss": 0.2522, + "step": 69500 + }, + { + "epoch": 3.08974529937325, + "grad_norm": 0.11610574275255203, + "learning_rate": 4.628383868354969e-06, + "loss": 0.256, + "step": 69510 + }, + { + "epoch": 3.0901898030848556, + "grad_norm": 0.14685823023319244, + "learning_rate": 4.6197029306795595e-06, + "loss": 0.2556, + "step": 69520 + }, + { + "epoch": 3.0906343067964617, + "grad_norm": 0.10825861990451813, + "learning_rate": 4.611029747255779e-06, + "loss": 0.2526, + "step": 69530 + }, + { + "epoch": 3.0910788105080678, + "grad_norm": 0.1334003359079361, + "learning_rate": 4.6023643195656164e-06, + "loss": 0.2558, + "step": 69540 + }, + { + "epoch": 3.091523314219674, + "grad_norm": 0.10223514586687088, + "learning_rate": 4.593706649089768e-06, + "loss": 0.2536, + "step": 69550 + }, + { + "epoch": 3.0919678179312795, + "grad_norm": 0.1118551641702652, + "learning_rate": 4.585056737307597e-06, + "loss": 0.2535, + "step": 69560 + }, + { + "epoch": 3.0924123216428856, + "grad_norm": 0.1246521845459938, + "learning_rate": 4.576414585697103e-06, + "loss": 0.2514, + "step": 69570 + }, + { + "epoch": 3.0928568253544917, + "grad_norm": 0.114990234375, + "learning_rate": 4.567780195735044e-06, + "loss": 0.2554, + "step": 69580 + }, + { + "epoch": 3.093301329066098, + "grad_norm": 0.1182546392083168, + "learning_rate": 4.559153568896757e-06, + "loss": 0.2558, + "step": 69590 + }, + { + "epoch": 3.093745832777704, + "grad_norm": 0.12143559008836746, + "learning_rate": 4.550534706656329e-06, + "loss": 0.2538, + "step": 69600 + }, + { + "epoch": 3.0941903364893095, + "grad_norm": 0.09665518999099731, + "learning_rate": 4.541923610486465e-06, + "loss": 0.2563, + "step": 69610 + }, + { + "epoch": 3.0946348402009156, + "grad_norm": 0.13028225302696228, + "learning_rate": 4.533320281858578e-06, + "loss": 0.2549, + "step": 69620 + }, + { + "epoch": 3.0950793439125217, + "grad_norm": 0.10849273204803467, + "learning_rate": 4.52472472224274e-06, + "loss": 0.2549, + "step": 69630 + }, + { + "epoch": 3.095523847624128, + "grad_norm": 0.1234898492693901, + "learning_rate": 4.51613693310769e-06, + "loss": 0.2516, + "step": 69640 + }, + { + "epoch": 3.0959683513357334, + "grad_norm": 0.13919667899608612, + "learning_rate": 4.507556915920868e-06, + "loss": 0.2544, + "step": 69650 + }, + { + "epoch": 3.0964128550473395, + "grad_norm": 0.13215036690235138, + "learning_rate": 4.498984672148332e-06, + "loss": 0.2551, + "step": 69660 + }, + { + "epoch": 3.0968573587589456, + "grad_norm": 0.12428664416074753, + "learning_rate": 4.490420203254864e-06, + "loss": 0.2545, + "step": 69670 + }, + { + "epoch": 3.0973018624705517, + "grad_norm": 0.11060743778944016, + "learning_rate": 4.481863510703893e-06, + "loss": 0.2535, + "step": 69680 + }, + { + "epoch": 3.097746366182158, + "grad_norm": 0.12535929679870605, + "learning_rate": 4.473314595957523e-06, + "loss": 0.2519, + "step": 69690 + }, + { + "epoch": 3.0981908698937635, + "grad_norm": 0.12287583202123642, + "learning_rate": 4.464773460476535e-06, + "loss": 0.2547, + "step": 69700 + }, + { + "epoch": 3.0986353736053696, + "grad_norm": 0.12970149517059326, + "learning_rate": 4.456240105720372e-06, + "loss": 0.2561, + "step": 69710 + }, + { + "epoch": 3.0990798773169757, + "grad_norm": 0.1088172122836113, + "learning_rate": 4.4477145331471405e-06, + "loss": 0.2547, + "step": 69720 + }, + { + "epoch": 3.0995243810285817, + "grad_norm": 0.13342106342315674, + "learning_rate": 4.439196744213653e-06, + "loss": 0.255, + "step": 69730 + }, + { + "epoch": 3.0999688847401874, + "grad_norm": 0.10516543686389923, + "learning_rate": 4.430686740375339e-06, + "loss": 0.2532, + "step": 69740 + }, + { + "epoch": 3.1004133884517935, + "grad_norm": 0.12250601500272751, + "learning_rate": 4.422184523086342e-06, + "loss": 0.2527, + "step": 69750 + }, + { + "epoch": 3.1008578921633996, + "grad_norm": 0.09890040010213852, + "learning_rate": 4.41369009379946e-06, + "loss": 0.2554, + "step": 69760 + }, + { + "epoch": 3.1013023958750057, + "grad_norm": 0.11847808957099915, + "learning_rate": 4.405203453966139e-06, + "loss": 0.2545, + "step": 69770 + }, + { + "epoch": 3.1017468995866118, + "grad_norm": 0.11111760139465332, + "learning_rate": 4.396724605036539e-06, + "loss": 0.2537, + "step": 69780 + }, + { + "epoch": 3.1021914032982174, + "grad_norm": 0.12628133594989777, + "learning_rate": 4.388253548459437e-06, + "loss": 0.2523, + "step": 69790 + }, + { + "epoch": 3.1026359070098235, + "grad_norm": 0.13244973123073578, + "learning_rate": 4.3797902856823395e-06, + "loss": 0.253, + "step": 69800 + }, + { + "epoch": 3.1030804107214296, + "grad_norm": 0.1133352518081665, + "learning_rate": 4.371334818151357e-06, + "loss": 0.2535, + "step": 69810 + }, + { + "epoch": 3.1035249144330357, + "grad_norm": 0.126435786485672, + "learning_rate": 4.362887147311306e-06, + "loss": 0.2533, + "step": 69820 + }, + { + "epoch": 3.1039694181446413, + "grad_norm": 0.13369320333003998, + "learning_rate": 4.354447274605672e-06, + "loss": 0.2546, + "step": 69830 + }, + { + "epoch": 3.1044139218562474, + "grad_norm": 0.11155674606561661, + "learning_rate": 4.346015201476572e-06, + "loss": 0.2524, + "step": 69840 + }, + { + "epoch": 3.1048584255678535, + "grad_norm": 0.12350708991289139, + "learning_rate": 4.337590929364855e-06, + "loss": 0.254, + "step": 69850 + }, + { + "epoch": 3.1053029292794596, + "grad_norm": 0.10993107408285141, + "learning_rate": 4.329174459709973e-06, + "loss": 0.2541, + "step": 69860 + }, + { + "epoch": 3.1057474329910653, + "grad_norm": 0.13691920042037964, + "learning_rate": 4.320765793950071e-06, + "loss": 0.2533, + "step": 69870 + }, + { + "epoch": 3.1061919367026714, + "grad_norm": 0.12788939476013184, + "learning_rate": 4.312364933521962e-06, + "loss": 0.2537, + "step": 69880 + }, + { + "epoch": 3.1066364404142774, + "grad_norm": 0.12423336505889893, + "learning_rate": 4.303971879861129e-06, + "loss": 0.2551, + "step": 69890 + }, + { + "epoch": 3.1070809441258835, + "grad_norm": 0.10875929147005081, + "learning_rate": 4.295586634401716e-06, + "loss": 0.2543, + "step": 69900 + }, + { + "epoch": 3.1075254478374896, + "grad_norm": 0.11858390271663666, + "learning_rate": 4.287209198576536e-06, + "loss": 0.252, + "step": 69910 + }, + { + "epoch": 3.1079699515490953, + "grad_norm": 0.20325516164302826, + "learning_rate": 4.278839573817045e-06, + "loss": 0.2527, + "step": 69920 + }, + { + "epoch": 3.1084144552607014, + "grad_norm": 0.11283740401268005, + "learning_rate": 4.270477761553399e-06, + "loss": 0.2544, + "step": 69930 + }, + { + "epoch": 3.1088589589723075, + "grad_norm": 0.11306431889533997, + "learning_rate": 4.262123763214393e-06, + "loss": 0.2556, + "step": 69940 + }, + { + "epoch": 3.1093034626839136, + "grad_norm": 0.11925505846738815, + "learning_rate": 4.253777580227508e-06, + "loss": 0.2536, + "step": 69950 + }, + { + "epoch": 3.109747966395519, + "grad_norm": 0.1140943095088005, + "learning_rate": 4.2454392140188755e-06, + "loss": 0.2513, + "step": 69960 + }, + { + "epoch": 3.1101924701071253, + "grad_norm": 0.1361856609582901, + "learning_rate": 4.2371086660132785e-06, + "loss": 0.2549, + "step": 69970 + }, + { + "epoch": 3.1106369738187314, + "grad_norm": 0.11733357608318329, + "learning_rate": 4.228785937634205e-06, + "loss": 0.2551, + "step": 69980 + }, + { + "epoch": 3.1110814775303375, + "grad_norm": 0.11889299750328064, + "learning_rate": 4.220471030303758e-06, + "loss": 0.2534, + "step": 69990 + }, + { + "epoch": 3.1115259812419436, + "grad_norm": 0.14628718793392181, + "learning_rate": 4.212163945442754e-06, + "loss": 0.2546, + "step": 70000 + }, + { + "epoch": 3.111970484953549, + "grad_norm": 0.11761879175901413, + "learning_rate": 4.203864684470621e-06, + "loss": 0.2551, + "step": 70010 + }, + { + "epoch": 3.1124149886651553, + "grad_norm": 0.11756691336631775, + "learning_rate": 4.19557324880549e-06, + "loss": 0.2546, + "step": 70020 + }, + { + "epoch": 3.1128594923767614, + "grad_norm": 0.12250825762748718, + "learning_rate": 4.187289639864145e-06, + "loss": 0.2519, + "step": 70030 + }, + { + "epoch": 3.1133039960883675, + "grad_norm": 0.11493685096502304, + "learning_rate": 4.1790138590619974e-06, + "loss": 0.255, + "step": 70040 + }, + { + "epoch": 3.113748499799973, + "grad_norm": 0.1350746899843216, + "learning_rate": 4.170745907813195e-06, + "loss": 0.2544, + "step": 70050 + }, + { + "epoch": 3.1141930035115792, + "grad_norm": 0.1334272176027298, + "learning_rate": 4.162485787530479e-06, + "loss": 0.2557, + "step": 70060 + }, + { + "epoch": 3.1146375072231853, + "grad_norm": 0.09865330159664154, + "learning_rate": 4.154233499625282e-06, + "loss": 0.251, + "step": 70070 + }, + { + "epoch": 3.1150820109347914, + "grad_norm": 0.11858861893415451, + "learning_rate": 4.145989045507692e-06, + "loss": 0.2524, + "step": 70080 + }, + { + "epoch": 3.1155265146463975, + "grad_norm": 0.16175998747348785, + "learning_rate": 4.1377524265864666e-06, + "loss": 0.2557, + "step": 70090 + }, + { + "epoch": 3.115971018358003, + "grad_norm": 0.126003235578537, + "learning_rate": 4.1295236442690175e-06, + "loss": 0.2548, + "step": 70100 + }, + { + "epoch": 3.1164155220696093, + "grad_norm": 0.19075073301792145, + "learning_rate": 4.121302699961421e-06, + "loss": 0.2552, + "step": 70110 + }, + { + "epoch": 3.1168600257812153, + "grad_norm": 0.10945387184619904, + "learning_rate": 4.113089595068403e-06, + "loss": 0.2551, + "step": 70120 + }, + { + "epoch": 3.1173045294928214, + "grad_norm": 0.11466387659311295, + "learning_rate": 4.104884330993364e-06, + "loss": 0.2553, + "step": 70130 + }, + { + "epoch": 3.117749033204427, + "grad_norm": 0.11482972651720047, + "learning_rate": 4.0966869091383585e-06, + "loss": 0.2519, + "step": 70140 + }, + { + "epoch": 3.118193536916033, + "grad_norm": 0.12920542061328888, + "learning_rate": 4.088497330904101e-06, + "loss": 0.2567, + "step": 70150 + }, + { + "epoch": 3.1186380406276393, + "grad_norm": 0.1108599305152893, + "learning_rate": 4.080315597689976e-06, + "loss": 0.253, + "step": 70160 + }, + { + "epoch": 3.1190825443392454, + "grad_norm": 0.11142763495445251, + "learning_rate": 4.072141710893995e-06, + "loss": 0.256, + "step": 70170 + }, + { + "epoch": 3.119527048050851, + "grad_norm": 0.12429720163345337, + "learning_rate": 4.063975671912879e-06, + "loss": 0.2533, + "step": 70180 + }, + { + "epoch": 3.119971551762457, + "grad_norm": 0.13660231232643127, + "learning_rate": 4.055817482141949e-06, + "loss": 0.2533, + "step": 70190 + }, + { + "epoch": 3.120416055474063, + "grad_norm": 0.11507215350866318, + "learning_rate": 4.047667142975259e-06, + "loss": 0.2561, + "step": 70200 + }, + { + "epoch": 3.1208605591856693, + "grad_norm": 0.13682211935520172, + "learning_rate": 4.039524655805443e-06, + "loss": 0.2548, + "step": 70210 + }, + { + "epoch": 3.1213050628972754, + "grad_norm": 0.1250080019235611, + "learning_rate": 4.03139002202384e-06, + "loss": 0.2528, + "step": 70220 + }, + { + "epoch": 3.121749566608881, + "grad_norm": 0.11218200623989105, + "learning_rate": 4.023263243020447e-06, + "loss": 0.2528, + "step": 70230 + }, + { + "epoch": 3.122194070320487, + "grad_norm": 0.11664196848869324, + "learning_rate": 4.015144320183884e-06, + "loss": 0.2532, + "step": 70240 + }, + { + "epoch": 3.122638574032093, + "grad_norm": 0.1118745282292366, + "learning_rate": 4.007033254901482e-06, + "loss": 0.2528, + "step": 70250 + }, + { + "epoch": 3.1230830777436993, + "grad_norm": 0.12152013182640076, + "learning_rate": 3.9989300485591795e-06, + "loss": 0.2517, + "step": 70260 + }, + { + "epoch": 3.123527581455305, + "grad_norm": 0.1289733499288559, + "learning_rate": 3.990834702541601e-06, + "loss": 0.2519, + "step": 70270 + }, + { + "epoch": 3.123972085166911, + "grad_norm": 0.13282714784145355, + "learning_rate": 3.982747218232019e-06, + "loss": 0.2532, + "step": 70280 + }, + { + "epoch": 3.124416588878517, + "grad_norm": 0.1335310935974121, + "learning_rate": 3.97466759701236e-06, + "loss": 0.2525, + "step": 70290 + }, + { + "epoch": 3.1248610925901232, + "grad_norm": 0.11542974412441254, + "learning_rate": 3.966595840263226e-06, + "loss": 0.2524, + "step": 70300 + }, + { + "epoch": 3.1253055963017293, + "grad_norm": 0.14005792140960693, + "learning_rate": 3.958531949363836e-06, + "loss": 0.2534, + "step": 70310 + }, + { + "epoch": 3.125750100013335, + "grad_norm": 0.1133093535900116, + "learning_rate": 3.950475925692098e-06, + "loss": 0.2528, + "step": 70320 + }, + { + "epoch": 3.126194603724941, + "grad_norm": 0.1053919568657875, + "learning_rate": 3.9424277706245685e-06, + "loss": 0.2516, + "step": 70330 + }, + { + "epoch": 3.126639107436547, + "grad_norm": 0.1167570948600769, + "learning_rate": 3.934387485536451e-06, + "loss": 0.2534, + "step": 70340 + }, + { + "epoch": 3.1270836111481533, + "grad_norm": 0.10605566948652267, + "learning_rate": 3.926355071801619e-06, + "loss": 0.2532, + "step": 70350 + }, + { + "epoch": 3.127528114859759, + "grad_norm": 0.1284274309873581, + "learning_rate": 3.9183305307925965e-06, + "loss": 0.2507, + "step": 70360 + }, + { + "epoch": 3.127972618571365, + "grad_norm": 0.12327782809734344, + "learning_rate": 3.910313863880533e-06, + "loss": 0.2539, + "step": 70370 + }, + { + "epoch": 3.128417122282971, + "grad_norm": 0.1338113397359848, + "learning_rate": 3.902305072435292e-06, + "loss": 0.2527, + "step": 70380 + }, + { + "epoch": 3.128861625994577, + "grad_norm": 0.12494134902954102, + "learning_rate": 3.894304157825329e-06, + "loss": 0.2549, + "step": 70390 + }, + { + "epoch": 3.1293061297061833, + "grad_norm": 0.1251111775636673, + "learning_rate": 3.886311121417791e-06, + "loss": 0.2518, + "step": 70400 + }, + { + "epoch": 3.129750633417789, + "grad_norm": 0.14173202216625214, + "learning_rate": 3.878325964578472e-06, + "loss": 0.2534, + "step": 70410 + }, + { + "epoch": 3.130195137129395, + "grad_norm": 0.13754980266094208, + "learning_rate": 3.870348688671815e-06, + "loss": 0.254, + "step": 70420 + }, + { + "epoch": 3.130639640841001, + "grad_norm": 0.1454506814479828, + "learning_rate": 3.862379295060931e-06, + "loss": 0.2532, + "step": 70430 + }, + { + "epoch": 3.131084144552607, + "grad_norm": 0.1037796214222908, + "learning_rate": 3.854417785107534e-06, + "loss": 0.2516, + "step": 70440 + }, + { + "epoch": 3.131528648264213, + "grad_norm": 0.10013590008020401, + "learning_rate": 3.8464641601720755e-06, + "loss": 0.2516, + "step": 70450 + }, + { + "epoch": 3.131973151975819, + "grad_norm": 0.11087355762720108, + "learning_rate": 3.838518421613579e-06, + "loss": 0.2535, + "step": 70460 + }, + { + "epoch": 3.132417655687425, + "grad_norm": 0.11885561048984528, + "learning_rate": 3.830580570789766e-06, + "loss": 0.2561, + "step": 70470 + }, + { + "epoch": 3.132862159399031, + "grad_norm": 0.1327802538871765, + "learning_rate": 3.822650609057005e-06, + "loss": 0.252, + "step": 70480 + }, + { + "epoch": 3.1333066631106368, + "grad_norm": 0.13496847450733185, + "learning_rate": 3.814728537770285e-06, + "loss": 0.2544, + "step": 70490 + }, + { + "epoch": 3.133751166822243, + "grad_norm": 0.1155499666929245, + "learning_rate": 3.806814358283306e-06, + "loss": 0.2544, + "step": 70500 + }, + { + "epoch": 3.134195670533849, + "grad_norm": 0.10856959223747253, + "learning_rate": 3.7989080719483596e-06, + "loss": 0.255, + "step": 70510 + }, + { + "epoch": 3.134640174245455, + "grad_norm": 0.12457859516143799, + "learning_rate": 3.7910096801164143e-06, + "loss": 0.2553, + "step": 70520 + }, + { + "epoch": 3.135084677957061, + "grad_norm": 0.11357990652322769, + "learning_rate": 3.7831191841371016e-06, + "loss": 0.252, + "step": 70530 + }, + { + "epoch": 3.135529181668667, + "grad_norm": 0.10579542070627213, + "learning_rate": 3.775236585358688e-06, + "loss": 0.2527, + "step": 70540 + }, + { + "epoch": 3.135973685380273, + "grad_norm": 0.10035758465528488, + "learning_rate": 3.7673618851280843e-06, + "loss": 0.2551, + "step": 70550 + }, + { + "epoch": 3.136418189091879, + "grad_norm": 0.10481591522693634, + "learning_rate": 3.759495084790887e-06, + "loss": 0.2529, + "step": 70560 + }, + { + "epoch": 3.136862692803485, + "grad_norm": 0.10514596104621887, + "learning_rate": 3.751636185691282e-06, + "loss": 0.2534, + "step": 70570 + }, + { + "epoch": 3.1373071965150907, + "grad_norm": 0.11314674466848373, + "learning_rate": 3.7437851891721607e-06, + "loss": 0.2563, + "step": 70580 + }, + { + "epoch": 3.137751700226697, + "grad_norm": 0.1174054741859436, + "learning_rate": 3.7359420965750404e-06, + "loss": 0.2526, + "step": 70590 + }, + { + "epoch": 3.138196203938303, + "grad_norm": 0.12056607007980347, + "learning_rate": 3.7281069092400922e-06, + "loss": 0.2546, + "step": 70600 + }, + { + "epoch": 3.138640707649909, + "grad_norm": 0.14224155247211456, + "learning_rate": 3.7202796285061348e-06, + "loss": 0.2533, + "step": 70610 + }, + { + "epoch": 3.1390852113615146, + "grad_norm": 0.11570823192596436, + "learning_rate": 3.712460255710637e-06, + "loss": 0.2528, + "step": 70620 + }, + { + "epoch": 3.1395297150731207, + "grad_norm": 0.11495889723300934, + "learning_rate": 3.704648792189719e-06, + "loss": 0.2521, + "step": 70630 + }, + { + "epoch": 3.139974218784727, + "grad_norm": 0.12061673402786255, + "learning_rate": 3.696845239278124e-06, + "loss": 0.2553, + "step": 70640 + }, + { + "epoch": 3.140418722496333, + "grad_norm": 0.09523720294237137, + "learning_rate": 3.689049598309302e-06, + "loss": 0.2533, + "step": 70650 + }, + { + "epoch": 3.140863226207939, + "grad_norm": 0.1256812959909439, + "learning_rate": 3.681261870615288e-06, + "loss": 0.2533, + "step": 70660 + }, + { + "epoch": 3.1413077299195447, + "grad_norm": 0.13534703850746155, + "learning_rate": 3.6734820575268004e-06, + "loss": 0.2547, + "step": 70670 + }, + { + "epoch": 3.1417522336311507, + "grad_norm": 0.1028999462723732, + "learning_rate": 3.665710160373209e-06, + "loss": 0.2553, + "step": 70680 + }, + { + "epoch": 3.142196737342757, + "grad_norm": 0.13981111347675323, + "learning_rate": 3.65794618048248e-06, + "loss": 0.2535, + "step": 70690 + }, + { + "epoch": 3.142641241054363, + "grad_norm": 0.11176612973213196, + "learning_rate": 3.6501901191813125e-06, + "loss": 0.2522, + "step": 70700 + }, + { + "epoch": 3.143085744765969, + "grad_norm": 0.11959245800971985, + "learning_rate": 3.642441977794975e-06, + "loss": 0.2532, + "step": 70710 + }, + { + "epoch": 3.1435302484775747, + "grad_norm": 0.1249842569231987, + "learning_rate": 3.634701757647424e-06, + "loss": 0.2565, + "step": 70720 + }, + { + "epoch": 3.1439747521891808, + "grad_norm": 0.1331760585308075, + "learning_rate": 3.6269694600612468e-06, + "loss": 0.2553, + "step": 70730 + }, + { + "epoch": 3.144419255900787, + "grad_norm": 0.12042113393545151, + "learning_rate": 3.619245086357681e-06, + "loss": 0.2554, + "step": 70740 + }, + { + "epoch": 3.144863759612393, + "grad_norm": 0.6763691902160645, + "learning_rate": 3.611528637856615e-06, + "loss": 0.2519, + "step": 70750 + }, + { + "epoch": 3.1453082633239986, + "grad_norm": 0.12142838537693024, + "learning_rate": 3.6038201158765884e-06, + "loss": 0.2537, + "step": 70760 + }, + { + "epoch": 3.1457527670356047, + "grad_norm": 0.11228915303945541, + "learning_rate": 3.5961195217347534e-06, + "loss": 0.2543, + "step": 70770 + }, + { + "epoch": 3.146197270747211, + "grad_norm": 0.12563255429267883, + "learning_rate": 3.588426856746946e-06, + "loss": 0.2533, + "step": 70780 + }, + { + "epoch": 3.146641774458817, + "grad_norm": 0.1284250169992447, + "learning_rate": 3.5807421222276316e-06, + "loss": 0.2533, + "step": 70790 + }, + { + "epoch": 3.1470862781704225, + "grad_norm": 0.116474948823452, + "learning_rate": 3.5730653194899156e-06, + "loss": 0.2549, + "step": 70800 + }, + { + "epoch": 3.1475307818820286, + "grad_norm": 0.10577793419361115, + "learning_rate": 3.565396449845554e-06, + "loss": 0.2551, + "step": 70810 + }, + { + "epoch": 3.1479752855936347, + "grad_norm": 0.11761356890201569, + "learning_rate": 3.557735514604954e-06, + "loss": 0.254, + "step": 70820 + }, + { + "epoch": 3.148419789305241, + "grad_norm": 0.1430046707391739, + "learning_rate": 3.5500825150771633e-06, + "loss": 0.2542, + "step": 70830 + }, + { + "epoch": 3.148864293016847, + "grad_norm": 0.14412979781627655, + "learning_rate": 3.5424374525698466e-06, + "loss": 0.2552, + "step": 70840 + }, + { + "epoch": 3.1493087967284525, + "grad_norm": 0.12672002613544464, + "learning_rate": 3.5348003283893704e-06, + "loss": 0.2558, + "step": 70850 + }, + { + "epoch": 3.1497533004400586, + "grad_norm": 0.11944542825222015, + "learning_rate": 3.527171143840685e-06, + "loss": 0.2536, + "step": 70860 + }, + { + "epoch": 3.1501978041516647, + "grad_norm": 0.10939139872789383, + "learning_rate": 3.519549900227409e-06, + "loss": 0.2539, + "step": 70870 + }, + { + "epoch": 3.150642307863271, + "grad_norm": 0.14104437828063965, + "learning_rate": 3.511936598851828e-06, + "loss": 0.2543, + "step": 70880 + }, + { + "epoch": 3.1510868115748765, + "grad_norm": 0.12376759201288223, + "learning_rate": 3.504331241014813e-06, + "loss": 0.2541, + "step": 70890 + }, + { + "epoch": 3.1515313152864826, + "grad_norm": 0.12007924169301987, + "learning_rate": 3.4967338280159414e-06, + "loss": 0.2542, + "step": 70900 + }, + { + "epoch": 3.1519758189980887, + "grad_norm": 0.13189788162708282, + "learning_rate": 3.4891443611533846e-06, + "loss": 0.2528, + "step": 70910 + }, + { + "epoch": 3.1524203227096947, + "grad_norm": 0.1121068075299263, + "learning_rate": 3.481562841723984e-06, + "loss": 0.2535, + "step": 70920 + }, + { + "epoch": 3.1528648264213004, + "grad_norm": 0.11587785929441452, + "learning_rate": 3.473989271023215e-06, + "loss": 0.2543, + "step": 70930 + }, + { + "epoch": 3.1533093301329065, + "grad_norm": 0.12230861186981201, + "learning_rate": 3.4664236503451854e-06, + "loss": 0.2538, + "step": 70940 + }, + { + "epoch": 3.1537538338445126, + "grad_norm": 0.10898078233003616, + "learning_rate": 3.4588659809826674e-06, + "loss": 0.2561, + "step": 70950 + }, + { + "epoch": 3.1541983375561187, + "grad_norm": 0.12699241936206818, + "learning_rate": 3.451316264227039e-06, + "loss": 0.2533, + "step": 70960 + }, + { + "epoch": 3.1546428412677248, + "grad_norm": 0.14347797632217407, + "learning_rate": 3.4437745013683574e-06, + "loss": 0.2552, + "step": 70970 + }, + { + "epoch": 3.1550873449793304, + "grad_norm": 0.10240756720304489, + "learning_rate": 3.4362406936952916e-06, + "loss": 0.251, + "step": 70980 + }, + { + "epoch": 3.1555318486909365, + "grad_norm": 0.1239950880408287, + "learning_rate": 3.4287148424951786e-06, + "loss": 0.2525, + "step": 70990 + }, + { + "epoch": 3.1559763524025426, + "grad_norm": 0.10616132616996765, + "learning_rate": 3.4211969490539617e-06, + "loss": 0.2526, + "step": 71000 + }, + { + "epoch": 3.1564208561141487, + "grad_norm": 0.12539838254451752, + "learning_rate": 3.4136870146562584e-06, + "loss": 0.2547, + "step": 71010 + }, + { + "epoch": 3.156865359825755, + "grad_norm": 0.12680475413799286, + "learning_rate": 3.406185040585308e-06, + "loss": 0.2555, + "step": 71020 + }, + { + "epoch": 3.1573098635373604, + "grad_norm": 0.10059531778097153, + "learning_rate": 3.3986910281229966e-06, + "loss": 0.2546, + "step": 71030 + }, + { + "epoch": 3.1577543672489665, + "grad_norm": 0.10415945947170258, + "learning_rate": 3.391204978549828e-06, + "loss": 0.2548, + "step": 71040 + }, + { + "epoch": 3.1581988709605726, + "grad_norm": 0.13276007771492004, + "learning_rate": 3.3837268931449785e-06, + "loss": 0.2566, + "step": 71050 + }, + { + "epoch": 3.1586433746721787, + "grad_norm": 0.13303223252296448, + "learning_rate": 3.376256773186248e-06, + "loss": 0.2515, + "step": 71060 + }, + { + "epoch": 3.1590878783837844, + "grad_norm": 0.11491266638040543, + "learning_rate": 3.3687946199500664e-06, + "loss": 0.2545, + "step": 71070 + }, + { + "epoch": 3.1595323820953904, + "grad_norm": 0.12941761314868927, + "learning_rate": 3.3613404347115295e-06, + "loss": 0.254, + "step": 71080 + }, + { + "epoch": 3.1599768858069965, + "grad_norm": 0.13926859200000763, + "learning_rate": 3.35389421874433e-06, + "loss": 0.2569, + "step": 71090 + }, + { + "epoch": 3.1604213895186026, + "grad_norm": 0.13097792863845825, + "learning_rate": 3.3464559733208446e-06, + "loss": 0.2556, + "step": 71100 + }, + { + "epoch": 3.1608658932302083, + "grad_norm": 0.11278603971004486, + "learning_rate": 3.3390256997120505e-06, + "loss": 0.2547, + "step": 71110 + }, + { + "epoch": 3.1613103969418144, + "grad_norm": 0.12075934559106827, + "learning_rate": 3.331603399187583e-06, + "loss": 0.2577, + "step": 71120 + }, + { + "epoch": 3.1617549006534205, + "grad_norm": 0.1222538873553276, + "learning_rate": 3.324189073015721e-06, + "loss": 0.2525, + "step": 71130 + }, + { + "epoch": 3.1621994043650266, + "grad_norm": 0.11506839096546173, + "learning_rate": 3.3167827224633465e-06, + "loss": 0.254, + "step": 71140 + }, + { + "epoch": 3.1626439080766326, + "grad_norm": 0.11836103349924088, + "learning_rate": 3.309384348796024e-06, + "loss": 0.256, + "step": 71150 + }, + { + "epoch": 3.1630884117882383, + "grad_norm": 0.10300443321466446, + "learning_rate": 3.301993953277921e-06, + "loss": 0.253, + "step": 71160 + }, + { + "epoch": 3.1635329154998444, + "grad_norm": 0.1003442257642746, + "learning_rate": 3.29461153717186e-06, + "loss": 0.2517, + "step": 71170 + }, + { + "epoch": 3.1639774192114505, + "grad_norm": 0.11951043456792831, + "learning_rate": 3.287237101739293e-06, + "loss": 0.2536, + "step": 71180 + }, + { + "epoch": 3.1644219229230566, + "grad_norm": 0.1348366141319275, + "learning_rate": 3.2798706482403075e-06, + "loss": 0.2537, + "step": 71190 + }, + { + "epoch": 3.164866426634662, + "grad_norm": 0.11965237557888031, + "learning_rate": 3.2725121779336285e-06, + "loss": 0.2527, + "step": 71200 + }, + { + "epoch": 3.1653109303462683, + "grad_norm": 0.12924863398075104, + "learning_rate": 3.265161692076618e-06, + "loss": 0.2556, + "step": 71210 + }, + { + "epoch": 3.1657554340578744, + "grad_norm": 0.1328747570514679, + "learning_rate": 3.257819191925282e-06, + "loss": 0.2558, + "step": 71220 + }, + { + "epoch": 3.1661999377694805, + "grad_norm": 0.11092754453420639, + "learning_rate": 3.2504846787342392e-06, + "loss": 0.2526, + "step": 71230 + }, + { + "epoch": 3.166644441481086, + "grad_norm": 0.12903428077697754, + "learning_rate": 3.243158153756759e-06, + "loss": 0.2534, + "step": 71240 + }, + { + "epoch": 3.1670889451926922, + "grad_norm": 0.11598073691129684, + "learning_rate": 3.2358396182447514e-06, + "loss": 0.2532, + "step": 71250 + }, + { + "epoch": 3.1675334489042983, + "grad_norm": 0.13795827329158783, + "learning_rate": 3.2285290734487496e-06, + "loss": 0.2537, + "step": 71260 + }, + { + "epoch": 3.1679779526159044, + "grad_norm": 0.1092182844877243, + "learning_rate": 3.221226520617926e-06, + "loss": 0.2534, + "step": 71270 + }, + { + "epoch": 3.1684224563275105, + "grad_norm": 0.11343754827976227, + "learning_rate": 3.213931961000094e-06, + "loss": 0.2527, + "step": 71280 + }, + { + "epoch": 3.168866960039116, + "grad_norm": 0.13263541460037231, + "learning_rate": 3.2066453958416733e-06, + "loss": 0.2518, + "step": 71290 + }, + { + "epoch": 3.1693114637507223, + "grad_norm": 0.11213325709104538, + "learning_rate": 3.199366826387773e-06, + "loss": 0.2566, + "step": 71300 + }, + { + "epoch": 3.1697559674623283, + "grad_norm": 0.1277564913034439, + "learning_rate": 3.192096253882071e-06, + "loss": 0.2537, + "step": 71310 + }, + { + "epoch": 3.1702004711739344, + "grad_norm": 0.12113859504461288, + "learning_rate": 3.1848336795669177e-06, + "loss": 0.2555, + "step": 71320 + }, + { + "epoch": 3.17064497488554, + "grad_norm": 0.11644003540277481, + "learning_rate": 3.1775791046833035e-06, + "loss": 0.2554, + "step": 71330 + }, + { + "epoch": 3.171089478597146, + "grad_norm": 0.10767582058906555, + "learning_rate": 3.170332530470804e-06, + "loss": 0.251, + "step": 71340 + }, + { + "epoch": 3.1715339823087523, + "grad_norm": 0.11463995277881622, + "learning_rate": 3.1630939581677012e-06, + "loss": 0.2511, + "step": 71350 + }, + { + "epoch": 3.1719784860203584, + "grad_norm": 0.12235850095748901, + "learning_rate": 3.155863389010838e-06, + "loss": 0.2551, + "step": 71360 + }, + { + "epoch": 3.1724229897319645, + "grad_norm": 0.14589908719062805, + "learning_rate": 3.1486408242357323e-06, + "loss": 0.2542, + "step": 71370 + }, + { + "epoch": 3.17286749344357, + "grad_norm": 0.10526791214942932, + "learning_rate": 3.1414262650765248e-06, + "loss": 0.2525, + "step": 71380 + }, + { + "epoch": 3.173311997155176, + "grad_norm": 0.1252698302268982, + "learning_rate": 3.134219712765979e-06, + "loss": 0.2535, + "step": 71390 + }, + { + "epoch": 3.1737565008667823, + "grad_norm": 0.11405643075704575, + "learning_rate": 3.1270211685355044e-06, + "loss": 0.2533, + "step": 71400 + }, + { + "epoch": 3.1742010045783884, + "grad_norm": 0.10343662649393082, + "learning_rate": 3.1198306336151338e-06, + "loss": 0.2528, + "step": 71410 + }, + { + "epoch": 3.174645508289994, + "grad_norm": 0.12468758225440979, + "learning_rate": 3.11264810923354e-06, + "loss": 0.2527, + "step": 71420 + }, + { + "epoch": 3.1750900120016, + "grad_norm": 0.1054542139172554, + "learning_rate": 3.105473596618008e-06, + "loss": 0.2521, + "step": 71430 + }, + { + "epoch": 3.175534515713206, + "grad_norm": 0.1360396295785904, + "learning_rate": 3.0983070969944683e-06, + "loss": 0.2526, + "step": 71440 + }, + { + "epoch": 3.1759790194248123, + "grad_norm": 0.1053558886051178, + "learning_rate": 3.09114861158748e-06, + "loss": 0.2533, + "step": 71450 + }, + { + "epoch": 3.1764235231364184, + "grad_norm": 0.12190845608711243, + "learning_rate": 3.0839981416202314e-06, + "loss": 0.2516, + "step": 71460 + }, + { + "epoch": 3.176868026848024, + "grad_norm": 0.12048584222793579, + "learning_rate": 3.0768556883145507e-06, + "loss": 0.254, + "step": 71470 + }, + { + "epoch": 3.17731253055963, + "grad_norm": 0.10348131507635117, + "learning_rate": 3.0697212528908834e-06, + "loss": 0.256, + "step": 71480 + }, + { + "epoch": 3.1777570342712362, + "grad_norm": 0.12294737994670868, + "learning_rate": 3.0625948365683e-06, + "loss": 0.2551, + "step": 71490 + }, + { + "epoch": 3.1782015379828423, + "grad_norm": 0.10699911415576935, + "learning_rate": 3.055476440564531e-06, + "loss": 0.2533, + "step": 71500 + }, + { + "epoch": 3.178646041694448, + "grad_norm": 0.1310427039861679, + "learning_rate": 3.0483660660958924e-06, + "loss": 0.2505, + "step": 71510 + }, + { + "epoch": 3.179090545406054, + "grad_norm": 0.11608758568763733, + "learning_rate": 3.0412637143773624e-06, + "loss": 0.2549, + "step": 71520 + }, + { + "epoch": 3.17953504911766, + "grad_norm": 0.12219421565532684, + "learning_rate": 3.034169386622554e-06, + "loss": 0.2551, + "step": 71530 + }, + { + "epoch": 3.1799795528292663, + "grad_norm": 0.1182231605052948, + "learning_rate": 3.027083084043658e-06, + "loss": 0.2521, + "step": 71540 + }, + { + "epoch": 3.180424056540872, + "grad_norm": 0.11363919824361801, + "learning_rate": 3.0200048078515676e-06, + "loss": 0.2546, + "step": 71550 + }, + { + "epoch": 3.180868560252478, + "grad_norm": 0.13669320940971375, + "learning_rate": 3.012934559255742e-06, + "loss": 0.2524, + "step": 71560 + }, + { + "epoch": 3.181313063964084, + "grad_norm": 0.12524478137493134, + "learning_rate": 3.0058723394642994e-06, + "loss": 0.2553, + "step": 71570 + }, + { + "epoch": 3.18175756767569, + "grad_norm": 0.12255988270044327, + "learning_rate": 2.998818149683985e-06, + "loss": 0.2535, + "step": 71580 + }, + { + "epoch": 3.1822020713872963, + "grad_norm": 0.1195375919342041, + "learning_rate": 2.9917719911201627e-06, + "loss": 0.2516, + "step": 71590 + }, + { + "epoch": 3.182646575098902, + "grad_norm": 0.11331253498792648, + "learning_rate": 2.9847338649768352e-06, + "loss": 0.2547, + "step": 71600 + }, + { + "epoch": 3.183091078810508, + "grad_norm": 0.11705736815929413, + "learning_rate": 2.977703772456608e-06, + "loss": 0.2522, + "step": 71610 + }, + { + "epoch": 3.183535582522114, + "grad_norm": 0.11153371632099152, + "learning_rate": 2.9706817147607535e-06, + "loss": 0.2539, + "step": 71620 + }, + { + "epoch": 3.18398008623372, + "grad_norm": 0.10764383524656296, + "learning_rate": 2.9636676930891393e-06, + "loss": 0.2565, + "step": 71630 + }, + { + "epoch": 3.184424589945326, + "grad_norm": 0.11436019092798233, + "learning_rate": 2.9566617086402625e-06, + "loss": 0.254, + "step": 71640 + }, + { + "epoch": 3.184869093656932, + "grad_norm": 0.10446679592132568, + "learning_rate": 2.9496637626112655e-06, + "loss": 0.2538, + "step": 71650 + }, + { + "epoch": 3.185313597368538, + "grad_norm": 0.1227007582783699, + "learning_rate": 2.9426738561979027e-06, + "loss": 0.2532, + "step": 71660 + }, + { + "epoch": 3.185758101080144, + "grad_norm": 0.1036883220076561, + "learning_rate": 2.9356919905945524e-06, + "loss": 0.2522, + "step": 71670 + }, + { + "epoch": 3.18620260479175, + "grad_norm": 0.10558394342660904, + "learning_rate": 2.928718166994243e-06, + "loss": 0.252, + "step": 71680 + }, + { + "epoch": 3.186647108503356, + "grad_norm": 0.1204184964299202, + "learning_rate": 2.9217523865885833e-06, + "loss": 0.2545, + "step": 71690 + }, + { + "epoch": 3.187091612214962, + "grad_norm": 0.14524362981319427, + "learning_rate": 2.9147946505678537e-06, + "loss": 0.254, + "step": 71700 + }, + { + "epoch": 3.187536115926568, + "grad_norm": 0.13455632328987122, + "learning_rate": 2.9078449601209313e-06, + "loss": 0.2541, + "step": 71710 + }, + { + "epoch": 3.187980619638174, + "grad_norm": 0.13444851338863373, + "learning_rate": 2.900903316435333e-06, + "loss": 0.2536, + "step": 71720 + }, + { + "epoch": 3.18842512334978, + "grad_norm": 0.12046289443969727, + "learning_rate": 2.8939697206971983e-06, + "loss": 0.2546, + "step": 71730 + }, + { + "epoch": 3.188869627061386, + "grad_norm": 0.13731873035430908, + "learning_rate": 2.8870441740912746e-06, + "loss": 0.2545, + "step": 71740 + }, + { + "epoch": 3.189314130772992, + "grad_norm": 0.14554817974567413, + "learning_rate": 2.8801266778009762e-06, + "loss": 0.2534, + "step": 71750 + }, + { + "epoch": 3.189758634484598, + "grad_norm": 0.10654442012310028, + "learning_rate": 2.8732172330082798e-06, + "loss": 0.2541, + "step": 71760 + }, + { + "epoch": 3.190203138196204, + "grad_norm": 0.10612310469150543, + "learning_rate": 2.8663158408938517e-06, + "loss": 0.2522, + "step": 71770 + }, + { + "epoch": 3.19064764190781, + "grad_norm": 0.131936714053154, + "learning_rate": 2.859422502636938e-06, + "loss": 0.2525, + "step": 71780 + }, + { + "epoch": 3.191092145619416, + "grad_norm": 0.09976344555616379, + "learning_rate": 2.8525372194154076e-06, + "loss": 0.2536, + "step": 71790 + }, + { + "epoch": 3.191536649331022, + "grad_norm": 0.11363942921161652, + "learning_rate": 2.8456599924057913e-06, + "loss": 0.254, + "step": 71800 + }, + { + "epoch": 3.191981153042628, + "grad_norm": 0.10490661859512329, + "learning_rate": 2.8387908227831995e-06, + "loss": 0.253, + "step": 71810 + }, + { + "epoch": 3.1924256567542337, + "grad_norm": 0.13699719309806824, + "learning_rate": 2.831929711721404e-06, + "loss": 0.2555, + "step": 71820 + }, + { + "epoch": 3.19287016046584, + "grad_norm": 0.11449293792247772, + "learning_rate": 2.825076660392767e-06, + "loss": 0.2537, + "step": 71830 + }, + { + "epoch": 3.193314664177446, + "grad_norm": 0.11198017001152039, + "learning_rate": 2.8182316699682908e-06, + "loss": 0.2533, + "step": 71840 + }, + { + "epoch": 3.193759167889052, + "grad_norm": 0.11333394795656204, + "learning_rate": 2.811394741617601e-06, + "loss": 0.2562, + "step": 71850 + }, + { + "epoch": 3.1942036716006577, + "grad_norm": 0.12309335172176361, + "learning_rate": 2.8045658765089356e-06, + "loss": 0.253, + "step": 71860 + }, + { + "epoch": 3.1946481753122637, + "grad_norm": 0.12017951160669327, + "learning_rate": 2.7977450758091605e-06, + "loss": 0.2552, + "step": 71870 + }, + { + "epoch": 3.19509267902387, + "grad_norm": 0.119387187063694, + "learning_rate": 2.790932340683783e-06, + "loss": 0.2535, + "step": 71880 + }, + { + "epoch": 3.195537182735476, + "grad_norm": 0.11005876213312149, + "learning_rate": 2.7841276722968823e-06, + "loss": 0.2538, + "step": 71890 + }, + { + "epoch": 3.195981686447082, + "grad_norm": 0.10461889207363129, + "learning_rate": 2.7773310718112067e-06, + "loss": 0.2531, + "step": 71900 + }, + { + "epoch": 3.1964261901586877, + "grad_norm": 0.11056673526763916, + "learning_rate": 2.7705425403881102e-06, + "loss": 0.253, + "step": 71910 + }, + { + "epoch": 3.1968706938702938, + "grad_norm": 0.11167009174823761, + "learning_rate": 2.763762079187565e-06, + "loss": 0.2511, + "step": 71920 + }, + { + "epoch": 3.1973151975819, + "grad_norm": 0.10686136782169342, + "learning_rate": 2.756989689368178e-06, + "loss": 0.2536, + "step": 71930 + }, + { + "epoch": 3.197759701293506, + "grad_norm": 0.12525756657123566, + "learning_rate": 2.750225372087134e-06, + "loss": 0.254, + "step": 71940 + }, + { + "epoch": 3.1982042050051116, + "grad_norm": 0.10101905465126038, + "learning_rate": 2.7434691285003033e-06, + "loss": 0.2557, + "step": 71950 + }, + { + "epoch": 3.1986487087167177, + "grad_norm": 0.11628183722496033, + "learning_rate": 2.7367209597621178e-06, + "loss": 0.2532, + "step": 71960 + }, + { + "epoch": 3.199093212428324, + "grad_norm": 0.1165333017706871, + "learning_rate": 2.729980867025683e-06, + "loss": 0.2521, + "step": 71970 + }, + { + "epoch": 3.19953771613993, + "grad_norm": 0.12595748901367188, + "learning_rate": 2.7232488514426724e-06, + "loss": 0.2525, + "step": 71980 + }, + { + "epoch": 3.1999822198515355, + "grad_norm": 0.10543874651193619, + "learning_rate": 2.7165249141634097e-06, + "loss": 0.2581, + "step": 71990 + }, + { + "epoch": 3.2004267235631416, + "grad_norm": 0.10541494190692902, + "learning_rate": 2.709809056336837e-06, + "loss": 0.254, + "step": 72000 + }, + { + "epoch": 3.2008712272747477, + "grad_norm": 0.12028522044420242, + "learning_rate": 2.703101279110498e-06, + "loss": 0.2519, + "step": 72010 + }, + { + "epoch": 3.201315730986354, + "grad_norm": 0.11726956814527512, + "learning_rate": 2.696401583630587e-06, + "loss": 0.2497, + "step": 72020 + }, + { + "epoch": 3.20176023469796, + "grad_norm": 0.1196126639842987, + "learning_rate": 2.689709971041887e-06, + "loss": 0.2543, + "step": 72030 + }, + { + "epoch": 3.2022047384095655, + "grad_norm": 0.11664266884326935, + "learning_rate": 2.6830264424878126e-06, + "loss": 0.2524, + "step": 72040 + }, + { + "epoch": 3.2026492421211716, + "grad_norm": 0.11106055974960327, + "learning_rate": 2.676350999110394e-06, + "loss": 0.2509, + "step": 72050 + }, + { + "epoch": 3.2030937458327777, + "grad_norm": 0.1167106032371521, + "learning_rate": 2.6696836420502856e-06, + "loss": 0.2522, + "step": 72060 + }, + { + "epoch": 3.203538249544384, + "grad_norm": 0.09821639955043793, + "learning_rate": 2.663024372446765e-06, + "loss": 0.2519, + "step": 72070 + }, + { + "epoch": 3.20398275325599, + "grad_norm": 0.1306006908416748, + "learning_rate": 2.6563731914377055e-06, + "loss": 0.2547, + "step": 72080 + }, + { + "epoch": 3.2044272569675956, + "grad_norm": 0.11644207686185837, + "learning_rate": 2.6497301001596087e-06, + "loss": 0.2548, + "step": 72090 + }, + { + "epoch": 3.2048717606792017, + "grad_norm": 0.11623615771532059, + "learning_rate": 2.643095099747611e-06, + "loss": 0.2562, + "step": 72100 + }, + { + "epoch": 3.2053162643908077, + "grad_norm": 0.11300303786993027, + "learning_rate": 2.636468191335445e-06, + "loss": 0.2528, + "step": 72110 + }, + { + "epoch": 3.205760768102414, + "grad_norm": 0.10102412849664688, + "learning_rate": 2.629849376055471e-06, + "loss": 0.2524, + "step": 72120 + }, + { + "epoch": 3.2062052718140195, + "grad_norm": 0.11722298711538315, + "learning_rate": 2.6232386550386678e-06, + "loss": 0.2514, + "step": 72130 + }, + { + "epoch": 3.2066497755256256, + "grad_norm": 0.10274036973714828, + "learning_rate": 2.6166360294146097e-06, + "loss": 0.2547, + "step": 72140 + }, + { + "epoch": 3.2070942792372317, + "grad_norm": 0.10871770977973938, + "learning_rate": 2.6100415003115275e-06, + "loss": 0.2548, + "step": 72150 + }, + { + "epoch": 3.2075387829488378, + "grad_norm": 0.1021648645401001, + "learning_rate": 2.603455068856225e-06, + "loss": 0.2537, + "step": 72160 + }, + { + "epoch": 3.2079832866604434, + "grad_norm": 0.11953914910554886, + "learning_rate": 2.5968767361741584e-06, + "loss": 0.255, + "step": 72170 + }, + { + "epoch": 3.2084277903720495, + "grad_norm": 0.12215439230203629, + "learning_rate": 2.590306503389378e-06, + "loss": 0.2526, + "step": 72180 + }, + { + "epoch": 3.2088722940836556, + "grad_norm": 0.11580648273229599, + "learning_rate": 2.583744371624558e-06, + "loss": 0.255, + "step": 72190 + }, + { + "epoch": 3.2093167977952617, + "grad_norm": 0.11826343089342117, + "learning_rate": 2.57719034200099e-06, + "loss": 0.2545, + "step": 72200 + }, + { + "epoch": 3.209761301506868, + "grad_norm": 0.12015173584222794, + "learning_rate": 2.570644415638568e-06, + "loss": 0.2569, + "step": 72210 + }, + { + "epoch": 3.2102058052184734, + "grad_norm": 0.1047849953174591, + "learning_rate": 2.5641065936558295e-06, + "loss": 0.2539, + "step": 72220 + }, + { + "epoch": 3.2106503089300795, + "grad_norm": 0.1211632490158081, + "learning_rate": 2.557576877169898e-06, + "loss": 0.2542, + "step": 72230 + }, + { + "epoch": 3.2110948126416856, + "grad_norm": 0.12661074101924896, + "learning_rate": 2.5510552672965205e-06, + "loss": 0.2536, + "step": 72240 + }, + { + "epoch": 3.2115393163532917, + "grad_norm": 0.11598068475723267, + "learning_rate": 2.544541765150077e-06, + "loss": 0.254, + "step": 72250 + }, + { + "epoch": 3.2119838200648974, + "grad_norm": 0.1083156019449234, + "learning_rate": 2.5380363718435165e-06, + "loss": 0.253, + "step": 72260 + }, + { + "epoch": 3.2124283237765034, + "grad_norm": 0.10548500716686249, + "learning_rate": 2.5315390884884714e-06, + "loss": 0.2525, + "step": 72270 + }, + { + "epoch": 3.2128728274881095, + "grad_norm": 0.12900926172733307, + "learning_rate": 2.5250499161951214e-06, + "loss": 0.2537, + "step": 72280 + }, + { + "epoch": 3.2133173311997156, + "grad_norm": 0.10754180699586868, + "learning_rate": 2.518568856072301e-06, + "loss": 0.2547, + "step": 72290 + }, + { + "epoch": 3.2137618349113213, + "grad_norm": 0.14626100659370422, + "learning_rate": 2.512095909227441e-06, + "loss": 0.2558, + "step": 72300 + }, + { + "epoch": 3.2142063386229274, + "grad_norm": 0.12235689163208008, + "learning_rate": 2.50563107676659e-06, + "loss": 0.2547, + "step": 72310 + }, + { + "epoch": 3.2146508423345335, + "grad_norm": 0.10981988161802292, + "learning_rate": 2.4991743597944152e-06, + "loss": 0.2524, + "step": 72320 + }, + { + "epoch": 3.2150953460461396, + "grad_norm": 0.1188119500875473, + "learning_rate": 2.492725759414205e-06, + "loss": 0.2528, + "step": 72330 + }, + { + "epoch": 3.2155398497577456, + "grad_norm": 0.12136612832546234, + "learning_rate": 2.4862852767278234e-06, + "loss": 0.255, + "step": 72340 + }, + { + "epoch": 3.2159843534693513, + "grad_norm": 0.11379490792751312, + "learning_rate": 2.479852912835784e-06, + "loss": 0.2561, + "step": 72350 + }, + { + "epoch": 3.2164288571809574, + "grad_norm": 0.12350461632013321, + "learning_rate": 2.4734286688372075e-06, + "loss": 0.2541, + "step": 72360 + }, + { + "epoch": 3.2168733608925635, + "grad_norm": 0.1095104068517685, + "learning_rate": 2.467012545829811e-06, + "loss": 0.2516, + "step": 72370 + }, + { + "epoch": 3.2173178646041696, + "grad_norm": 0.10989875346422195, + "learning_rate": 2.460604544909945e-06, + "loss": 0.2534, + "step": 72380 + }, + { + "epoch": 3.2177623683157757, + "grad_norm": 0.12112440913915634, + "learning_rate": 2.454204667172555e-06, + "loss": 0.2519, + "step": 72390 + }, + { + "epoch": 3.2182068720273813, + "grad_norm": 0.09656178951263428, + "learning_rate": 2.447812913711217e-06, + "loss": 0.2529, + "step": 72400 + }, + { + "epoch": 3.2186513757389874, + "grad_norm": 0.15516681969165802, + "learning_rate": 2.44142928561808e-06, + "loss": 0.2531, + "step": 72410 + }, + { + "epoch": 3.2190958794505935, + "grad_norm": 0.11902325600385666, + "learning_rate": 2.435053783983965e-06, + "loss": 0.2513, + "step": 72420 + }, + { + "epoch": 3.2195403831621996, + "grad_norm": 0.10246681421995163, + "learning_rate": 2.4286864098982453e-06, + "loss": 0.2483, + "step": 72430 + }, + { + "epoch": 3.2199848868738052, + "grad_norm": 0.10960652679204941, + "learning_rate": 2.4223271644489397e-06, + "loss": 0.2551, + "step": 72440 + }, + { + "epoch": 3.2204293905854113, + "grad_norm": 0.12968765199184418, + "learning_rate": 2.415976048722679e-06, + "loss": 0.2522, + "step": 72450 + }, + { + "epoch": 3.2208738942970174, + "grad_norm": 0.13652636110782623, + "learning_rate": 2.4096330638046673e-06, + "loss": 0.2557, + "step": 72460 + }, + { + "epoch": 3.2213183980086235, + "grad_norm": 0.10415946692228317, + "learning_rate": 2.4032982107787816e-06, + "loss": 0.2524, + "step": 72470 + }, + { + "epoch": 3.221762901720229, + "grad_norm": 0.1324087679386139, + "learning_rate": 2.396971490727451e-06, + "loss": 0.255, + "step": 72480 + }, + { + "epoch": 3.2222074054318353, + "grad_norm": 0.11740428954362869, + "learning_rate": 2.3906529047317493e-06, + "loss": 0.2516, + "step": 72490 + }, + { + "epoch": 3.2226519091434414, + "grad_norm": 0.1314082145690918, + "learning_rate": 2.3843424538713465e-06, + "loss": 0.2575, + "step": 72500 + }, + { + "epoch": 3.2230964128550474, + "grad_norm": 0.11047051846981049, + "learning_rate": 2.3780401392245298e-06, + "loss": 0.2513, + "step": 72510 + }, + { + "epoch": 3.2235409165666535, + "grad_norm": 0.10099642723798752, + "learning_rate": 2.3717459618681883e-06, + "loss": 0.2532, + "step": 72520 + }, + { + "epoch": 3.223985420278259, + "grad_norm": 0.16256438195705414, + "learning_rate": 2.3654599228778274e-06, + "loss": 0.2531, + "step": 72530 + }, + { + "epoch": 3.2244299239898653, + "grad_norm": 0.13522587716579437, + "learning_rate": 2.3591820233275607e-06, + "loss": 0.254, + "step": 72540 + }, + { + "epoch": 3.2248744277014714, + "grad_norm": 0.13843336701393127, + "learning_rate": 2.3529122642901024e-06, + "loss": 0.2524, + "step": 72550 + }, + { + "epoch": 3.2253189314130775, + "grad_norm": 0.13597621023654938, + "learning_rate": 2.3466506468367845e-06, + "loss": 0.2538, + "step": 72560 + }, + { + "epoch": 3.225763435124683, + "grad_norm": 0.11935582756996155, + "learning_rate": 2.3403971720375506e-06, + "loss": 0.2558, + "step": 72570 + }, + { + "epoch": 3.226207938836289, + "grad_norm": 0.14473173022270203, + "learning_rate": 2.334151840960952e-06, + "loss": 0.2545, + "step": 72580 + }, + { + "epoch": 3.2266524425478953, + "grad_norm": 0.11721871048212051, + "learning_rate": 2.3279146546741347e-06, + "loss": 0.2521, + "step": 72590 + }, + { + "epoch": 3.2270969462595014, + "grad_norm": 0.10529728978872299, + "learning_rate": 2.32168561424288e-06, + "loss": 0.2563, + "step": 72600 + }, + { + "epoch": 3.227541449971107, + "grad_norm": 0.11329837888479233, + "learning_rate": 2.3154647207315307e-06, + "loss": 0.2546, + "step": 72610 + }, + { + "epoch": 3.227985953682713, + "grad_norm": 0.1196863129734993, + "learning_rate": 2.309251975203103e-06, + "loss": 0.2539, + "step": 72620 + }, + { + "epoch": 3.228430457394319, + "grad_norm": 0.09072627872228622, + "learning_rate": 2.303047378719159e-06, + "loss": 0.2512, + "step": 72630 + }, + { + "epoch": 3.2288749611059253, + "grad_norm": 0.12169235199689865, + "learning_rate": 2.2968509323399e-06, + "loss": 0.2549, + "step": 72640 + }, + { + "epoch": 3.2293194648175314, + "grad_norm": 0.1155213788151741, + "learning_rate": 2.290662637124147e-06, + "loss": 0.2536, + "step": 72650 + }, + { + "epoch": 3.229763968529137, + "grad_norm": 0.0987236499786377, + "learning_rate": 2.2844824941292807e-06, + "loss": 0.2537, + "step": 72660 + }, + { + "epoch": 3.230208472240743, + "grad_norm": 0.1307121366262436, + "learning_rate": 2.2783105044113406e-06, + "loss": 0.2524, + "step": 72670 + }, + { + "epoch": 3.2306529759523492, + "grad_norm": 0.13026322424411774, + "learning_rate": 2.272146669024944e-06, + "loss": 0.2532, + "step": 72680 + }, + { + "epoch": 3.2310974796639553, + "grad_norm": 0.10053199529647827, + "learning_rate": 2.26599098902332e-06, + "loss": 0.2534, + "step": 72690 + }, + { + "epoch": 3.231541983375561, + "grad_norm": 0.13063132762908936, + "learning_rate": 2.2598434654583113e-06, + "loss": 0.2545, + "step": 72700 + }, + { + "epoch": 3.231986487087167, + "grad_norm": 0.11055922508239746, + "learning_rate": 2.253704099380355e-06, + "loss": 0.2532, + "step": 72710 + }, + { + "epoch": 3.232430990798773, + "grad_norm": 0.11539547145366669, + "learning_rate": 2.247572891838512e-06, + "loss": 0.254, + "step": 72720 + }, + { + "epoch": 3.2328754945103793, + "grad_norm": 0.10818605870008469, + "learning_rate": 2.241449843880422e-06, + "loss": 0.2501, + "step": 72730 + }, + { + "epoch": 3.2333199982219853, + "grad_norm": 0.12832504510879517, + "learning_rate": 2.235334956552354e-06, + "loss": 0.2541, + "step": 72740 + }, + { + "epoch": 3.233764501933591, + "grad_norm": 0.10961310565471649, + "learning_rate": 2.2292282308991775e-06, + "loss": 0.254, + "step": 72750 + }, + { + "epoch": 3.234209005645197, + "grad_norm": 0.12599368393421173, + "learning_rate": 2.223129667964363e-06, + "loss": 0.2564, + "step": 72760 + }, + { + "epoch": 3.234653509356803, + "grad_norm": 0.11963877826929092, + "learning_rate": 2.2170392687899834e-06, + "loss": 0.255, + "step": 72770 + }, + { + "epoch": 3.2350980130684093, + "grad_norm": 0.10368388146162033, + "learning_rate": 2.210957034416733e-06, + "loss": 0.2569, + "step": 72780 + }, + { + "epoch": 3.235542516780015, + "grad_norm": 0.11836168169975281, + "learning_rate": 2.2048829658838867e-06, + "loss": 0.2525, + "step": 72790 + }, + { + "epoch": 3.235987020491621, + "grad_norm": 0.13431327044963837, + "learning_rate": 2.1988170642293525e-06, + "loss": 0.2557, + "step": 72800 + }, + { + "epoch": 3.236431524203227, + "grad_norm": 0.10714791715145111, + "learning_rate": 2.1927593304896075e-06, + "loss": 0.254, + "step": 72810 + }, + { + "epoch": 3.236876027914833, + "grad_norm": 0.11356789618730545, + "learning_rate": 2.1867097656997626e-06, + "loss": 0.25, + "step": 72820 + }, + { + "epoch": 3.2373205316264393, + "grad_norm": 0.10980979353189468, + "learning_rate": 2.180668370893524e-06, + "loss": 0.2532, + "step": 72830 + }, + { + "epoch": 3.237765035338045, + "grad_norm": 0.126657634973526, + "learning_rate": 2.174635147103199e-06, + "loss": 0.2516, + "step": 72840 + }, + { + "epoch": 3.238209539049651, + "grad_norm": 0.10882432758808136, + "learning_rate": 2.1686100953597075e-06, + "loss": 0.2536, + "step": 72850 + }, + { + "epoch": 3.238654042761257, + "grad_norm": 0.09116330742835999, + "learning_rate": 2.1625932166925433e-06, + "loss": 0.2543, + "step": 72860 + }, + { + "epoch": 3.239098546472863, + "grad_norm": 0.1028183326125145, + "learning_rate": 2.1565845121298556e-06, + "loss": 0.2537, + "step": 72870 + }, + { + "epoch": 3.239543050184469, + "grad_norm": 0.13062746822834015, + "learning_rate": 2.150583982698351e-06, + "loss": 0.2545, + "step": 72880 + }, + { + "epoch": 3.239987553896075, + "grad_norm": 0.12309830635786057, + "learning_rate": 2.144591629423359e-06, + "loss": 0.2524, + "step": 72890 + }, + { + "epoch": 3.240432057607681, + "grad_norm": 0.11044788360595703, + "learning_rate": 2.138607453328817e-06, + "loss": 0.2543, + "step": 72900 + }, + { + "epoch": 3.240876561319287, + "grad_norm": 0.10852076858282089, + "learning_rate": 2.132631455437234e-06, + "loss": 0.2532, + "step": 72910 + }, + { + "epoch": 3.241321065030893, + "grad_norm": 0.12122233211994171, + "learning_rate": 2.1266636367697714e-06, + "loss": 0.2548, + "step": 72920 + }, + { + "epoch": 3.241765568742499, + "grad_norm": 0.13202127814292908, + "learning_rate": 2.120703998346152e-06, + "loss": 0.2533, + "step": 72930 + }, + { + "epoch": 3.242210072454105, + "grad_norm": 0.11751238256692886, + "learning_rate": 2.1147525411847114e-06, + "loss": 0.2508, + "step": 72940 + }, + { + "epoch": 3.242654576165711, + "grad_norm": 0.11616389453411102, + "learning_rate": 2.108809266302403e-06, + "loss": 0.2564, + "step": 72950 + }, + { + "epoch": 3.243099079877317, + "grad_norm": 0.10413708537817001, + "learning_rate": 2.102874174714764e-06, + "loss": 0.2513, + "step": 72960 + }, + { + "epoch": 3.243543583588923, + "grad_norm": 0.12464520335197449, + "learning_rate": 2.0969472674359393e-06, + "loss": 0.2532, + "step": 72970 + }, + { + "epoch": 3.243988087300529, + "grad_norm": 0.10488101094961166, + "learning_rate": 2.091028545478668e-06, + "loss": 0.2539, + "step": 72980 + }, + { + "epoch": 3.244432591012135, + "grad_norm": 0.10651035606861115, + "learning_rate": 2.08511800985432e-06, + "loss": 0.251, + "step": 72990 + }, + { + "epoch": 3.244877094723741, + "grad_norm": 0.11943750083446503, + "learning_rate": 2.07921566157282e-06, + "loss": 0.253, + "step": 73000 + }, + { + "epoch": 3.2453215984353467, + "grad_norm": 0.11045433580875397, + "learning_rate": 2.073321501642728e-06, + "loss": 0.2521, + "step": 73010 + }, + { + "epoch": 3.245766102146953, + "grad_norm": 0.12808416783809662, + "learning_rate": 2.0674355310711937e-06, + "loss": 0.2495, + "step": 73020 + }, + { + "epoch": 3.246210605858559, + "grad_norm": 0.1198795959353447, + "learning_rate": 2.0615577508639682e-06, + "loss": 0.2536, + "step": 73030 + }, + { + "epoch": 3.246655109570165, + "grad_norm": 0.14651361107826233, + "learning_rate": 2.055688162025399e-06, + "loss": 0.2535, + "step": 73040 + }, + { + "epoch": 3.247099613281771, + "grad_norm": 0.10839033871889114, + "learning_rate": 2.0498267655584546e-06, + "loss": 0.2521, + "step": 73050 + }, + { + "epoch": 3.2475441169933768, + "grad_norm": 0.10816188156604767, + "learning_rate": 2.0439735624646626e-06, + "loss": 0.2549, + "step": 73060 + }, + { + "epoch": 3.247988620704983, + "grad_norm": 0.1064918041229248, + "learning_rate": 2.0381285537442008e-06, + "loss": 0.2543, + "step": 73070 + }, + { + "epoch": 3.248433124416589, + "grad_norm": 0.13038721680641174, + "learning_rate": 2.032291740395803e-06, + "loss": 0.252, + "step": 73080 + }, + { + "epoch": 3.248877628128195, + "grad_norm": 0.11850755661725998, + "learning_rate": 2.0264631234168276e-06, + "loss": 0.2552, + "step": 73090 + }, + { + "epoch": 3.2493221318398007, + "grad_norm": 0.11780934035778046, + "learning_rate": 2.0206427038032326e-06, + "loss": 0.2535, + "step": 73100 + }, + { + "epoch": 3.2497666355514068, + "grad_norm": 0.11874327808618546, + "learning_rate": 2.0148304825495457e-06, + "loss": 0.2514, + "step": 73110 + }, + { + "epoch": 3.250211139263013, + "grad_norm": 0.1069134771823883, + "learning_rate": 2.0090264606489496e-06, + "loss": 0.2534, + "step": 73120 + }, + { + "epoch": 3.250655642974619, + "grad_norm": 0.09992261230945587, + "learning_rate": 2.003230639093162e-06, + "loss": 0.2543, + "step": 73130 + }, + { + "epoch": 3.251100146686225, + "grad_norm": 0.1224932074546814, + "learning_rate": 1.9974430188725525e-06, + "loss": 0.2525, + "step": 73140 + }, + { + "epoch": 3.2515446503978307, + "grad_norm": 0.12171421945095062, + "learning_rate": 1.9916636009760513e-06, + "loss": 0.2536, + "step": 73150 + }, + { + "epoch": 3.251989154109437, + "grad_norm": 0.1192706748843193, + "learning_rate": 1.985892386391208e-06, + "loss": 0.2559, + "step": 73160 + }, + { + "epoch": 3.252433657821043, + "grad_norm": 0.09738549590110779, + "learning_rate": 1.980129376104173e-06, + "loss": 0.2535, + "step": 73170 + }, + { + "epoch": 3.252878161532649, + "grad_norm": 0.11181453615427017, + "learning_rate": 1.9743745710996796e-06, + "loss": 0.2517, + "step": 73180 + }, + { + "epoch": 3.2533226652442546, + "grad_norm": 0.09694001823663712, + "learning_rate": 1.9686279723610757e-06, + "loss": 0.2533, + "step": 73190 + }, + { + "epoch": 3.2537671689558607, + "grad_norm": 0.1048237606883049, + "learning_rate": 1.962889580870281e-06, + "loss": 0.2532, + "step": 73200 + }, + { + "epoch": 3.254211672667467, + "grad_norm": 0.11108321696519852, + "learning_rate": 1.9571593976078385e-06, + "loss": 0.254, + "step": 73210 + }, + { + "epoch": 3.254656176379073, + "grad_norm": 0.09439919888973236, + "learning_rate": 1.9514374235528876e-06, + "loss": 0.2523, + "step": 73220 + }, + { + "epoch": 3.2551006800906785, + "grad_norm": 0.11257009953260422, + "learning_rate": 1.9457236596831408e-06, + "loss": 0.2568, + "step": 73230 + }, + { + "epoch": 3.2555451838022846, + "grad_norm": 0.12318666279315948, + "learning_rate": 1.940018106974939e-06, + "loss": 0.2551, + "step": 73240 + }, + { + "epoch": 3.2559896875138907, + "grad_norm": 0.10925205051898956, + "learning_rate": 1.9343207664032025e-06, + "loss": 0.2541, + "step": 73250 + }, + { + "epoch": 3.256434191225497, + "grad_norm": 0.13262823224067688, + "learning_rate": 1.928631638941436e-06, + "loss": 0.2532, + "step": 73260 + }, + { + "epoch": 3.256878694937103, + "grad_norm": 0.10451443493366241, + "learning_rate": 1.922950725561784e-06, + "loss": 0.2568, + "step": 73270 + }, + { + "epoch": 3.2573231986487086, + "grad_norm": 0.12963274121284485, + "learning_rate": 1.9172780272349312e-06, + "loss": 0.2557, + "step": 73280 + }, + { + "epoch": 3.2577677023603147, + "grad_norm": 0.11789838969707489, + "learning_rate": 1.911613544930202e-06, + "loss": 0.2529, + "step": 73290 + }, + { + "epoch": 3.2582122060719207, + "grad_norm": 0.12160918861627579, + "learning_rate": 1.9059572796155e-06, + "loss": 0.253, + "step": 73300 + }, + { + "epoch": 3.258656709783527, + "grad_norm": 0.1257299929857254, + "learning_rate": 1.9003092322573192e-06, + "loss": 0.2511, + "step": 73310 + }, + { + "epoch": 3.259101213495133, + "grad_norm": 0.12371662259101868, + "learning_rate": 1.8946694038207647e-06, + "loss": 0.2552, + "step": 73320 + }, + { + "epoch": 3.2595457172067386, + "grad_norm": 0.12411262094974518, + "learning_rate": 1.8890377952695215e-06, + "loss": 0.253, + "step": 73330 + }, + { + "epoch": 3.2599902209183447, + "grad_norm": 0.1081814244389534, + "learning_rate": 1.8834144075658865e-06, + "loss": 0.2543, + "step": 73340 + }, + { + "epoch": 3.2604347246299508, + "grad_norm": 0.11022903770208359, + "learning_rate": 1.8777992416707357e-06, + "loss": 0.2543, + "step": 73350 + }, + { + "epoch": 3.2608792283415564, + "grad_norm": 0.12787394225597382, + "learning_rate": 1.8721922985435458e-06, + "loss": 0.2521, + "step": 73360 + }, + { + "epoch": 3.2613237320531625, + "grad_norm": 0.10648510605096817, + "learning_rate": 1.8665935791424006e-06, + "loss": 0.2518, + "step": 73370 + }, + { + "epoch": 3.2617682357647686, + "grad_norm": 0.11541634052991867, + "learning_rate": 1.8610030844239512e-06, + "loss": 0.2499, + "step": 73380 + }, + { + "epoch": 3.2622127394763747, + "grad_norm": 0.09918425232172012, + "learning_rate": 1.8554208153434838e-06, + "loss": 0.2525, + "step": 73390 + }, + { + "epoch": 3.262657243187981, + "grad_norm": 0.12489426136016846, + "learning_rate": 1.8498467728548296e-06, + "loss": 0.2544, + "step": 73400 + }, + { + "epoch": 3.2631017468995864, + "grad_norm": 0.10602641850709915, + "learning_rate": 1.8442809579104547e-06, + "loss": 0.2525, + "step": 73410 + }, + { + "epoch": 3.2635462506111925, + "grad_norm": 0.10974564403295517, + "learning_rate": 1.8387233714614039e-06, + "loss": 0.2543, + "step": 73420 + }, + { + "epoch": 3.2639907543227986, + "grad_norm": 0.10958171635866165, + "learning_rate": 1.8331740144573174e-06, + "loss": 0.2551, + "step": 73430 + }, + { + "epoch": 3.2644352580344047, + "grad_norm": 0.10895751416683197, + "learning_rate": 1.8276328878464255e-06, + "loss": 0.2533, + "step": 73440 + }, + { + "epoch": 3.264879761746011, + "grad_norm": 0.10284784436225891, + "learning_rate": 1.82209999257556e-06, + "loss": 0.2541, + "step": 73450 + }, + { + "epoch": 3.2653242654576164, + "grad_norm": 0.1112259104847908, + "learning_rate": 1.8165753295901311e-06, + "loss": 0.2521, + "step": 73460 + }, + { + "epoch": 3.2657687691692225, + "grad_norm": 0.11563394218683243, + "learning_rate": 1.8110588998341616e-06, + "loss": 0.2543, + "step": 73470 + }, + { + "epoch": 3.2662132728808286, + "grad_norm": 0.12162502855062485, + "learning_rate": 1.805550704250253e-06, + "loss": 0.252, + "step": 73480 + }, + { + "epoch": 3.2666577765924347, + "grad_norm": 0.10788556933403015, + "learning_rate": 1.8000507437796077e-06, + "loss": 0.2531, + "step": 73490 + }, + { + "epoch": 3.2671022803040404, + "grad_norm": 0.11588045209646225, + "learning_rate": 1.7945590193620242e-06, + "loss": 0.2535, + "step": 73500 + }, + { + "epoch": 3.2675467840156465, + "grad_norm": 0.11763675510883331, + "learning_rate": 1.7890755319358742e-06, + "loss": 0.2512, + "step": 73510 + }, + { + "epoch": 3.2679912877272526, + "grad_norm": 0.09715669602155685, + "learning_rate": 1.7836002824381525e-06, + "loss": 0.2537, + "step": 73520 + }, + { + "epoch": 3.2684357914388587, + "grad_norm": 0.12952689826488495, + "learning_rate": 1.7781332718044165e-06, + "loss": 0.2552, + "step": 73530 + }, + { + "epoch": 3.2688802951504643, + "grad_norm": 0.127903550863266, + "learning_rate": 1.772674500968835e-06, + "loss": 0.2542, + "step": 73540 + }, + { + "epoch": 3.2693247988620704, + "grad_norm": 0.11643896996974945, + "learning_rate": 1.7672239708641624e-06, + "loss": 0.2526, + "step": 73550 + }, + { + "epoch": 3.2697693025736765, + "grad_norm": 0.11011640727519989, + "learning_rate": 1.7617816824217371e-06, + "loss": 0.2528, + "step": 73560 + }, + { + "epoch": 3.2702138062852826, + "grad_norm": 0.09892547130584717, + "learning_rate": 1.7563476365715148e-06, + "loss": 0.2565, + "step": 73570 + }, + { + "epoch": 3.2706583099968887, + "grad_norm": 0.11970457434654236, + "learning_rate": 1.750921834242003e-06, + "loss": 0.2529, + "step": 73580 + }, + { + "epoch": 3.2711028137084943, + "grad_norm": 0.09854160249233246, + "learning_rate": 1.7455042763603436e-06, + "loss": 0.2548, + "step": 73590 + }, + { + "epoch": 3.2715473174201004, + "grad_norm": 0.10051064193248749, + "learning_rate": 1.7400949638522345e-06, + "loss": 0.2514, + "step": 73600 + }, + { + "epoch": 3.2719918211317065, + "grad_norm": 0.12193713337182999, + "learning_rate": 1.734693897641987e-06, + "loss": 0.2531, + "step": 73610 + }, + { + "epoch": 3.2724363248433126, + "grad_norm": 0.12631124258041382, + "learning_rate": 1.7293010786524955e-06, + "loss": 0.2549, + "step": 73620 + }, + { + "epoch": 3.2728808285549182, + "grad_norm": 0.11067657917737961, + "learning_rate": 1.72391650780524e-06, + "loss": 0.2524, + "step": 73630 + }, + { + "epoch": 3.2733253322665243, + "grad_norm": 0.11938219517469406, + "learning_rate": 1.718540186020301e-06, + "loss": 0.255, + "step": 73640 + }, + { + "epoch": 3.2737698359781304, + "grad_norm": 0.1118733286857605, + "learning_rate": 1.7131721142163437e-06, + "loss": 0.2509, + "step": 73650 + }, + { + "epoch": 3.2742143396897365, + "grad_norm": 0.1147545576095581, + "learning_rate": 1.7078122933106233e-06, + "loss": 0.2512, + "step": 73660 + }, + { + "epoch": 3.274658843401342, + "grad_norm": 0.1002689078450203, + "learning_rate": 1.7024607242189905e-06, + "loss": 0.2519, + "step": 73670 + }, + { + "epoch": 3.2751033471129483, + "grad_norm": 0.11568443477153778, + "learning_rate": 1.6971174078558749e-06, + "loss": 0.2533, + "step": 73680 + }, + { + "epoch": 3.2755478508245544, + "grad_norm": 0.1300036758184433, + "learning_rate": 1.6917823451343073e-06, + "loss": 0.2533, + "step": 73690 + }, + { + "epoch": 3.2759923545361604, + "grad_norm": 0.11067961901426315, + "learning_rate": 1.6864555369659142e-06, + "loss": 0.2536, + "step": 73700 + }, + { + "epoch": 3.2764368582477665, + "grad_norm": 0.11131244897842407, + "learning_rate": 1.681136984260878e-06, + "loss": 0.2543, + "step": 73710 + }, + { + "epoch": 3.276881361959372, + "grad_norm": 0.1226600930094719, + "learning_rate": 1.6758266879280172e-06, + "loss": 0.253, + "step": 73720 + }, + { + "epoch": 3.2773258656709783, + "grad_norm": 0.1179950088262558, + "learning_rate": 1.6705246488746996e-06, + "loss": 0.2531, + "step": 73730 + }, + { + "epoch": 3.2777703693825844, + "grad_norm": 0.16116195917129517, + "learning_rate": 1.6652308680069062e-06, + "loss": 0.2552, + "step": 73740 + }, + { + "epoch": 3.2782148730941905, + "grad_norm": 0.12077091634273529, + "learning_rate": 1.6599453462292081e-06, + "loss": 0.2529, + "step": 73750 + }, + { + "epoch": 3.2786593768057966, + "grad_norm": 0.11210287362337112, + "learning_rate": 1.6546680844447326e-06, + "loss": 0.2536, + "step": 73760 + }, + { + "epoch": 3.279103880517402, + "grad_norm": 0.10542245209217072, + "learning_rate": 1.6493990835552475e-06, + "loss": 0.2546, + "step": 73770 + }, + { + "epoch": 3.2795483842290083, + "grad_norm": 0.10948027670383453, + "learning_rate": 1.6441383444610547e-06, + "loss": 0.2542, + "step": 73780 + }, + { + "epoch": 3.2799928879406144, + "grad_norm": 0.10692090541124344, + "learning_rate": 1.6388858680610908e-06, + "loss": 0.2549, + "step": 73790 + }, + { + "epoch": 3.2804373916522205, + "grad_norm": 0.1444368213415146, + "learning_rate": 1.6336416552528543e-06, + "loss": 0.2547, + "step": 73800 + }, + { + "epoch": 3.280881895363826, + "grad_norm": 0.11502568423748016, + "learning_rate": 1.6284057069324288e-06, + "loss": 0.254, + "step": 73810 + }, + { + "epoch": 3.281326399075432, + "grad_norm": 0.1048140674829483, + "learning_rate": 1.623178023994504e-06, + "loss": 0.252, + "step": 73820 + }, + { + "epoch": 3.2817709027870383, + "grad_norm": 0.1332840770483017, + "learning_rate": 1.6179586073323483e-06, + "loss": 0.2552, + "step": 73830 + }, + { + "epoch": 3.2822154064986444, + "grad_norm": 0.1082984209060669, + "learning_rate": 1.612747457837821e-06, + "loss": 0.2545, + "step": 73840 + }, + { + "epoch": 3.28265991021025, + "grad_norm": 0.11445146799087524, + "learning_rate": 1.607544576401354e-06, + "loss": 0.2535, + "step": 73850 + }, + { + "epoch": 3.283104413921856, + "grad_norm": 0.10911338776350021, + "learning_rate": 1.6023499639119754e-06, + "loss": 0.2518, + "step": 73860 + }, + { + "epoch": 3.2835489176334622, + "grad_norm": 0.12462568283081055, + "learning_rate": 1.5971636212573138e-06, + "loss": 0.2529, + "step": 73870 + }, + { + "epoch": 3.2839934213450683, + "grad_norm": 0.12682214379310608, + "learning_rate": 1.5919855493235714e-06, + "loss": 0.2528, + "step": 73880 + }, + { + "epoch": 3.2844379250566744, + "grad_norm": 0.11276566237211227, + "learning_rate": 1.586815748995535e-06, + "loss": 0.2545, + "step": 73890 + }, + { + "epoch": 3.28488242876828, + "grad_norm": 0.11858906596899033, + "learning_rate": 1.5816542211565866e-06, + "loss": 0.2512, + "step": 73900 + }, + { + "epoch": 3.285326932479886, + "grad_norm": 0.1179419606924057, + "learning_rate": 1.5765009666886765e-06, + "loss": 0.2541, + "step": 73910 + }, + { + "epoch": 3.2857714361914923, + "grad_norm": 0.12330885976552963, + "learning_rate": 1.5713559864723781e-06, + "loss": 0.2551, + "step": 73920 + }, + { + "epoch": 3.2862159399030983, + "grad_norm": 0.12485255300998688, + "learning_rate": 1.5662192813868104e-06, + "loss": 0.2529, + "step": 73930 + }, + { + "epoch": 3.286660443614704, + "grad_norm": 0.11245708912611008, + "learning_rate": 1.561090852309699e-06, + "loss": 0.2535, + "step": 73940 + }, + { + "epoch": 3.28710494732631, + "grad_norm": 0.17175787687301636, + "learning_rate": 1.5559707001173651e-06, + "loss": 0.2548, + "step": 73950 + }, + { + "epoch": 3.287549451037916, + "grad_norm": 0.11366959661245346, + "learning_rate": 1.5508588256846757e-06, + "loss": 0.2528, + "step": 73960 + }, + { + "epoch": 3.2879939547495223, + "grad_norm": 0.12336679548025131, + "learning_rate": 1.5457552298851319e-06, + "loss": 0.2502, + "step": 73970 + }, + { + "epoch": 3.288438458461128, + "grad_norm": 0.09548065811395645, + "learning_rate": 1.5406599135907918e-06, + "loss": 0.2532, + "step": 73980 + }, + { + "epoch": 3.288882962172734, + "grad_norm": 0.10309015214443207, + "learning_rate": 1.5355728776723088e-06, + "loss": 0.2522, + "step": 73990 + }, + { + "epoch": 3.28932746588434, + "grad_norm": 0.12444614619016647, + "learning_rate": 1.5304941229989155e-06, + "loss": 0.2535, + "step": 74000 + }, + { + "epoch": 3.289771969595946, + "grad_norm": 0.12402177602052689, + "learning_rate": 1.5254236504384345e-06, + "loss": 0.2533, + "step": 74010 + }, + { + "epoch": 3.2902164733075523, + "grad_norm": 0.10099875181913376, + "learning_rate": 1.5203614608572726e-06, + "loss": 0.2549, + "step": 74020 + }, + { + "epoch": 3.290660977019158, + "grad_norm": 0.11711933463811874, + "learning_rate": 1.5153075551204044e-06, + "loss": 0.2523, + "step": 74030 + }, + { + "epoch": 3.291105480730764, + "grad_norm": 0.11410145461559296, + "learning_rate": 1.5102619340914225e-06, + "loss": 0.2509, + "step": 74040 + }, + { + "epoch": 3.29154998444237, + "grad_norm": 0.11913321912288666, + "learning_rate": 1.505224598632482e-06, + "loss": 0.2517, + "step": 74050 + }, + { + "epoch": 3.291994488153976, + "grad_norm": 0.11474928259849548, + "learning_rate": 1.5001955496043164e-06, + "loss": 0.254, + "step": 74060 + }, + { + "epoch": 3.2924389918655823, + "grad_norm": 0.14020508527755737, + "learning_rate": 1.4951747878662604e-06, + "loss": 0.2541, + "step": 74070 + }, + { + "epoch": 3.292883495577188, + "grad_norm": 0.12612496316432953, + "learning_rate": 1.4901623142762221e-06, + "loss": 0.2584, + "step": 74080 + }, + { + "epoch": 3.293327999288794, + "grad_norm": 0.11214447021484375, + "learning_rate": 1.4851581296907001e-06, + "loss": 0.2543, + "step": 74090 + }, + { + "epoch": 3.2937725030004, + "grad_norm": 0.10783911496400833, + "learning_rate": 1.4801622349647714e-06, + "loss": 0.2515, + "step": 74100 + }, + { + "epoch": 3.2942170067120062, + "grad_norm": 0.11297532171010971, + "learning_rate": 1.4751746309520976e-06, + "loss": 0.254, + "step": 74110 + }, + { + "epoch": 3.294661510423612, + "grad_norm": 0.15114976465702057, + "learning_rate": 1.4701953185049132e-06, + "loss": 0.2537, + "step": 74120 + }, + { + "epoch": 3.295106014135218, + "grad_norm": 0.11205513775348663, + "learning_rate": 1.4652242984740661e-06, + "loss": 0.2527, + "step": 74130 + }, + { + "epoch": 3.295550517846824, + "grad_norm": 0.12497445195913315, + "learning_rate": 1.4602615717089484e-06, + "loss": 0.2527, + "step": 74140 + }, + { + "epoch": 3.29599502155843, + "grad_norm": 0.09892677515745163, + "learning_rate": 1.4553071390575657e-06, + "loss": 0.2533, + "step": 74150 + }, + { + "epoch": 3.296439525270036, + "grad_norm": 0.12221455574035645, + "learning_rate": 1.450361001366496e-06, + "loss": 0.2524, + "step": 74160 + }, + { + "epoch": 3.296884028981642, + "grad_norm": 0.11597615480422974, + "learning_rate": 1.4454231594809021e-06, + "loss": 0.2511, + "step": 74170 + }, + { + "epoch": 3.297328532693248, + "grad_norm": 0.1278439611196518, + "learning_rate": 1.4404936142445036e-06, + "loss": 0.2536, + "step": 74180 + }, + { + "epoch": 3.297773036404854, + "grad_norm": 0.10439930856227875, + "learning_rate": 1.4355723664996546e-06, + "loss": 0.2522, + "step": 74190 + }, + { + "epoch": 3.29821754011646, + "grad_norm": 0.11083025485277176, + "learning_rate": 1.4306594170872433e-06, + "loss": 0.255, + "step": 74200 + }, + { + "epoch": 3.298662043828066, + "grad_norm": 0.12335475534200668, + "learning_rate": 1.4257547668467598e-06, + "loss": 0.252, + "step": 74210 + }, + { + "epoch": 3.299106547539672, + "grad_norm": 0.13165685534477234, + "learning_rate": 1.4208584166162886e-06, + "loss": 0.2549, + "step": 74220 + }, + { + "epoch": 3.299551051251278, + "grad_norm": 0.1215907409787178, + "learning_rate": 1.4159703672324553e-06, + "loss": 0.2539, + "step": 74230 + }, + { + "epoch": 3.299995554962884, + "grad_norm": 0.12225715070962906, + "learning_rate": 1.4110906195305252e-06, + "loss": 0.2531, + "step": 74240 + }, + { + "epoch": 3.3004400586744898, + "grad_norm": 0.09131564944982529, + "learning_rate": 1.406219174344292e-06, + "loss": 0.2553, + "step": 74250 + }, + { + "epoch": 3.300884562386096, + "grad_norm": 0.12733030319213867, + "learning_rate": 1.401356032506157e-06, + "loss": 0.2533, + "step": 74260 + }, + { + "epoch": 3.301329066097702, + "grad_norm": 0.10959919542074203, + "learning_rate": 1.3965011948471051e-06, + "loss": 0.2533, + "step": 74270 + }, + { + "epoch": 3.301773569809308, + "grad_norm": 0.11952294409275055, + "learning_rate": 1.3916546621966842e-06, + "loss": 0.2513, + "step": 74280 + }, + { + "epoch": 3.3022180735209137, + "grad_norm": 0.1072138249874115, + "learning_rate": 1.3868164353830482e-06, + "loss": 0.2522, + "step": 74290 + }, + { + "epoch": 3.3026625772325198, + "grad_norm": 0.10413268208503723, + "learning_rate": 1.3819865152329082e-06, + "loss": 0.2549, + "step": 74300 + }, + { + "epoch": 3.303107080944126, + "grad_norm": 0.11674657464027405, + "learning_rate": 1.377164902571565e-06, + "loss": 0.2564, + "step": 74310 + }, + { + "epoch": 3.303551584655732, + "grad_norm": 0.1203649491071701, + "learning_rate": 1.372351598222904e-06, + "loss": 0.2533, + "step": 74320 + }, + { + "epoch": 3.303996088367338, + "grad_norm": 0.10905587673187256, + "learning_rate": 1.3675466030093843e-06, + "loss": 0.2514, + "step": 74330 + }, + { + "epoch": 3.3044405920789437, + "grad_norm": 0.11359282582998276, + "learning_rate": 1.3627499177520486e-06, + "loss": 0.2527, + "step": 74340 + }, + { + "epoch": 3.30488509579055, + "grad_norm": 0.10380400717258453, + "learning_rate": 1.3579615432705196e-06, + "loss": 0.2536, + "step": 74350 + }, + { + "epoch": 3.305329599502156, + "grad_norm": 0.10427138954401016, + "learning_rate": 1.3531814803829978e-06, + "loss": 0.2578, + "step": 74360 + }, + { + "epoch": 3.305774103213762, + "grad_norm": 0.10957617312669754, + "learning_rate": 1.3484097299062803e-06, + "loss": 0.2526, + "step": 74370 + }, + { + "epoch": 3.306218606925368, + "grad_norm": 0.12210964411497116, + "learning_rate": 1.343646292655698e-06, + "loss": 0.2571, + "step": 74380 + }, + { + "epoch": 3.3066631106369737, + "grad_norm": 0.12234576791524887, + "learning_rate": 1.3388911694452277e-06, + "loss": 0.2521, + "step": 74390 + }, + { + "epoch": 3.30710761434858, + "grad_norm": 0.11922285705804825, + "learning_rate": 1.3341443610873638e-06, + "loss": 0.2514, + "step": 74400 + }, + { + "epoch": 3.307552118060186, + "grad_norm": 0.1063443273305893, + "learning_rate": 1.329405868393213e-06, + "loss": 0.2551, + "step": 74410 + }, + { + "epoch": 3.3079966217717915, + "grad_norm": 0.09602199494838715, + "learning_rate": 1.3246756921724613e-06, + "loss": 0.2538, + "step": 74420 + }, + { + "epoch": 3.3084411254833976, + "grad_norm": 0.11487378180027008, + "learning_rate": 1.3199538332333506e-06, + "loss": 0.252, + "step": 74430 + }, + { + "epoch": 3.3088856291950037, + "grad_norm": 0.14150209724903107, + "learning_rate": 1.3152402923827411e-06, + "loss": 0.2525, + "step": 74440 + }, + { + "epoch": 3.30933013290661, + "grad_norm": 0.11626731604337692, + "learning_rate": 1.3105350704260277e-06, + "loss": 0.2544, + "step": 74450 + }, + { + "epoch": 3.309774636618216, + "grad_norm": 0.11134200543165207, + "learning_rate": 1.305838168167206e-06, + "loss": 0.2524, + "step": 74460 + }, + { + "epoch": 3.3102191403298216, + "grad_norm": 0.11570102721452713, + "learning_rate": 1.301149586408862e-06, + "loss": 0.2521, + "step": 74470 + }, + { + "epoch": 3.3106636440414277, + "grad_norm": 0.12921176850795746, + "learning_rate": 1.2964693259521322e-06, + "loss": 0.2557, + "step": 74480 + }, + { + "epoch": 3.3111081477530337, + "grad_norm": 0.10424291342496872, + "learning_rate": 1.2917973875967548e-06, + "loss": 0.2531, + "step": 74490 + }, + { + "epoch": 3.31155265146464, + "grad_norm": 0.12514998018741608, + "learning_rate": 1.2871337721410249e-06, + "loss": 0.2523, + "step": 74500 + }, + { + "epoch": 3.311997155176246, + "grad_norm": 0.10327691584825516, + "learning_rate": 1.2824784803818379e-06, + "loss": 0.2517, + "step": 74510 + }, + { + "epoch": 3.3124416588878516, + "grad_norm": 0.10766824334859848, + "learning_rate": 1.2778315131146467e-06, + "loss": 0.253, + "step": 74520 + }, + { + "epoch": 3.3128861625994577, + "grad_norm": 0.11549217998981476, + "learning_rate": 1.2731928711334994e-06, + "loss": 0.2508, + "step": 74530 + }, + { + "epoch": 3.3133306663110638, + "grad_norm": 0.10273275524377823, + "learning_rate": 1.2685625552310122e-06, + "loss": 0.2518, + "step": 74540 + }, + { + "epoch": 3.31377517002267, + "grad_norm": 0.11063544452190399, + "learning_rate": 1.263940566198374e-06, + "loss": 0.2523, + "step": 74550 + }, + { + "epoch": 3.3142196737342755, + "grad_norm": 0.11343222111463547, + "learning_rate": 1.259326904825353e-06, + "loss": 0.2526, + "step": 74560 + }, + { + "epoch": 3.3146641774458816, + "grad_norm": 0.12298902869224548, + "learning_rate": 1.2547215719003137e-06, + "loss": 0.2553, + "step": 74570 + }, + { + "epoch": 3.3151086811574877, + "grad_norm": 0.10783204436302185, + "learning_rate": 1.2501245682101703e-06, + "loss": 0.2554, + "step": 74580 + }, + { + "epoch": 3.315553184869094, + "grad_norm": 0.10217072069644928, + "learning_rate": 1.2455358945404171e-06, + "loss": 0.2534, + "step": 74590 + }, + { + "epoch": 3.3159976885806994, + "grad_norm": 0.10369119793176651, + "learning_rate": 1.2409555516751493e-06, + "loss": 0.2549, + "step": 74600 + }, + { + "epoch": 3.3164421922923055, + "grad_norm": 0.10830378532409668, + "learning_rate": 1.2363835403970125e-06, + "loss": 0.2518, + "step": 74610 + }, + { + "epoch": 3.3168866960039116, + "grad_norm": 0.11439992487430573, + "learning_rate": 1.2318198614872489e-06, + "loss": 0.2536, + "step": 74620 + }, + { + "epoch": 3.3173311997155177, + "grad_norm": 0.12482511252164841, + "learning_rate": 1.2272645157256457e-06, + "loss": 0.2557, + "step": 74630 + }, + { + "epoch": 3.317775703427124, + "grad_norm": 0.12030918896198273, + "learning_rate": 1.222717503890608e-06, + "loss": 0.2553, + "step": 74640 + }, + { + "epoch": 3.3182202071387294, + "grad_norm": 0.10711748152971268, + "learning_rate": 1.218178826759081e-06, + "loss": 0.2547, + "step": 74650 + }, + { + "epoch": 3.3186647108503355, + "grad_norm": 0.12216057628393173, + "learning_rate": 1.213648485106611e-06, + "loss": 0.2548, + "step": 74660 + }, + { + "epoch": 3.3191092145619416, + "grad_norm": 0.11786505579948425, + "learning_rate": 1.2091264797073066e-06, + "loss": 0.2505, + "step": 74670 + }, + { + "epoch": 3.3195537182735477, + "grad_norm": 0.11880619078874588, + "learning_rate": 1.2046128113338494e-06, + "loss": 0.2548, + "step": 74680 + }, + { + "epoch": 3.319998221985154, + "grad_norm": 0.09997987747192383, + "learning_rate": 1.200107480757512e-06, + "loss": 0.2509, + "step": 74690 + }, + { + "epoch": 3.3204427256967595, + "grad_norm": 0.11296887695789337, + "learning_rate": 1.1956104887481168e-06, + "loss": 0.252, + "step": 74700 + }, + { + "epoch": 3.3208872294083656, + "grad_norm": 0.11846257001161575, + "learning_rate": 1.191121836074094e-06, + "loss": 0.2529, + "step": 74710 + }, + { + "epoch": 3.3213317331199717, + "grad_norm": 0.12168418616056442, + "learning_rate": 1.1866415235024186e-06, + "loss": 0.2521, + "step": 74720 + }, + { + "epoch": 3.3217762368315773, + "grad_norm": 0.11507046222686768, + "learning_rate": 1.182169551798662e-06, + "loss": 0.2545, + "step": 74730 + }, + { + "epoch": 3.3222207405431834, + "grad_norm": 0.12203577905893326, + "learning_rate": 1.177705921726957e-06, + "loss": 0.2526, + "step": 74740 + }, + { + "epoch": 3.3226652442547895, + "grad_norm": 0.10256487876176834, + "learning_rate": 1.173250634050016e-06, + "loss": 0.2532, + "step": 74750 + }, + { + "epoch": 3.3231097479663956, + "grad_norm": 0.10451134294271469, + "learning_rate": 1.1688036895291354e-06, + "loss": 0.2505, + "step": 74760 + }, + { + "epoch": 3.3235542516780017, + "grad_norm": 0.11562536656856537, + "learning_rate": 1.1643650889241574e-06, + "loss": 0.251, + "step": 74770 + }, + { + "epoch": 3.3239987553896073, + "grad_norm": 0.09910263866186142, + "learning_rate": 1.1599348329935311e-06, + "loss": 0.2525, + "step": 74780 + }, + { + "epoch": 3.3244432591012134, + "grad_norm": 0.11252099275588989, + "learning_rate": 1.1555129224942673e-06, + "loss": 0.2519, + "step": 74790 + }, + { + "epoch": 3.3248877628128195, + "grad_norm": 0.12060289084911346, + "learning_rate": 1.1510993581819396e-06, + "loss": 0.2538, + "step": 74800 + }, + { + "epoch": 3.3253322665244256, + "grad_norm": 0.12889714539051056, + "learning_rate": 1.1466941408107112e-06, + "loss": 0.2552, + "step": 74810 + }, + { + "epoch": 3.3257767702360317, + "grad_norm": 0.1401834636926651, + "learning_rate": 1.1422972711333247e-06, + "loss": 0.2547, + "step": 74820 + }, + { + "epoch": 3.3262212739476373, + "grad_norm": 0.10477221757173538, + "learning_rate": 1.1379087499010565e-06, + "loss": 0.2517, + "step": 74830 + }, + { + "epoch": 3.3266657776592434, + "grad_norm": 0.10878565162420273, + "learning_rate": 1.1335285778638127e-06, + "loss": 0.2512, + "step": 74840 + }, + { + "epoch": 3.3271102813708495, + "grad_norm": 0.12011997401714325, + "learning_rate": 1.1291567557700333e-06, + "loss": 0.2504, + "step": 74850 + }, + { + "epoch": 3.3275547850824556, + "grad_norm": 0.09941351413726807, + "learning_rate": 1.124793284366743e-06, + "loss": 0.2539, + "step": 74860 + }, + { + "epoch": 3.3279992887940613, + "grad_norm": 0.09731435030698776, + "learning_rate": 1.1204381643995455e-06, + "loss": 0.2522, + "step": 74870 + }, + { + "epoch": 3.3284437925056674, + "grad_norm": 0.1083095595240593, + "learning_rate": 1.116091396612595e-06, + "loss": 0.2527, + "step": 74880 + }, + { + "epoch": 3.3288882962172734, + "grad_norm": 0.1275091916322708, + "learning_rate": 1.1117529817486538e-06, + "loss": 0.257, + "step": 74890 + }, + { + "epoch": 3.3293327999288795, + "grad_norm": 0.11535681784152985, + "learning_rate": 1.107422920549034e-06, + "loss": 0.253, + "step": 74900 + }, + { + "epoch": 3.329777303640485, + "grad_norm": 0.10769994556903839, + "learning_rate": 1.1031012137536157e-06, + "loss": 0.2556, + "step": 74910 + }, + { + "epoch": 3.3302218073520913, + "grad_norm": 0.10446496307849884, + "learning_rate": 1.0987878621008695e-06, + "loss": 0.252, + "step": 74920 + }, + { + "epoch": 3.3306663110636974, + "grad_norm": 0.10547155886888504, + "learning_rate": 1.0944828663278284e-06, + "loss": 0.2544, + "step": 74930 + }, + { + "epoch": 3.3311108147753035, + "grad_norm": 0.11939844489097595, + "learning_rate": 1.0901862271700925e-06, + "loss": 0.2528, + "step": 74940 + }, + { + "epoch": 3.3315553184869096, + "grad_norm": 0.10891233384609222, + "learning_rate": 1.0858979453618467e-06, + "loss": 0.2539, + "step": 74950 + }, + { + "epoch": 3.331999822198515, + "grad_norm": 0.103889100253582, + "learning_rate": 1.0816180216358441e-06, + "loss": 0.2541, + "step": 74960 + }, + { + "epoch": 3.3324443259101213, + "grad_norm": 0.11621509492397308, + "learning_rate": 1.0773464567233937e-06, + "loss": 0.251, + "step": 74970 + }, + { + "epoch": 3.3328888296217274, + "grad_norm": 0.12183089554309845, + "learning_rate": 1.0730832513543953e-06, + "loss": 0.2514, + "step": 74980 + }, + { + "epoch": 3.3333333333333335, + "grad_norm": 0.12253188341856003, + "learning_rate": 1.0688284062573218e-06, + "loss": 0.2535, + "step": 74990 + }, + { + "epoch": 3.3337778370449396, + "grad_norm": 0.12135832011699677, + "learning_rate": 1.0645819221591969e-06, + "loss": 0.2519, + "step": 75000 + }, + { + "epoch": 3.334222340756545, + "grad_norm": 0.12365761399269104, + "learning_rate": 1.0603437997856347e-06, + "loss": 0.2546, + "step": 75010 + }, + { + "epoch": 3.3346668444681513, + "grad_norm": 0.1371905356645584, + "learning_rate": 1.0561140398608228e-06, + "loss": 0.2536, + "step": 75020 + }, + { + "epoch": 3.3351113481797574, + "grad_norm": 0.10312709957361221, + "learning_rate": 1.0518926431074937e-06, + "loss": 0.2547, + "step": 75030 + }, + { + "epoch": 3.335555851891363, + "grad_norm": 0.12706661224365234, + "learning_rate": 1.0476796102469877e-06, + "loss": 0.2564, + "step": 75040 + }, + { + "epoch": 3.336000355602969, + "grad_norm": 0.13174960017204285, + "learning_rate": 1.0434749419991786e-06, + "loss": 0.2538, + "step": 75050 + }, + { + "epoch": 3.3364448593145752, + "grad_norm": 0.13281945884227753, + "learning_rate": 1.0392786390825415e-06, + "loss": 0.2544, + "step": 75060 + }, + { + "epoch": 3.3368893630261813, + "grad_norm": 0.10764951258897781, + "learning_rate": 1.0350907022141087e-06, + "loss": 0.2521, + "step": 75070 + }, + { + "epoch": 3.3373338667377874, + "grad_norm": 0.1102169007062912, + "learning_rate": 1.0309111321094744e-06, + "loss": 0.2534, + "step": 75080 + }, + { + "epoch": 3.337778370449393, + "grad_norm": 0.09989578276872635, + "learning_rate": 1.026739929482834e-06, + "loss": 0.2543, + "step": 75090 + }, + { + "epoch": 3.338222874160999, + "grad_norm": 0.10838083922863007, + "learning_rate": 1.022577095046906e-06, + "loss": 0.2525, + "step": 75100 + }, + { + "epoch": 3.3386673778726053, + "grad_norm": 0.11132333427667618, + "learning_rate": 1.0184226295130217e-06, + "loss": 0.2513, + "step": 75110 + }, + { + "epoch": 3.3391118815842113, + "grad_norm": 0.13270916044712067, + "learning_rate": 1.0142765335910575e-06, + "loss": 0.2551, + "step": 75120 + }, + { + "epoch": 3.3395563852958174, + "grad_norm": 0.1176847442984581, + "learning_rate": 1.0101388079894746e-06, + "loss": 0.2523, + "step": 75130 + }, + { + "epoch": 3.340000889007423, + "grad_norm": 0.11561672389507294, + "learning_rate": 1.0060094534152908e-06, + "loss": 0.2549, + "step": 75140 + }, + { + "epoch": 3.340445392719029, + "grad_norm": 0.11358087509870529, + "learning_rate": 1.0018884705741028e-06, + "loss": 0.2516, + "step": 75150 + }, + { + "epoch": 3.3408898964306353, + "grad_norm": 0.10653896629810333, + "learning_rate": 9.977758601700805e-07, + "loss": 0.2515, + "step": 75160 + }, + { + "epoch": 3.3413344001422414, + "grad_norm": 0.11288595199584961, + "learning_rate": 9.93671622905945e-07, + "loss": 0.2537, + "step": 75170 + }, + { + "epoch": 3.341778903853847, + "grad_norm": 0.0950169488787651, + "learning_rate": 9.895757594830024e-07, + "loss": 0.2529, + "step": 75180 + }, + { + "epoch": 3.342223407565453, + "grad_norm": 0.11892913281917572, + "learning_rate": 9.854882706011204e-07, + "loss": 0.2534, + "step": 75190 + }, + { + "epoch": 3.342667911277059, + "grad_norm": 0.11015285551548004, + "learning_rate": 9.814091569587513e-07, + "loss": 0.2565, + "step": 75200 + }, + { + "epoch": 3.3431124149886653, + "grad_norm": 0.09738341718912125, + "learning_rate": 9.77338419252888e-07, + "loss": 0.253, + "step": 75210 + }, + { + "epoch": 3.343556918700271, + "grad_norm": 0.13189303874969482, + "learning_rate": 9.732760581791289e-07, + "loss": 0.2518, + "step": 75220 + }, + { + "epoch": 3.344001422411877, + "grad_norm": 0.13161812722682953, + "learning_rate": 9.692220744315972e-07, + "loss": 0.2548, + "step": 75230 + }, + { + "epoch": 3.344445926123483, + "grad_norm": 0.1028658077120781, + "learning_rate": 9.651764687030162e-07, + "loss": 0.2537, + "step": 75240 + }, + { + "epoch": 3.344890429835089, + "grad_norm": 0.11723987758159637, + "learning_rate": 9.611392416846776e-07, + "loss": 0.2532, + "step": 75250 + }, + { + "epoch": 3.3453349335466953, + "grad_norm": 0.11446455121040344, + "learning_rate": 9.571103940664238e-07, + "loss": 0.2537, + "step": 75260 + }, + { + "epoch": 3.345779437258301, + "grad_norm": 0.15906812250614166, + "learning_rate": 9.530899265366822e-07, + "loss": 0.2538, + "step": 75270 + }, + { + "epoch": 3.346223940969907, + "grad_norm": 0.11578403413295746, + "learning_rate": 9.490778397824307e-07, + "loss": 0.2521, + "step": 75280 + }, + { + "epoch": 3.346668444681513, + "grad_norm": 0.11551159620285034, + "learning_rate": 9.450741344892322e-07, + "loss": 0.2531, + "step": 75290 + }, + { + "epoch": 3.3471129483931192, + "grad_norm": 0.11989109963178635, + "learning_rate": 9.410788113412116e-07, + "loss": 0.2533, + "step": 75300 + }, + { + "epoch": 3.347557452104725, + "grad_norm": 0.10236359387636185, + "learning_rate": 9.370918710210563e-07, + "loss": 0.2534, + "step": 75310 + }, + { + "epoch": 3.348001955816331, + "grad_norm": 0.10326983034610748, + "learning_rate": 9.331133142100268e-07, + "loss": 0.253, + "step": 75320 + }, + { + "epoch": 3.348446459527937, + "grad_norm": 0.12009228020906448, + "learning_rate": 9.291431415879459e-07, + "loss": 0.2541, + "step": 75330 + }, + { + "epoch": 3.348890963239543, + "grad_norm": 0.09782826155424118, + "learning_rate": 9.251813538332155e-07, + "loss": 0.253, + "step": 75340 + }, + { + "epoch": 3.349335466951149, + "grad_norm": 0.12910713255405426, + "learning_rate": 9.212279516227884e-07, + "loss": 0.2537, + "step": 75350 + }, + { + "epoch": 3.349779970662755, + "grad_norm": 0.1054750308394432, + "learning_rate": 9.172829356322021e-07, + "loss": 0.2518, + "step": 75360 + }, + { + "epoch": 3.350224474374361, + "grad_norm": 0.11246376484632492, + "learning_rate": 9.133463065355452e-07, + "loss": 0.252, + "step": 75370 + }, + { + "epoch": 3.350668978085967, + "grad_norm": 0.11625514924526215, + "learning_rate": 9.094180650054796e-07, + "loss": 0.2556, + "step": 75380 + }, + { + "epoch": 3.351113481797573, + "grad_norm": 0.10984615981578827, + "learning_rate": 9.054982117132404e-07, + "loss": 0.2529, + "step": 75390 + }, + { + "epoch": 3.351557985509179, + "grad_norm": 0.11729632318019867, + "learning_rate": 9.015867473286143e-07, + "loss": 0.2548, + "step": 75400 + }, + { + "epoch": 3.352002489220785, + "grad_norm": 0.09978259354829788, + "learning_rate": 8.976836725199778e-07, + "loss": 0.2561, + "step": 75410 + }, + { + "epoch": 3.352446992932391, + "grad_norm": 0.10235623270273209, + "learning_rate": 8.937889879542416e-07, + "loss": 0.2538, + "step": 75420 + }, + { + "epoch": 3.352891496643997, + "grad_norm": 0.11517579108476639, + "learning_rate": 8.899026942969068e-07, + "loss": 0.2523, + "step": 75430 + }, + { + "epoch": 3.353336000355603, + "grad_norm": 0.1267198920249939, + "learning_rate": 8.860247922120424e-07, + "loss": 0.2539, + "step": 75440 + }, + { + "epoch": 3.353780504067209, + "grad_norm": 0.1302388459444046, + "learning_rate": 8.821552823622737e-07, + "loss": 0.2502, + "step": 75450 + }, + { + "epoch": 3.354225007778815, + "grad_norm": 0.11987818777561188, + "learning_rate": 8.782941654087884e-07, + "loss": 0.2524, + "step": 75460 + }, + { + "epoch": 3.354669511490421, + "grad_norm": 0.10962149500846863, + "learning_rate": 8.744414420113478e-07, + "loss": 0.2526, + "step": 75470 + }, + { + "epoch": 3.355114015202027, + "grad_norm": 0.10841239243745804, + "learning_rate": 8.705971128282753e-07, + "loss": 0.2558, + "step": 75480 + }, + { + "epoch": 3.3555585189136328, + "grad_norm": 0.11568029969930649, + "learning_rate": 8.66761178516473e-07, + "loss": 0.251, + "step": 75490 + }, + { + "epoch": 3.356003022625239, + "grad_norm": 0.10012368857860565, + "learning_rate": 8.629336397313781e-07, + "loss": 0.2534, + "step": 75500 + }, + { + "epoch": 3.356447526336845, + "grad_norm": 0.11414992064237595, + "learning_rate": 8.591144971270282e-07, + "loss": 0.2543, + "step": 75510 + }, + { + "epoch": 3.356892030048451, + "grad_norm": 0.11579281091690063, + "learning_rate": 8.553037513560069e-07, + "loss": 0.253, + "step": 75520 + }, + { + "epoch": 3.3573365337600567, + "grad_norm": 0.1256382316350937, + "learning_rate": 8.515014030694546e-07, + "loss": 0.2526, + "step": 75530 + }, + { + "epoch": 3.357781037471663, + "grad_norm": 0.12397853285074234, + "learning_rate": 8.47707452917107e-07, + "loss": 0.2537, + "step": 75540 + }, + { + "epoch": 3.358225541183269, + "grad_norm": 0.11617221683263779, + "learning_rate": 8.439219015472343e-07, + "loss": 0.2537, + "step": 75550 + }, + { + "epoch": 3.358670044894875, + "grad_norm": 0.10402358323335648, + "learning_rate": 8.401447496066861e-07, + "loss": 0.2525, + "step": 75560 + }, + { + "epoch": 3.359114548606481, + "grad_norm": 0.12235062569379807, + "learning_rate": 8.363759977408792e-07, + "loss": 0.2538, + "step": 75570 + }, + { + "epoch": 3.3595590523180867, + "grad_norm": 0.10382065922021866, + "learning_rate": 8.326156465937817e-07, + "loss": 0.2521, + "step": 75580 + }, + { + "epoch": 3.360003556029693, + "grad_norm": 0.12458857893943787, + "learning_rate": 8.28863696807941e-07, + "loss": 0.2534, + "step": 75590 + }, + { + "epoch": 3.360448059741299, + "grad_norm": 0.10720577090978622, + "learning_rate": 8.251201490244609e-07, + "loss": 0.2544, + "step": 75600 + }, + { + "epoch": 3.360892563452905, + "grad_norm": 0.1268102377653122, + "learning_rate": 8.213850038830129e-07, + "loss": 0.2522, + "step": 75610 + }, + { + "epoch": 3.3613370671645106, + "grad_norm": 0.13579726219177246, + "learning_rate": 8.176582620218309e-07, + "loss": 0.2505, + "step": 75620 + }, + { + "epoch": 3.3617815708761167, + "grad_norm": 0.13534648716449738, + "learning_rate": 8.139399240777057e-07, + "loss": 0.2545, + "step": 75630 + }, + { + "epoch": 3.362226074587723, + "grad_norm": 0.12690043449401855, + "learning_rate": 8.102299906860122e-07, + "loss": 0.2547, + "step": 75640 + }, + { + "epoch": 3.362670578299329, + "grad_norm": 0.10695173591375351, + "learning_rate": 8.065284624806657e-07, + "loss": 0.2531, + "step": 75650 + }, + { + "epoch": 3.3631150820109346, + "grad_norm": 0.11559760570526123, + "learning_rate": 8.028353400941601e-07, + "loss": 0.2553, + "step": 75660 + }, + { + "epoch": 3.3635595857225407, + "grad_norm": 0.11900860071182251, + "learning_rate": 7.991506241575575e-07, + "loss": 0.2528, + "step": 75670 + }, + { + "epoch": 3.3640040894341467, + "grad_norm": 0.1263621598482132, + "learning_rate": 7.954743153004541e-07, + "loss": 0.253, + "step": 75680 + }, + { + "epoch": 3.364448593145753, + "grad_norm": 0.09883347898721695, + "learning_rate": 7.918064141510528e-07, + "loss": 0.2545, + "step": 75690 + }, + { + "epoch": 3.364893096857359, + "grad_norm": 0.10239896923303604, + "learning_rate": 7.881469213360859e-07, + "loss": 0.2531, + "step": 75700 + }, + { + "epoch": 3.3653376005689646, + "grad_norm": 0.0832502469420433, + "learning_rate": 7.844958374808642e-07, + "loss": 0.251, + "step": 75710 + }, + { + "epoch": 3.3657821042805707, + "grad_norm": 0.11349808424711227, + "learning_rate": 7.808531632092608e-07, + "loss": 0.2541, + "step": 75720 + }, + { + "epoch": 3.3662266079921768, + "grad_norm": 0.13358956575393677, + "learning_rate": 7.772188991436946e-07, + "loss": 0.2533, + "step": 75730 + }, + { + "epoch": 3.366671111703783, + "grad_norm": 0.11516712605953217, + "learning_rate": 7.735930459051799e-07, + "loss": 0.2527, + "step": 75740 + }, + { + "epoch": 3.367115615415389, + "grad_norm": 0.10360122472047806, + "learning_rate": 7.699756041132655e-07, + "loss": 0.2526, + "step": 75750 + }, + { + "epoch": 3.3675601191269946, + "grad_norm": 0.11846598237752914, + "learning_rate": 7.663665743860793e-07, + "loss": 0.2573, + "step": 75760 + }, + { + "epoch": 3.3680046228386007, + "grad_norm": 0.0998924970626831, + "learning_rate": 7.627659573403001e-07, + "loss": 0.254, + "step": 75770 + }, + { + "epoch": 3.368449126550207, + "grad_norm": 0.11324959248304367, + "learning_rate": 7.591737535911802e-07, + "loss": 0.2537, + "step": 75780 + }, + { + "epoch": 3.368893630261813, + "grad_norm": 0.11790040880441666, + "learning_rate": 7.555899637525288e-07, + "loss": 0.2522, + "step": 75790 + }, + { + "epoch": 3.3693381339734185, + "grad_norm": 0.11052243411540985, + "learning_rate": 7.520145884367058e-07, + "loss": 0.2539, + "step": 75800 + }, + { + "epoch": 3.3697826376850246, + "grad_norm": 0.09958897531032562, + "learning_rate": 7.484476282546615e-07, + "loss": 0.2517, + "step": 75810 + }, + { + "epoch": 3.3702271413966307, + "grad_norm": 0.10051577538251877, + "learning_rate": 7.448890838158806e-07, + "loss": 0.2523, + "step": 75820 + }, + { + "epoch": 3.370671645108237, + "grad_norm": 0.11675813049077988, + "learning_rate": 7.413389557284267e-07, + "loss": 0.2553, + "step": 75830 + }, + { + "epoch": 3.3711161488198425, + "grad_norm": 0.12820051610469818, + "learning_rate": 7.377972445989201e-07, + "loss": 0.2536, + "step": 75840 + }, + { + "epoch": 3.3715606525314485, + "grad_norm": 0.1098136454820633, + "learning_rate": 7.342639510325377e-07, + "loss": 0.2507, + "step": 75850 + }, + { + "epoch": 3.3720051562430546, + "grad_norm": 0.11006635427474976, + "learning_rate": 7.307390756330246e-07, + "loss": 0.2556, + "step": 75860 + }, + { + "epoch": 3.3724496599546607, + "grad_norm": 0.09683168679475784, + "learning_rate": 7.272226190026876e-07, + "loss": 0.2535, + "step": 75870 + }, + { + "epoch": 3.372894163666267, + "grad_norm": 0.1049375906586647, + "learning_rate": 7.237145817423907e-07, + "loss": 0.2515, + "step": 75880 + }, + { + "epoch": 3.3733386673778725, + "grad_norm": 0.11205010116100311, + "learning_rate": 7.202149644515654e-07, + "loss": 0.2534, + "step": 75890 + }, + { + "epoch": 3.3737831710894786, + "grad_norm": 0.1332499086856842, + "learning_rate": 7.167237677281946e-07, + "loss": 0.2533, + "step": 75900 + }, + { + "epoch": 3.3742276748010847, + "grad_norm": 0.13158315420150757, + "learning_rate": 7.132409921688288e-07, + "loss": 0.2529, + "step": 75910 + }, + { + "epoch": 3.3746721785126907, + "grad_norm": 0.13372883200645447, + "learning_rate": 7.09766638368592e-07, + "loss": 0.2548, + "step": 75920 + }, + { + "epoch": 3.3751166822242964, + "grad_norm": 0.12743327021598816, + "learning_rate": 7.063007069211313e-07, + "loss": 0.2547, + "step": 75930 + }, + { + "epoch": 3.3755611859359025, + "grad_norm": 0.1157272532582283, + "learning_rate": 7.028431984187067e-07, + "loss": 0.256, + "step": 75940 + }, + { + "epoch": 3.3760056896475086, + "grad_norm": 0.09902524948120117, + "learning_rate": 6.993941134520898e-07, + "loss": 0.2531, + "step": 75950 + }, + { + "epoch": 3.3764501933591147, + "grad_norm": 0.11940702050924301, + "learning_rate": 6.959534526106537e-07, + "loss": 0.2569, + "step": 75960 + }, + { + "epoch": 3.3768946970707203, + "grad_norm": 0.11690451204776764, + "learning_rate": 6.925212164822948e-07, + "loss": 0.2538, + "step": 75970 + }, + { + "epoch": 3.3773392007823264, + "grad_norm": 0.10365737229585648, + "learning_rate": 6.890974056535049e-07, + "loss": 0.2541, + "step": 75980 + }, + { + "epoch": 3.3777837044939325, + "grad_norm": 0.11348050087690353, + "learning_rate": 6.856820207093106e-07, + "loss": 0.2529, + "step": 75990 + }, + { + "epoch": 3.3782282082055386, + "grad_norm": 0.1015450656414032, + "learning_rate": 6.822750622333063e-07, + "loss": 0.2522, + "step": 76000 + }, + { + "epoch": 3.3786727119171447, + "grad_norm": 0.10582717508077621, + "learning_rate": 6.788765308076539e-07, + "loss": 0.2517, + "step": 76010 + }, + { + "epoch": 3.3791172156287503, + "grad_norm": 0.11660266667604446, + "learning_rate": 6.754864270130668e-07, + "loss": 0.2532, + "step": 76020 + }, + { + "epoch": 3.3795617193403564, + "grad_norm": 0.09302998334169388, + "learning_rate": 6.721047514288203e-07, + "loss": 0.2565, + "step": 76030 + }, + { + "epoch": 3.3800062230519625, + "grad_norm": 0.11151724308729172, + "learning_rate": 6.687315046327469e-07, + "loss": 0.2541, + "step": 76040 + }, + { + "epoch": 3.3804507267635686, + "grad_norm": 0.11057807505130768, + "learning_rate": 6.65366687201252e-07, + "loss": 0.2544, + "step": 76050 + }, + { + "epoch": 3.3808952304751747, + "grad_norm": 0.10107952356338501, + "learning_rate": 6.620102997092814e-07, + "loss": 0.2532, + "step": 76060 + }, + { + "epoch": 3.3813397341867804, + "grad_norm": 0.1052001416683197, + "learning_rate": 6.586623427303596e-07, + "loss": 0.253, + "step": 76070 + }, + { + "epoch": 3.3817842378983864, + "grad_norm": 0.10937894880771637, + "learning_rate": 6.553228168365455e-07, + "loss": 0.2508, + "step": 76080 + }, + { + "epoch": 3.3822287416099925, + "grad_norm": 0.11369159072637558, + "learning_rate": 6.519917225984884e-07, + "loss": 0.2532, + "step": 76090 + }, + { + "epoch": 3.382673245321598, + "grad_norm": 0.11280686408281326, + "learning_rate": 6.486690605853718e-07, + "loss": 0.2532, + "step": 76100 + }, + { + "epoch": 3.3831177490332043, + "grad_norm": 0.10078223794698715, + "learning_rate": 6.453548313649527e-07, + "loss": 0.2527, + "step": 76110 + }, + { + "epoch": 3.3835622527448104, + "grad_norm": 0.12561167776584625, + "learning_rate": 6.420490355035446e-07, + "loss": 0.2551, + "step": 76120 + }, + { + "epoch": 3.3840067564564165, + "grad_norm": 0.12697744369506836, + "learning_rate": 6.387516735660071e-07, + "loss": 0.2521, + "step": 76130 + }, + { + "epoch": 3.3844512601680226, + "grad_norm": 0.11290866881608963, + "learning_rate": 6.35462746115778e-07, + "loss": 0.2539, + "step": 76140 + }, + { + "epoch": 3.384895763879628, + "grad_norm": 0.11488840728998184, + "learning_rate": 6.321822537148358e-07, + "loss": 0.255, + "step": 76150 + }, + { + "epoch": 3.3853402675912343, + "grad_norm": 0.10855366289615631, + "learning_rate": 6.289101969237432e-07, + "loss": 0.2534, + "step": 76160 + }, + { + "epoch": 3.3857847713028404, + "grad_norm": 0.11502153426408768, + "learning_rate": 6.256465763015918e-07, + "loss": 0.2531, + "step": 76170 + }, + { + "epoch": 3.3862292750144465, + "grad_norm": 0.11220455169677734, + "learning_rate": 6.22391392406052e-07, + "loss": 0.2553, + "step": 76180 + }, + { + "epoch": 3.3866737787260526, + "grad_norm": 0.09178964048624039, + "learning_rate": 6.191446457933403e-07, + "loss": 0.2541, + "step": 76190 + }, + { + "epoch": 3.3871182824376582, + "grad_norm": 0.09445033967494965, + "learning_rate": 6.159063370182406e-07, + "loss": 0.2526, + "step": 76200 + }, + { + "epoch": 3.3875627861492643, + "grad_norm": 0.1345878541469574, + "learning_rate": 6.126764666340879e-07, + "loss": 0.2528, + "step": 76210 + }, + { + "epoch": 3.3880072898608704, + "grad_norm": 0.11085931211709976, + "learning_rate": 6.094550351927852e-07, + "loss": 0.2535, + "step": 76220 + }, + { + "epoch": 3.3884517935724765, + "grad_norm": 0.10756879299879074, + "learning_rate": 6.062420432447757e-07, + "loss": 0.257, + "step": 76230 + }, + { + "epoch": 3.388896297284082, + "grad_norm": 0.10464517772197723, + "learning_rate": 6.030374913390813e-07, + "loss": 0.2534, + "step": 76240 + }, + { + "epoch": 3.3893408009956882, + "grad_norm": 0.12688186764717102, + "learning_rate": 5.998413800232694e-07, + "loss": 0.2536, + "step": 76250 + }, + { + "epoch": 3.3897853047072943, + "grad_norm": 0.10014700144529343, + "learning_rate": 5.966537098434755e-07, + "loss": 0.2497, + "step": 76260 + }, + { + "epoch": 3.3902298084189004, + "grad_norm": 0.10006972402334213, + "learning_rate": 5.934744813443694e-07, + "loss": 0.256, + "step": 76270 + }, + { + "epoch": 3.390674312130506, + "grad_norm": 0.11779050529003143, + "learning_rate": 5.903036950692054e-07, + "loss": 0.2535, + "step": 76280 + }, + { + "epoch": 3.391118815842112, + "grad_norm": 0.11193665862083435, + "learning_rate": 5.871413515597835e-07, + "loss": 0.2531, + "step": 76290 + }, + { + "epoch": 3.3915633195537183, + "grad_norm": 0.11132952570915222, + "learning_rate": 5.839874513564547e-07, + "loss": 0.2514, + "step": 76300 + }, + { + "epoch": 3.3920078232653244, + "grad_norm": 0.11150722205638885, + "learning_rate": 5.808419949981436e-07, + "loss": 0.2533, + "step": 76310 + }, + { + "epoch": 3.3924523269769304, + "grad_norm": 0.11002405732870102, + "learning_rate": 5.777049830223257e-07, + "loss": 0.2562, + "step": 76320 + }, + { + "epoch": 3.392896830688536, + "grad_norm": 0.10334715247154236, + "learning_rate": 5.745764159650114e-07, + "loss": 0.2529, + "step": 76330 + }, + { + "epoch": 3.393341334400142, + "grad_norm": 0.11668763309717178, + "learning_rate": 5.71456294360806e-07, + "loss": 0.2515, + "step": 76340 + }, + { + "epoch": 3.3937858381117483, + "grad_norm": 0.10534612834453583, + "learning_rate": 5.683446187428443e-07, + "loss": 0.2528, + "step": 76350 + }, + { + "epoch": 3.3942303418233544, + "grad_norm": 0.11988895386457443, + "learning_rate": 5.652413896428288e-07, + "loss": 0.2521, + "step": 76360 + }, + { + "epoch": 3.3946748455349605, + "grad_norm": 0.10700099915266037, + "learning_rate": 5.621466075910131e-07, + "loss": 0.2544, + "step": 76370 + }, + { + "epoch": 3.395119349246566, + "grad_norm": 0.11185029149055481, + "learning_rate": 5.590602731162187e-07, + "loss": 0.2532, + "step": 76380 + }, + { + "epoch": 3.395563852958172, + "grad_norm": 0.12165291607379913, + "learning_rate": 5.559823867458125e-07, + "loss": 0.255, + "step": 76390 + }, + { + "epoch": 3.3960083566697783, + "grad_norm": 0.10503983497619629, + "learning_rate": 5.529129490057128e-07, + "loss": 0.2526, + "step": 76400 + }, + { + "epoch": 3.396452860381384, + "grad_norm": 0.1062893345952034, + "learning_rate": 5.498519604204167e-07, + "loss": 0.2525, + "step": 76410 + }, + { + "epoch": 3.39689736409299, + "grad_norm": 0.10362455248832703, + "learning_rate": 5.467994215129501e-07, + "loss": 0.2555, + "step": 76420 + }, + { + "epoch": 3.397341867804596, + "grad_norm": 0.12308041006326675, + "learning_rate": 5.437553328049183e-07, + "loss": 0.2531, + "step": 76430 + }, + { + "epoch": 3.397786371516202, + "grad_norm": 0.10892686992883682, + "learning_rate": 5.407196948164661e-07, + "loss": 0.2532, + "step": 76440 + }, + { + "epoch": 3.3982308752278083, + "grad_norm": 0.10410770773887634, + "learning_rate": 5.376925080663009e-07, + "loss": 0.2534, + "step": 76450 + }, + { + "epoch": 3.398675378939414, + "grad_norm": 0.1134127676486969, + "learning_rate": 5.346737730716977e-07, + "loss": 0.251, + "step": 76460 + }, + { + "epoch": 3.39911988265102, + "grad_norm": 0.09418538212776184, + "learning_rate": 5.316634903484607e-07, + "loss": 0.2529, + "step": 76470 + }, + { + "epoch": 3.399564386362626, + "grad_norm": 0.0999562218785286, + "learning_rate": 5.286616604109728e-07, + "loss": 0.2528, + "step": 76480 + }, + { + "epoch": 3.4000088900742322, + "grad_norm": 0.09922986477613449, + "learning_rate": 5.256682837721627e-07, + "loss": 0.2536, + "step": 76490 + }, + { + "epoch": 3.4004533937858383, + "grad_norm": 0.11143017560243607, + "learning_rate": 5.226833609435156e-07, + "loss": 0.2532, + "step": 76500 + }, + { + "epoch": 3.400897897497444, + "grad_norm": 0.107086181640625, + "learning_rate": 5.19706892435079e-07, + "loss": 0.252, + "step": 76510 + }, + { + "epoch": 3.40134240120905, + "grad_norm": 0.1007910817861557, + "learning_rate": 5.167388787554406e-07, + "loss": 0.2546, + "step": 76520 + }, + { + "epoch": 3.401786904920656, + "grad_norm": 0.10714145749807358, + "learning_rate": 5.137793204117614e-07, + "loss": 0.2506, + "step": 76530 + }, + { + "epoch": 3.4022314086322623, + "grad_norm": 0.11507926881313324, + "learning_rate": 5.108282179097479e-07, + "loss": 0.2502, + "step": 76540 + }, + { + "epoch": 3.402675912343868, + "grad_norm": 0.1211322695016861, + "learning_rate": 5.078855717536523e-07, + "loss": 0.2506, + "step": 76550 + }, + { + "epoch": 3.403120416055474, + "grad_norm": 0.11114615201950073, + "learning_rate": 5.049513824463059e-07, + "loss": 0.2506, + "step": 76560 + }, + { + "epoch": 3.40356491976708, + "grad_norm": 0.1124727874994278, + "learning_rate": 5.020256504890742e-07, + "loss": 0.2519, + "step": 76570 + }, + { + "epoch": 3.404009423478686, + "grad_norm": 0.10372244566679001, + "learning_rate": 4.991083763818849e-07, + "loss": 0.2552, + "step": 76580 + }, + { + "epoch": 3.404453927190292, + "grad_norm": 0.10639011859893799, + "learning_rate": 4.961995606232228e-07, + "loss": 0.2517, + "step": 76590 + }, + { + "epoch": 3.404898430901898, + "grad_norm": 0.12567347288131714, + "learning_rate": 4.932992037101236e-07, + "loss": 0.2542, + "step": 76600 + }, + { + "epoch": 3.405342934613504, + "grad_norm": 0.11154461652040482, + "learning_rate": 4.904073061381798e-07, + "loss": 0.2522, + "step": 76610 + }, + { + "epoch": 3.40578743832511, + "grad_norm": 0.15475133061408997, + "learning_rate": 4.875238684015349e-07, + "loss": 0.2538, + "step": 76620 + }, + { + "epoch": 3.406231942036716, + "grad_norm": 0.11644066125154495, + "learning_rate": 4.846488909928948e-07, + "loss": 0.2529, + "step": 76630 + }, + { + "epoch": 3.406676445748322, + "grad_norm": 0.12272858619689941, + "learning_rate": 4.81782374403511e-07, + "loss": 0.2535, + "step": 76640 + }, + { + "epoch": 3.407120949459928, + "grad_norm": 0.11726494878530502, + "learning_rate": 4.789243191231918e-07, + "loss": 0.2502, + "step": 76650 + }, + { + "epoch": 3.407565453171534, + "grad_norm": 0.10405103862285614, + "learning_rate": 4.76074725640302e-07, + "loss": 0.2502, + "step": 76660 + }, + { + "epoch": 3.40800995688314, + "grad_norm": 0.10733546316623688, + "learning_rate": 4.732335944417632e-07, + "loss": 0.2538, + "step": 76670 + }, + { + "epoch": 3.408454460594746, + "grad_norm": 0.12899193167686462, + "learning_rate": 4.704009260130371e-07, + "loss": 0.2542, + "step": 76680 + }, + { + "epoch": 3.408898964306352, + "grad_norm": 0.11560926586389542, + "learning_rate": 4.675767208381587e-07, + "loss": 0.2556, + "step": 76690 + }, + { + "epoch": 3.409343468017958, + "grad_norm": 0.1194416880607605, + "learning_rate": 4.647609793997032e-07, + "loss": 0.2532, + "step": 76700 + }, + { + "epoch": 3.409787971729564, + "grad_norm": 0.10887749493122101, + "learning_rate": 4.619537021788023e-07, + "loss": 0.2525, + "step": 76710 + }, + { + "epoch": 3.4102324754411697, + "grad_norm": 0.11154114454984665, + "learning_rate": 4.5915488965515005e-07, + "loss": 0.2553, + "step": 76720 + }, + { + "epoch": 3.410676979152776, + "grad_norm": 0.10813814401626587, + "learning_rate": 4.563645423069807e-07, + "loss": 0.251, + "step": 76730 + }, + { + "epoch": 3.411121482864382, + "grad_norm": 0.10611486434936523, + "learning_rate": 4.535826606110849e-07, + "loss": 0.2528, + "step": 76740 + }, + { + "epoch": 3.411565986575988, + "grad_norm": 0.11049504578113556, + "learning_rate": 4.508092450428214e-07, + "loss": 0.2533, + "step": 76750 + }, + { + "epoch": 3.412010490287594, + "grad_norm": 0.12230516970157623, + "learning_rate": 4.480442960760778e-07, + "loss": 0.2518, + "step": 76760 + }, + { + "epoch": 3.4124549939991997, + "grad_norm": 0.12510138750076294, + "learning_rate": 4.4528781418332053e-07, + "loss": 0.2529, + "step": 76770 + }, + { + "epoch": 3.412899497710806, + "grad_norm": 0.12169535458087921, + "learning_rate": 4.4253979983555073e-07, + "loss": 0.2535, + "step": 76780 + }, + { + "epoch": 3.413344001422412, + "grad_norm": 0.11813721805810928, + "learning_rate": 4.3980025350233154e-07, + "loss": 0.2519, + "step": 76790 + }, + { + "epoch": 3.413788505134018, + "grad_norm": 0.12631681561470032, + "learning_rate": 4.370691756517664e-07, + "loss": 0.2517, + "step": 76800 + }, + { + "epoch": 3.414233008845624, + "grad_norm": 0.10009785741567612, + "learning_rate": 4.3434656675053753e-07, + "loss": 0.2533, + "step": 76810 + }, + { + "epoch": 3.4146775125572297, + "grad_norm": 0.11951269209384918, + "learning_rate": 4.3163242726385613e-07, + "loss": 0.254, + "step": 76820 + }, + { + "epoch": 3.415122016268836, + "grad_norm": 0.10729091614484787, + "learning_rate": 4.289267576554956e-07, + "loss": 0.2523, + "step": 76830 + }, + { + "epoch": 3.415566519980442, + "grad_norm": 0.11905895918607712, + "learning_rate": 4.262295583877807e-07, + "loss": 0.2532, + "step": 76840 + }, + { + "epoch": 3.416011023692048, + "grad_norm": 0.1010415256023407, + "learning_rate": 4.235408299215815e-07, + "loss": 0.2522, + "step": 76850 + }, + { + "epoch": 3.4164555274036537, + "grad_norm": 0.12310246378183365, + "learning_rate": 4.2086057271634173e-07, + "loss": 0.2529, + "step": 76860 + }, + { + "epoch": 3.4169000311152598, + "grad_norm": 0.10558044165372849, + "learning_rate": 4.181887872300394e-07, + "loss": 0.2543, + "step": 76870 + }, + { + "epoch": 3.417344534826866, + "grad_norm": 0.12037631869316101, + "learning_rate": 4.155254739192038e-07, + "loss": 0.2526, + "step": 76880 + }, + { + "epoch": 3.417789038538472, + "grad_norm": 0.10892566293478012, + "learning_rate": 4.128706332389265e-07, + "loss": 0.2535, + "step": 76890 + }, + { + "epoch": 3.4182335422500776, + "grad_norm": 0.11270609498023987, + "learning_rate": 4.1022426564284453e-07, + "loss": 0.2542, + "step": 76900 + }, + { + "epoch": 3.4186780459616837, + "grad_norm": 0.11300680786371231, + "learning_rate": 4.075863715831574e-07, + "loss": 0.253, + "step": 76910 + }, + { + "epoch": 3.4191225496732898, + "grad_norm": 0.13901782035827637, + "learning_rate": 4.0495695151059887e-07, + "loss": 0.2553, + "step": 76920 + }, + { + "epoch": 3.419567053384896, + "grad_norm": 0.09620549529790878, + "learning_rate": 4.023360058744763e-07, + "loss": 0.2513, + "step": 76930 + }, + { + "epoch": 3.420011557096502, + "grad_norm": 0.11368609219789505, + "learning_rate": 3.997235351226314e-07, + "loss": 0.2531, + "step": 76940 + }, + { + "epoch": 3.4204560608081076, + "grad_norm": 0.11619924008846283, + "learning_rate": 3.971195397014571e-07, + "loss": 0.2539, + "step": 76950 + }, + { + "epoch": 3.4209005645197137, + "grad_norm": 0.13033448159694672, + "learning_rate": 3.9452402005591417e-07, + "loss": 0.2532, + "step": 76960 + }, + { + "epoch": 3.42134506823132, + "grad_norm": 0.1088610589504242, + "learning_rate": 3.9193697662950334e-07, + "loss": 0.2544, + "step": 76970 + }, + { + "epoch": 3.421789571942926, + "grad_norm": 0.10155437886714935, + "learning_rate": 3.893584098642822e-07, + "loss": 0.2541, + "step": 76980 + }, + { + "epoch": 3.4222340756545315, + "grad_norm": 0.1199064627289772, + "learning_rate": 3.867883202008538e-07, + "loss": 0.2542, + "step": 76990 + }, + { + "epoch": 3.4226785793661376, + "grad_norm": 0.09340241551399231, + "learning_rate": 3.842267080783779e-07, + "loss": 0.2525, + "step": 77000 + }, + { + "epoch": 3.4231230830777437, + "grad_norm": 0.1275452971458435, + "learning_rate": 3.816735739345656e-07, + "loss": 0.2521, + "step": 77010 + }, + { + "epoch": 3.42356758678935, + "grad_norm": 0.14576072990894318, + "learning_rate": 3.791289182056679e-07, + "loss": 0.2547, + "step": 77020 + }, + { + "epoch": 3.4240120905009555, + "grad_norm": 0.1025630459189415, + "learning_rate": 3.7659274132650913e-07, + "loss": 0.2504, + "step": 77030 + }, + { + "epoch": 3.4244565942125615, + "grad_norm": 0.10866393148899078, + "learning_rate": 3.7406504373044826e-07, + "loss": 0.2525, + "step": 77040 + }, + { + "epoch": 3.4249010979241676, + "grad_norm": 0.12288891524076462, + "learning_rate": 3.7154582584939533e-07, + "loss": 0.2509, + "step": 77050 + }, + { + "epoch": 3.4253456016357737, + "grad_norm": 0.10350918024778366, + "learning_rate": 3.690350881138227e-07, + "loss": 0.2535, + "step": 77060 + }, + { + "epoch": 3.42579010534738, + "grad_norm": 0.10062720626592636, + "learning_rate": 3.665328309527427e-07, + "loss": 0.2525, + "step": 77070 + }, + { + "epoch": 3.4262346090589855, + "grad_norm": 0.11266709119081497, + "learning_rate": 3.640390547937245e-07, + "loss": 0.253, + "step": 77080 + }, + { + "epoch": 3.4266791127705916, + "grad_norm": 0.10918533056974411, + "learning_rate": 3.615537600628882e-07, + "loss": 0.2531, + "step": 77090 + }, + { + "epoch": 3.4271236164821977, + "grad_norm": 0.11599352210760117, + "learning_rate": 3.5907694718489427e-07, + "loss": 0.2538, + "step": 77100 + }, + { + "epoch": 3.4275681201938037, + "grad_norm": 0.1081230416893959, + "learning_rate": 3.566086165829707e-07, + "loss": 0.2529, + "step": 77110 + }, + { + "epoch": 3.42801262390541, + "grad_norm": 0.11621717363595963, + "learning_rate": 3.5414876867888024e-07, + "loss": 0.2533, + "step": 77120 + }, + { + "epoch": 3.4284571276170155, + "grad_norm": 0.15889932215213776, + "learning_rate": 3.5169740389295326e-07, + "loss": 0.2546, + "step": 77130 + }, + { + "epoch": 3.4289016313286216, + "grad_norm": 0.10865800082683563, + "learning_rate": 3.4925452264405466e-07, + "loss": 0.2535, + "step": 77140 + }, + { + "epoch": 3.4293461350402277, + "grad_norm": 0.11227429658174515, + "learning_rate": 3.468201253496062e-07, + "loss": 0.2514, + "step": 77150 + }, + { + "epoch": 3.4297906387518338, + "grad_norm": 0.10161792486906052, + "learning_rate": 3.443942124255861e-07, + "loss": 0.2503, + "step": 77160 + }, + { + "epoch": 3.4302351424634394, + "grad_norm": 0.1264238953590393, + "learning_rate": 3.4197678428650183e-07, + "loss": 0.2556, + "step": 77170 + }, + { + "epoch": 3.4306796461750455, + "grad_norm": 0.1047854945063591, + "learning_rate": 3.3956784134544504e-07, + "loss": 0.2531, + "step": 77180 + }, + { + "epoch": 3.4311241498866516, + "grad_norm": 0.10347326844930649, + "learning_rate": 3.3716738401402547e-07, + "loss": 0.2538, + "step": 77190 + }, + { + "epoch": 3.4315686535982577, + "grad_norm": 0.095842644572258, + "learning_rate": 3.3477541270241495e-07, + "loss": 0.2542, + "step": 77200 + }, + { + "epoch": 3.4320131573098633, + "grad_norm": 0.11185746639966965, + "learning_rate": 3.3239192781934215e-07, + "loss": 0.2539, + "step": 77210 + }, + { + "epoch": 3.4324576610214694, + "grad_norm": 0.12600639462471008, + "learning_rate": 3.3001692977207563e-07, + "loss": 0.2559, + "step": 77220 + }, + { + "epoch": 3.4329021647330755, + "grad_norm": 0.11158399283885956, + "learning_rate": 3.276504189664409e-07, + "loss": 0.2531, + "step": 77230 + }, + { + "epoch": 3.4333466684446816, + "grad_norm": 0.10613413900136948, + "learning_rate": 3.252923958068088e-07, + "loss": 0.2538, + "step": 77240 + }, + { + "epoch": 3.4337911721562877, + "grad_norm": 0.10170894116163254, + "learning_rate": 3.2294286069609046e-07, + "loss": 0.2534, + "step": 77250 + }, + { + "epoch": 3.4342356758678934, + "grad_norm": 0.09483248740434647, + "learning_rate": 3.2060181403577583e-07, + "loss": 0.2518, + "step": 77260 + }, + { + "epoch": 3.4346801795794994, + "grad_norm": 0.11075451225042343, + "learning_rate": 3.1826925622587823e-07, + "loss": 0.2537, + "step": 77270 + }, + { + "epoch": 3.4351246832911055, + "grad_norm": 0.13675497472286224, + "learning_rate": 3.159451876649622e-07, + "loss": 0.2532, + "step": 77280 + }, + { + "epoch": 3.4355691870027116, + "grad_norm": 0.11234071850776672, + "learning_rate": 3.1362960875015444e-07, + "loss": 0.2525, + "step": 77290 + }, + { + "epoch": 3.4360136907143173, + "grad_norm": 0.10998084396123886, + "learning_rate": 3.1132251987711637e-07, + "loss": 0.2531, + "step": 77300 + }, + { + "epoch": 3.4364581944259234, + "grad_norm": 0.11443748325109482, + "learning_rate": 3.0902392144007695e-07, + "loss": 0.2549, + "step": 77310 + }, + { + "epoch": 3.4369026981375295, + "grad_norm": 0.11971872299909592, + "learning_rate": 3.0673381383179436e-07, + "loss": 0.2532, + "step": 77320 + }, + { + "epoch": 3.4373472018491356, + "grad_norm": 0.10391572117805481, + "learning_rate": 3.0445219744358875e-07, + "loss": 0.252, + "step": 77330 + }, + { + "epoch": 3.437791705560741, + "grad_norm": 0.10837157815694809, + "learning_rate": 3.021790726653262e-07, + "loss": 0.2499, + "step": 77340 + }, + { + "epoch": 3.4382362092723473, + "grad_norm": 0.12105773389339447, + "learning_rate": 2.9991443988542366e-07, + "loss": 0.2524, + "step": 77350 + }, + { + "epoch": 3.4386807129839534, + "grad_norm": 0.12220814824104309, + "learning_rate": 2.976582994908439e-07, + "loss": 0.2551, + "step": 77360 + }, + { + "epoch": 3.4391252166955595, + "grad_norm": 0.1172560602426529, + "learning_rate": 2.954106518671007e-07, + "loss": 0.253, + "step": 77370 + }, + { + "epoch": 3.4395697204071656, + "grad_norm": 0.11401546001434326, + "learning_rate": 2.9317149739825356e-07, + "loss": 0.2537, + "step": 77380 + }, + { + "epoch": 3.4400142241187712, + "grad_norm": 0.12211573868989944, + "learning_rate": 2.909408364669075e-07, + "loss": 0.2538, + "step": 77390 + }, + { + "epoch": 3.4404587278303773, + "grad_norm": 0.11109502613544464, + "learning_rate": 2.887186694542299e-07, + "loss": 0.2538, + "step": 77400 + }, + { + "epoch": 3.4409032315419834, + "grad_norm": 0.1201922670006752, + "learning_rate": 2.865049967399225e-07, + "loss": 0.253, + "step": 77410 + }, + { + "epoch": 3.4413477352535895, + "grad_norm": 0.12141899019479752, + "learning_rate": 2.842998187022439e-07, + "loss": 0.2519, + "step": 77420 + }, + { + "epoch": 3.4417922389651956, + "grad_norm": 0.11280371248722076, + "learning_rate": 2.8210313571800374e-07, + "loss": 0.253, + "step": 77430 + }, + { + "epoch": 3.4422367426768012, + "grad_norm": 0.10785233974456787, + "learning_rate": 2.7991494816255184e-07, + "loss": 0.2524, + "step": 77440 + }, + { + "epoch": 3.4426812463884073, + "grad_norm": 0.11073419451713562, + "learning_rate": 2.777352564097779e-07, + "loss": 0.2525, + "step": 77450 + }, + { + "epoch": 3.4431257501000134, + "grad_norm": 0.11665894091129303, + "learning_rate": 2.755640608321508e-07, + "loss": 0.2539, + "step": 77460 + }, + { + "epoch": 3.4435702538116195, + "grad_norm": 0.12864013016223907, + "learning_rate": 2.73401361800657e-07, + "loss": 0.2544, + "step": 77470 + }, + { + "epoch": 3.444014757523225, + "grad_norm": 0.10693027824163437, + "learning_rate": 2.7124715968484537e-07, + "loss": 0.2535, + "step": 77480 + }, + { + "epoch": 3.4444592612348313, + "grad_norm": 0.13490842282772064, + "learning_rate": 2.691014548528104e-07, + "loss": 0.2549, + "step": 77490 + }, + { + "epoch": 3.4449037649464374, + "grad_norm": 0.12368050962686539, + "learning_rate": 2.669642476711864e-07, + "loss": 0.2542, + "step": 77500 + }, + { + "epoch": 3.4453482686580434, + "grad_norm": 0.12128566950559616, + "learning_rate": 2.648355385051815e-07, + "loss": 0.2532, + "step": 77510 + }, + { + "epoch": 3.445792772369649, + "grad_norm": 0.10608658194541931, + "learning_rate": 2.627153277185157e-07, + "loss": 0.252, + "step": 77520 + }, + { + "epoch": 3.446237276081255, + "grad_norm": 0.10785210132598877, + "learning_rate": 2.606036156734881e-07, + "loss": 0.2555, + "step": 77530 + }, + { + "epoch": 3.4466817797928613, + "grad_norm": 0.10151468962430954, + "learning_rate": 2.5850040273092127e-07, + "loss": 0.2539, + "step": 77540 + }, + { + "epoch": 3.4471262835044674, + "grad_norm": 0.1189999133348465, + "learning_rate": 2.5640568925020536e-07, + "loss": 0.2524, + "step": 77550 + }, + { + "epoch": 3.4475707872160735, + "grad_norm": 0.11439283937215805, + "learning_rate": 2.5431947558927083e-07, + "loss": 0.2521, + "step": 77560 + }, + { + "epoch": 3.448015290927679, + "grad_norm": 0.10731824487447739, + "learning_rate": 2.522417621045825e-07, + "loss": 0.2539, + "step": 77570 + }, + { + "epoch": 3.448459794639285, + "grad_norm": 0.09469849616289139, + "learning_rate": 2.501725491511786e-07, + "loss": 0.2514, + "step": 77580 + }, + { + "epoch": 3.4489042983508913, + "grad_norm": 0.1331077516078949, + "learning_rate": 2.481118370826263e-07, + "loss": 0.2522, + "step": 77590 + }, + { + "epoch": 3.4493488020624974, + "grad_norm": 0.10727374255657196, + "learning_rate": 2.4605962625104393e-07, + "loss": 0.253, + "step": 77600 + }, + { + "epoch": 3.449793305774103, + "grad_norm": 0.11170374602079391, + "learning_rate": 2.440159170070955e-07, + "loss": 0.2496, + "step": 77610 + }, + { + "epoch": 3.450237809485709, + "grad_norm": 0.10746929794549942, + "learning_rate": 2.41980709699996e-07, + "loss": 0.2519, + "step": 77620 + }, + { + "epoch": 3.450682313197315, + "grad_norm": 0.09579674154520035, + "learning_rate": 2.3995400467751174e-07, + "loss": 0.2537, + "step": 77630 + }, + { + "epoch": 3.4511268169089213, + "grad_norm": 0.10227407515048981, + "learning_rate": 2.3793580228594902e-07, + "loss": 0.2514, + "step": 77640 + }, + { + "epoch": 3.451571320620527, + "grad_norm": 0.15070436894893646, + "learning_rate": 2.3592610287015982e-07, + "loss": 0.2529, + "step": 77650 + }, + { + "epoch": 3.452015824332133, + "grad_norm": 0.1190425381064415, + "learning_rate": 2.3392490677354718e-07, + "loss": 0.2517, + "step": 77660 + }, + { + "epoch": 3.452460328043739, + "grad_norm": 0.10888480395078659, + "learning_rate": 2.3193221433806533e-07, + "loss": 0.2516, + "step": 77670 + }, + { + "epoch": 3.4529048317553452, + "grad_norm": 0.1390042006969452, + "learning_rate": 2.299480259042086e-07, + "loss": 0.2531, + "step": 77680 + }, + { + "epoch": 3.4533493354669513, + "grad_norm": 0.10086729377508163, + "learning_rate": 2.2797234181102244e-07, + "loss": 0.25, + "step": 77690 + }, + { + "epoch": 3.453793839178557, + "grad_norm": 0.09591632336378098, + "learning_rate": 2.2600516239609238e-07, + "loss": 0.2554, + "step": 77700 + }, + { + "epoch": 3.454238342890163, + "grad_norm": 0.094894178211689, + "learning_rate": 2.240464879955606e-07, + "loss": 0.2528, + "step": 77710 + }, + { + "epoch": 3.454682846601769, + "grad_norm": 0.09172450751066208, + "learning_rate": 2.2209631894410387e-07, + "loss": 0.2526, + "step": 77720 + }, + { + "epoch": 3.4551273503133753, + "grad_norm": 0.151905357837677, + "learning_rate": 2.2015465557496117e-07, + "loss": 0.2576, + "step": 77730 + }, + { + "epoch": 3.4555718540249813, + "grad_norm": 0.13150475919246674, + "learning_rate": 2.1822149821990602e-07, + "loss": 0.2532, + "step": 77740 + }, + { + "epoch": 3.456016357736587, + "grad_norm": 0.09635671973228455, + "learning_rate": 2.1629684720926303e-07, + "loss": 0.255, + "step": 77750 + }, + { + "epoch": 3.456460861448193, + "grad_norm": 0.11004320532083511, + "learning_rate": 2.1438070287189693e-07, + "loss": 0.2535, + "step": 77760 + }, + { + "epoch": 3.456905365159799, + "grad_norm": 0.10927089303731918, + "learning_rate": 2.1247306553523472e-07, + "loss": 0.2533, + "step": 77770 + }, + { + "epoch": 3.457349868871405, + "grad_norm": 0.09669263660907745, + "learning_rate": 2.105739355252323e-07, + "loss": 0.2525, + "step": 77780 + }, + { + "epoch": 3.457794372583011, + "grad_norm": 0.12295547872781754, + "learning_rate": 2.0868331316639678e-07, + "loss": 0.2539, + "step": 77790 + }, + { + "epoch": 3.458238876294617, + "grad_norm": 0.11932806670665741, + "learning_rate": 2.0680119878179193e-07, + "loss": 0.2521, + "step": 77800 + }, + { + "epoch": 3.458683380006223, + "grad_norm": 0.1279679387807846, + "learning_rate": 2.0492759269301054e-07, + "loss": 0.2528, + "step": 77810 + }, + { + "epoch": 3.459127883717829, + "grad_norm": 0.09958943724632263, + "learning_rate": 2.0306249522021315e-07, + "loss": 0.2536, + "step": 77820 + }, + { + "epoch": 3.459572387429435, + "grad_norm": 0.1561235636472702, + "learning_rate": 2.0120590668207816e-07, + "loss": 0.2522, + "step": 77830 + }, + { + "epoch": 3.460016891141041, + "grad_norm": 0.10387687385082245, + "learning_rate": 1.993578273958574e-07, + "loss": 0.2518, + "step": 77840 + }, + { + "epoch": 3.460461394852647, + "grad_norm": 0.11584583669900894, + "learning_rate": 1.975182576773371e-07, + "loss": 0.2559, + "step": 77850 + }, + { + "epoch": 3.460905898564253, + "grad_norm": 0.10518820583820343, + "learning_rate": 1.9568719784083812e-07, + "loss": 0.2522, + "step": 77860 + }, + { + "epoch": 3.461350402275859, + "grad_norm": 0.10532836616039276, + "learning_rate": 1.9386464819924898e-07, + "loss": 0.2538, + "step": 77870 + }, + { + "epoch": 3.461794905987465, + "grad_norm": 0.11810948699712753, + "learning_rate": 1.9205060906399285e-07, + "loss": 0.2512, + "step": 77880 + }, + { + "epoch": 3.462239409699071, + "grad_norm": 0.10682544112205505, + "learning_rate": 1.9024508074503845e-07, + "loss": 0.2514, + "step": 77890 + }, + { + "epoch": 3.462683913410677, + "grad_norm": 0.09845206141471863, + "learning_rate": 1.8844806355089452e-07, + "loss": 0.2508, + "step": 77900 + }, + { + "epoch": 3.463128417122283, + "grad_norm": 0.10773993283510208, + "learning_rate": 1.8665955778863208e-07, + "loss": 0.2516, + "step": 77910 + }, + { + "epoch": 3.463572920833889, + "grad_norm": 0.13641858100891113, + "learning_rate": 1.848795637638512e-07, + "loss": 0.2523, + "step": 77920 + }, + { + "epoch": 3.464017424545495, + "grad_norm": 0.10045608133077621, + "learning_rate": 1.831080817807085e-07, + "loss": 0.2511, + "step": 77930 + }, + { + "epoch": 3.464461928257101, + "grad_norm": 0.11084600538015366, + "learning_rate": 1.813451121418952e-07, + "loss": 0.2531, + "step": 77940 + }, + { + "epoch": 3.464906431968707, + "grad_norm": 0.10995782166719437, + "learning_rate": 1.795906551486648e-07, + "loss": 0.2508, + "step": 77950 + }, + { + "epoch": 3.4653509356803127, + "grad_norm": 0.11414887011051178, + "learning_rate": 1.7784471110079414e-07, + "loss": 0.2552, + "step": 77960 + }, + { + "epoch": 3.465795439391919, + "grad_norm": 0.11214238405227661, + "learning_rate": 1.7610728029662793e-07, + "loss": 0.2507, + "step": 77970 + }, + { + "epoch": 3.466239943103525, + "grad_norm": 0.09916684776544571, + "learning_rate": 1.7437836303303979e-07, + "loss": 0.2546, + "step": 77980 + }, + { + "epoch": 3.466684446815131, + "grad_norm": 0.12775462865829468, + "learning_rate": 1.726579596054545e-07, + "loss": 0.2553, + "step": 77990 + }, + { + "epoch": 3.467128950526737, + "grad_norm": 0.13896089792251587, + "learning_rate": 1.7094607030784803e-07, + "loss": 0.2526, + "step": 78000 + }, + { + "epoch": 3.4675734542383427, + "grad_norm": 0.10662584006786346, + "learning_rate": 1.6924269543272532e-07, + "loss": 0.2514, + "step": 78010 + }, + { + "epoch": 3.468017957949949, + "grad_norm": 0.10583456605672836, + "learning_rate": 1.6754783527115348e-07, + "loss": 0.2503, + "step": 78020 + }, + { + "epoch": 3.468462461661555, + "grad_norm": 0.11063287407159805, + "learning_rate": 1.6586149011273422e-07, + "loss": 0.2566, + "step": 78030 + }, + { + "epoch": 3.468906965373161, + "grad_norm": 0.10802114754915237, + "learning_rate": 1.6418366024562038e-07, + "loss": 0.2517, + "step": 78040 + }, + { + "epoch": 3.469351469084767, + "grad_norm": 0.1256585568189621, + "learning_rate": 1.6251434595651037e-07, + "loss": 0.2539, + "step": 78050 + }, + { + "epoch": 3.4697959727963728, + "grad_norm": 0.10033389925956726, + "learning_rate": 1.608535475306372e-07, + "loss": 0.2521, + "step": 78060 + }, + { + "epoch": 3.470240476507979, + "grad_norm": 0.10769359022378922, + "learning_rate": 1.5920126525179048e-07, + "loss": 0.25, + "step": 78070 + }, + { + "epoch": 3.470684980219585, + "grad_norm": 0.09365054965019226, + "learning_rate": 1.5755749940229435e-07, + "loss": 0.2508, + "step": 78080 + }, + { + "epoch": 3.4711294839311906, + "grad_norm": 0.10699941962957382, + "learning_rate": 1.559222502630353e-07, + "loss": 0.2522, + "step": 78090 + }, + { + "epoch": 3.4715739876427967, + "grad_norm": 0.11137987673282623, + "learning_rate": 1.5429551811341757e-07, + "loss": 0.2506, + "step": 78100 + }, + { + "epoch": 3.4720184913544028, + "grad_norm": 0.10398388653993607, + "learning_rate": 1.5267730323141882e-07, + "loss": 0.253, + "step": 78110 + }, + { + "epoch": 3.472462995066009, + "grad_norm": 0.08620724827051163, + "learning_rate": 1.5106760589353454e-07, + "loss": 0.2526, + "step": 78120 + }, + { + "epoch": 3.472907498777615, + "grad_norm": 0.10478822141885757, + "learning_rate": 1.4946642637483355e-07, + "loss": 0.2547, + "step": 78130 + }, + { + "epoch": 3.4733520024892206, + "grad_norm": 0.13886916637420654, + "learning_rate": 1.4787376494889703e-07, + "loss": 0.2561, + "step": 78140 + }, + { + "epoch": 3.4737965062008267, + "grad_norm": 0.10288087278604507, + "learning_rate": 1.4628962188787955e-07, + "loss": 0.2522, + "step": 78150 + }, + { + "epoch": 3.474241009912433, + "grad_norm": 0.12346364557743073, + "learning_rate": 1.4471399746247006e-07, + "loss": 0.253, + "step": 78160 + }, + { + "epoch": 3.474685513624039, + "grad_norm": 0.11464046686887741, + "learning_rate": 1.4314689194188103e-07, + "loss": 0.2545, + "step": 78170 + }, + { + "epoch": 3.475130017335645, + "grad_norm": 0.10626962780952454, + "learning_rate": 1.4158830559390933e-07, + "loss": 0.2511, + "step": 78180 + }, + { + "epoch": 3.4755745210472506, + "grad_norm": 0.13636575639247894, + "learning_rate": 1.4003823868486422e-07, + "loss": 0.2518, + "step": 78190 + }, + { + "epoch": 3.4760190247588567, + "grad_norm": 0.10113105922937393, + "learning_rate": 1.3849669147960598e-07, + "loss": 0.2524, + "step": 78200 + }, + { + "epoch": 3.476463528470463, + "grad_norm": 0.10374266654253006, + "learning_rate": 1.3696366424155726e-07, + "loss": 0.2537, + "step": 78210 + }, + { + "epoch": 3.476908032182069, + "grad_norm": 0.12528806924819946, + "learning_rate": 1.35439157232653e-07, + "loss": 0.2538, + "step": 78220 + }, + { + "epoch": 3.4773525358936745, + "grad_norm": 0.14083559811115265, + "learning_rate": 1.3392317071340144e-07, + "loss": 0.2536, + "step": 78230 + }, + { + "epoch": 3.4777970396052806, + "grad_norm": 0.11543862521648407, + "learning_rate": 1.3241570494283984e-07, + "loss": 0.2529, + "step": 78240 + }, + { + "epoch": 3.4782415433168867, + "grad_norm": 0.09748401492834091, + "learning_rate": 1.3091676017855103e-07, + "loss": 0.2512, + "step": 78250 + }, + { + "epoch": 3.478686047028493, + "grad_norm": 0.1011383906006813, + "learning_rate": 1.2942633667666904e-07, + "loss": 0.2535, + "step": 78260 + }, + { + "epoch": 3.4791305507400985, + "grad_norm": 0.11186369508504868, + "learning_rate": 1.2794443469185679e-07, + "loss": 0.2521, + "step": 78270 + }, + { + "epoch": 3.4795750544517046, + "grad_norm": 0.10209912061691284, + "learning_rate": 1.2647105447734508e-07, + "loss": 0.2547, + "step": 78280 + }, + { + "epoch": 3.4800195581633107, + "grad_norm": 0.10360021889209747, + "learning_rate": 1.2500619628488254e-07, + "loss": 0.2526, + "step": 78290 + }, + { + "epoch": 3.4804640618749167, + "grad_norm": 0.12408848851919174, + "learning_rate": 1.2354986036477446e-07, + "loss": 0.2542, + "step": 78300 + }, + { + "epoch": 3.480908565586523, + "grad_norm": 0.11487981677055359, + "learning_rate": 1.221020469658718e-07, + "loss": 0.2527, + "step": 78310 + }, + { + "epoch": 3.4813530692981285, + "grad_norm": 0.14218604564666748, + "learning_rate": 1.2066275633556556e-07, + "loss": 0.2524, + "step": 78320 + }, + { + "epoch": 3.4817975730097346, + "grad_norm": 0.13131918013095856, + "learning_rate": 1.192319887197979e-07, + "loss": 0.2523, + "step": 78330 + }, + { + "epoch": 3.4822420767213407, + "grad_norm": 0.10813215374946594, + "learning_rate": 1.1780974436303438e-07, + "loss": 0.2556, + "step": 78340 + }, + { + "epoch": 3.4826865804329468, + "grad_norm": 0.100992850959301, + "learning_rate": 1.1639602350830836e-07, + "loss": 0.2543, + "step": 78350 + }, + { + "epoch": 3.4831310841445524, + "grad_norm": 0.11954477429389954, + "learning_rate": 1.1499082639718217e-07, + "loss": 0.2545, + "step": 78360 + }, + { + "epoch": 3.4835755878561585, + "grad_norm": 0.10143271088600159, + "learning_rate": 1.1359415326976374e-07, + "loss": 0.2515, + "step": 78370 + }, + { + "epoch": 3.4840200915677646, + "grad_norm": 0.11263185739517212, + "learning_rate": 1.1220600436470663e-07, + "loss": 0.2525, + "step": 78380 + }, + { + "epoch": 3.4844645952793707, + "grad_norm": 0.10289410501718521, + "learning_rate": 1.1082637991920996e-07, + "loss": 0.2509, + "step": 78390 + }, + { + "epoch": 3.4849090989909763, + "grad_norm": 0.11000076681375504, + "learning_rate": 1.0945528016901851e-07, + "loss": 0.2533, + "step": 78400 + }, + { + "epoch": 3.4853536027025824, + "grad_norm": 0.11888682842254639, + "learning_rate": 1.0809270534840599e-07, + "loss": 0.2544, + "step": 78410 + }, + { + "epoch": 3.4857981064141885, + "grad_norm": 0.10958676040172577, + "learning_rate": 1.067386556902028e-07, + "loss": 0.2523, + "step": 78420 + }, + { + "epoch": 3.4862426101257946, + "grad_norm": 0.10021772235631943, + "learning_rate": 1.05393131425785e-07, + "loss": 0.2524, + "step": 78430 + }, + { + "epoch": 3.4866871138374007, + "grad_norm": 0.10160278528928757, + "learning_rate": 1.0405613278505199e-07, + "loss": 0.2533, + "step": 78440 + }, + { + "epoch": 3.4871316175490064, + "grad_norm": 0.10985435545444489, + "learning_rate": 1.027276599964766e-07, + "loss": 0.2541, + "step": 78450 + }, + { + "epoch": 3.4875761212606124, + "grad_norm": 0.10748538374900818, + "learning_rate": 1.0140771328704391e-07, + "loss": 0.2541, + "step": 78460 + }, + { + "epoch": 3.4880206249722185, + "grad_norm": 0.1031230017542839, + "learning_rate": 1.0009629288231237e-07, + "loss": 0.2511, + "step": 78470 + }, + { + "epoch": 3.4884651286838246, + "grad_norm": 0.10060876607894897, + "learning_rate": 9.879339900635276e-08, + "loss": 0.2535, + "step": 78480 + }, + { + "epoch": 3.4889096323954307, + "grad_norm": 0.10499806702136993, + "learning_rate": 9.74990318817981e-08, + "loss": 0.2511, + "step": 78490 + }, + { + "epoch": 3.4893541361070364, + "grad_norm": 0.09334462881088257, + "learning_rate": 9.621319172982701e-08, + "loss": 0.2557, + "step": 78500 + }, + { + "epoch": 3.4897986398186425, + "grad_norm": 0.12311068922281265, + "learning_rate": 9.493587877015264e-08, + "loss": 0.2549, + "step": 78510 + }, + { + "epoch": 3.4902431435302486, + "grad_norm": 0.11931228637695312, + "learning_rate": 9.366709322102263e-08, + "loss": 0.2538, + "step": 78520 + }, + { + "epoch": 3.4906876472418547, + "grad_norm": 0.1305672526359558, + "learning_rate": 9.240683529924688e-08, + "loss": 0.2526, + "step": 78530 + }, + { + "epoch": 3.4911321509534603, + "grad_norm": 0.1019035279750824, + "learning_rate": 9.115510522016979e-08, + "loss": 0.2511, + "step": 78540 + }, + { + "epoch": 3.4915766546650664, + "grad_norm": 0.09515734016895294, + "learning_rate": 8.991190319767583e-08, + "loss": 0.2519, + "step": 78550 + }, + { + "epoch": 3.4920211583766725, + "grad_norm": 0.1240135207772255, + "learning_rate": 8.867722944419509e-08, + "loss": 0.2521, + "step": 78560 + }, + { + "epoch": 3.4924656620882786, + "grad_norm": 0.1104990765452385, + "learning_rate": 8.745108417069214e-08, + "loss": 0.2563, + "step": 78570 + }, + { + "epoch": 3.4929101657998842, + "grad_norm": 0.10292094945907593, + "learning_rate": 8.623346758669381e-08, + "loss": 0.252, + "step": 78580 + }, + { + "epoch": 3.4933546695114903, + "grad_norm": 0.10158982872962952, + "learning_rate": 8.502437990025037e-08, + "loss": 0.2488, + "step": 78590 + }, + { + "epoch": 3.4937991732230964, + "grad_norm": 0.09354785829782486, + "learning_rate": 8.382382131795764e-08, + "loss": 0.2516, + "step": 78600 + }, + { + "epoch": 3.4942436769347025, + "grad_norm": 0.10609739273786545, + "learning_rate": 8.263179204496818e-08, + "loss": 0.2557, + "step": 78610 + }, + { + "epoch": 3.4946881806463086, + "grad_norm": 0.10584845393896103, + "learning_rate": 8.144829228496354e-08, + "loss": 0.2531, + "step": 78620 + }, + { + "epoch": 3.4951326843579142, + "grad_norm": 0.09974125027656555, + "learning_rate": 8.027332224016526e-08, + "loss": 0.2534, + "step": 78630 + }, + { + "epoch": 3.4955771880695203, + "grad_norm": 0.11482489109039307, + "learning_rate": 7.910688211135164e-08, + "loss": 0.2555, + "step": 78640 + }, + { + "epoch": 3.4960216917811264, + "grad_norm": 0.10391406714916229, + "learning_rate": 7.794897209783546e-08, + "loss": 0.2532, + "step": 78650 + }, + { + "epoch": 3.4964661954927325, + "grad_norm": 0.08881762623786926, + "learning_rate": 7.679959239746403e-08, + "loss": 0.2523, + "step": 78660 + }, + { + "epoch": 3.496910699204338, + "grad_norm": 0.09386631846427917, + "learning_rate": 7.565874320664135e-08, + "loss": 0.2525, + "step": 78670 + }, + { + "epoch": 3.4973552029159443, + "grad_norm": 0.09348367899656296, + "learning_rate": 7.45264247203059e-08, + "loss": 0.2539, + "step": 78680 + }, + { + "epoch": 3.4977997066275504, + "grad_norm": 0.1146547719836235, + "learning_rate": 7.340263713194184e-08, + "loss": 0.2545, + "step": 78690 + }, + { + "epoch": 3.4982442103391564, + "grad_norm": 0.10895980149507523, + "learning_rate": 7.228738063356777e-08, + "loss": 0.2546, + "step": 78700 + }, + { + "epoch": 3.498688714050762, + "grad_norm": 0.10524742305278778, + "learning_rate": 7.118065541575903e-08, + "loss": 0.2544, + "step": 78710 + }, + { + "epoch": 3.499133217762368, + "grad_norm": 0.11102671176195145, + "learning_rate": 7.008246166761435e-08, + "loss": 0.2535, + "step": 78720 + }, + { + "epoch": 3.4995777214739743, + "grad_norm": 0.12360444664955139, + "learning_rate": 6.899279957679472e-08, + "loss": 0.2544, + "step": 78730 + }, + { + "epoch": 3.5000222251855804, + "grad_norm": 0.11543530225753784, + "learning_rate": 6.791166932949011e-08, + "loss": 0.2536, + "step": 78740 + }, + { + "epoch": 3.5004667288971865, + "grad_norm": 0.1330411732196808, + "learning_rate": 6.683907111043608e-08, + "loss": 0.2529, + "step": 78750 + }, + { + "epoch": 3.500911232608792, + "grad_norm": 0.10499600321054459, + "learning_rate": 6.577500510290824e-08, + "loss": 0.255, + "step": 78760 + }, + { + "epoch": 3.501355736320398, + "grad_norm": 0.11831541359424591, + "learning_rate": 6.47194714887278e-08, + "loss": 0.2542, + "step": 78770 + }, + { + "epoch": 3.5018002400320043, + "grad_norm": 0.11561041325330734, + "learning_rate": 6.367247044825608e-08, + "loss": 0.2534, + "step": 78780 + }, + { + "epoch": 3.5022447437436104, + "grad_norm": 0.14833347499370575, + "learning_rate": 6.263400216039994e-08, + "loss": 0.2551, + "step": 78790 + }, + { + "epoch": 3.5026892474552165, + "grad_norm": 0.10606381297111511, + "learning_rate": 6.160406680260078e-08, + "loss": 0.2492, + "step": 78800 + }, + { + "epoch": 3.503133751166822, + "grad_norm": 0.10856189578771591, + "learning_rate": 6.058266455084561e-08, + "loss": 0.2539, + "step": 78810 + }, + { + "epoch": 3.503578254878428, + "grad_norm": 0.12247073650360107, + "learning_rate": 5.956979557967257e-08, + "loss": 0.2528, + "step": 78820 + }, + { + "epoch": 3.5040227585900343, + "grad_norm": 0.12464594095945358, + "learning_rate": 5.856546006214325e-08, + "loss": 0.2519, + "step": 78830 + }, + { + "epoch": 3.50446726230164, + "grad_norm": 0.10905623435974121, + "learning_rate": 5.7569658169881466e-08, + "loss": 0.256, + "step": 78840 + }, + { + "epoch": 3.504911766013246, + "grad_norm": 0.12081333249807358, + "learning_rate": 5.658239007303445e-08, + "loss": 0.2524, + "step": 78850 + }, + { + "epoch": 3.505356269724852, + "grad_norm": 0.1138848215341568, + "learning_rate": 5.560365594030059e-08, + "loss": 0.2503, + "step": 78860 + }, + { + "epoch": 3.5058007734364582, + "grad_norm": 0.1201435923576355, + "learning_rate": 5.463345593891833e-08, + "loss": 0.2541, + "step": 78870 + }, + { + "epoch": 3.5062452771480643, + "grad_norm": 0.10296991467475891, + "learning_rate": 5.367179023467173e-08, + "loss": 0.2545, + "step": 78880 + }, + { + "epoch": 3.50668978085967, + "grad_norm": 0.10327580571174622, + "learning_rate": 5.271865899187378e-08, + "loss": 0.2537, + "step": 78890 + }, + { + "epoch": 3.507134284571276, + "grad_norm": 0.10053130984306335, + "learning_rate": 5.177406237340532e-08, + "loss": 0.2516, + "step": 78900 + }, + { + "epoch": 3.507578788282882, + "grad_norm": 0.1291593313217163, + "learning_rate": 5.083800054065946e-08, + "loss": 0.2535, + "step": 78910 + }, + { + "epoch": 3.5080232919944883, + "grad_norm": 0.09514418989419937, + "learning_rate": 4.991047365358603e-08, + "loss": 0.2514, + "step": 78920 + }, + { + "epoch": 3.5084677957060943, + "grad_norm": 0.10469251871109009, + "learning_rate": 4.899148187067493e-08, + "loss": 0.254, + "step": 78930 + }, + { + "epoch": 3.5089122994177, + "grad_norm": 0.10405942052602768, + "learning_rate": 4.808102534895609e-08, + "loss": 0.2544, + "step": 78940 + }, + { + "epoch": 3.509356803129306, + "grad_norm": 0.10086852312088013, + "learning_rate": 4.717910424400507e-08, + "loss": 0.2533, + "step": 78950 + }, + { + "epoch": 3.509801306840912, + "grad_norm": 0.1094249039888382, + "learning_rate": 4.628571870993193e-08, + "loss": 0.252, + "step": 78960 + }, + { + "epoch": 3.5102458105525183, + "grad_norm": 0.10251540690660477, + "learning_rate": 4.5400868899392324e-08, + "loss": 0.2527, + "step": 78970 + }, + { + "epoch": 3.5106903142641244, + "grad_norm": 0.1587730199098587, + "learning_rate": 4.4524554963576436e-08, + "loss": 0.2522, + "step": 78980 + }, + { + "epoch": 3.51113481797573, + "grad_norm": 0.10556896030902863, + "learning_rate": 4.36567770522367e-08, + "loss": 0.2559, + "step": 78990 + }, + { + "epoch": 3.511579321687336, + "grad_norm": 0.08655247837305069, + "learning_rate": 4.279753531364339e-08, + "loss": 0.2516, + "step": 79000 + }, + { + "epoch": 3.512023825398942, + "grad_norm": 0.12089703232049942, + "learning_rate": 4.1946829894617955e-08, + "loss": 0.2545, + "step": 79010 + }, + { + "epoch": 3.512468329110548, + "grad_norm": 0.12470707297325134, + "learning_rate": 4.11046609405219e-08, + "loss": 0.2557, + "step": 79020 + }, + { + "epoch": 3.512912832822154, + "grad_norm": 0.09307227283716202, + "learning_rate": 4.027102859526233e-08, + "loss": 0.2522, + "step": 79030 + }, + { + "epoch": 3.51335733653376, + "grad_norm": 0.09537755697965622, + "learning_rate": 3.9445933001280856e-08, + "loss": 0.2506, + "step": 79040 + }, + { + "epoch": 3.513801840245366, + "grad_norm": 0.11136385053396225, + "learning_rate": 3.8629374299564704e-08, + "loss": 0.2533, + "step": 79050 + }, + { + "epoch": 3.514246343956972, + "grad_norm": 0.09989645332098007, + "learning_rate": 3.78213526296467e-08, + "loss": 0.2547, + "step": 79060 + }, + { + "epoch": 3.514690847668578, + "grad_norm": 0.10700109601020813, + "learning_rate": 3.702186812958308e-08, + "loss": 0.2555, + "step": 79070 + }, + { + "epoch": 3.515135351380184, + "grad_norm": 0.12922303378582, + "learning_rate": 3.623092093599789e-08, + "loss": 0.2519, + "step": 79080 + }, + { + "epoch": 3.51557985509179, + "grad_norm": 0.15330404043197632, + "learning_rate": 3.544851118403303e-08, + "loss": 0.2524, + "step": 79090 + }, + { + "epoch": 3.516024358803396, + "grad_norm": 0.10103847831487656, + "learning_rate": 3.46746390073871e-08, + "loss": 0.2535, + "step": 79100 + }, + { + "epoch": 3.5164688625150022, + "grad_norm": 0.11397425085306168, + "learning_rate": 3.390930453829322e-08, + "loss": 0.2518, + "step": 79110 + }, + { + "epoch": 3.516913366226608, + "grad_norm": 0.12396841496229172, + "learning_rate": 3.3152507907519005e-08, + "loss": 0.252, + "step": 79120 + }, + { + "epoch": 3.517357869938214, + "grad_norm": 0.12325581163167953, + "learning_rate": 3.240424924438323e-08, + "loss": 0.2547, + "step": 79130 + }, + { + "epoch": 3.51780237364982, + "grad_norm": 0.1380370408296585, + "learning_rate": 3.1664528676750295e-08, + "loss": 0.254, + "step": 79140 + }, + { + "epoch": 3.5182468773614257, + "grad_norm": 0.09944088757038116, + "learning_rate": 3.093334633100797e-08, + "loss": 0.2558, + "step": 79150 + }, + { + "epoch": 3.518691381073032, + "grad_norm": 0.10433501750230789, + "learning_rate": 3.021070233210077e-08, + "loss": 0.2531, + "step": 79160 + }, + { + "epoch": 3.519135884784638, + "grad_norm": 0.10761716961860657, + "learning_rate": 2.9496596803507692e-08, + "loss": 0.2536, + "step": 79170 + }, + { + "epoch": 3.519580388496244, + "grad_norm": 0.11431697010993958, + "learning_rate": 2.879102986725335e-08, + "loss": 0.2528, + "step": 79180 + }, + { + "epoch": 3.52002489220785, + "grad_norm": 0.11234256625175476, + "learning_rate": 2.809400164389131e-08, + "loss": 0.2499, + "step": 79190 + }, + { + "epoch": 3.5204693959194557, + "grad_norm": 0.11801670491695404, + "learning_rate": 2.740551225253185e-08, + "loss": 0.2573, + "step": 79200 + }, + { + "epoch": 3.520913899631062, + "grad_norm": 0.11716710776090622, + "learning_rate": 2.6725561810819754e-08, + "loss": 0.2538, + "step": 79210 + }, + { + "epoch": 3.521358403342668, + "grad_norm": 0.10787206143140793, + "learning_rate": 2.605415043493431e-08, + "loss": 0.2523, + "step": 79220 + }, + { + "epoch": 3.521802907054274, + "grad_norm": 0.10564250499010086, + "learning_rate": 2.5391278239605965e-08, + "loss": 0.2517, + "step": 79230 + }, + { + "epoch": 3.52224741076588, + "grad_norm": 0.11029903590679169, + "learning_rate": 2.4736945338094118e-08, + "loss": 0.2554, + "step": 79240 + }, + { + "epoch": 3.5226919144774858, + "grad_norm": 0.1170106828212738, + "learning_rate": 2.4091151842214887e-08, + "loss": 0.2522, + "step": 79250 + }, + { + "epoch": 3.523136418189092, + "grad_norm": 0.10190876573324203, + "learning_rate": 2.3453897862318884e-08, + "loss": 0.2536, + "step": 79260 + }, + { + "epoch": 3.523580921900698, + "grad_norm": 0.12620799243450165, + "learning_rate": 2.2825183507285686e-08, + "loss": 0.255, + "step": 79270 + }, + { + "epoch": 3.5240254256123036, + "grad_norm": 0.13741664588451385, + "learning_rate": 2.220500888455157e-08, + "loss": 0.2525, + "step": 79280 + }, + { + "epoch": 3.52446992932391, + "grad_norm": 0.12084964662790298, + "learning_rate": 2.1593374100081777e-08, + "loss": 0.2517, + "step": 79290 + }, + { + "epoch": 3.5249144330355158, + "grad_norm": 0.11578010022640228, + "learning_rate": 2.09902792583927e-08, + "loss": 0.2525, + "step": 79300 + }, + { + "epoch": 3.525358936747122, + "grad_norm": 0.11388853192329407, + "learning_rate": 2.0395724462540788e-08, + "loss": 0.2521, + "step": 79310 + }, + { + "epoch": 3.525803440458728, + "grad_norm": 0.10931450873613358, + "learning_rate": 1.9809709814111453e-08, + "loss": 0.2546, + "step": 79320 + }, + { + "epoch": 3.5262479441703336, + "grad_norm": 0.1245216578245163, + "learning_rate": 1.923223541324126e-08, + "loss": 0.2537, + "step": 79330 + }, + { + "epoch": 3.5266924478819397, + "grad_norm": 0.1075534000992775, + "learning_rate": 1.8663301358606833e-08, + "loss": 0.2535, + "step": 79340 + }, + { + "epoch": 3.527136951593546, + "grad_norm": 0.10568396747112274, + "learning_rate": 1.8102907747419295e-08, + "loss": 0.2529, + "step": 79350 + }, + { + "epoch": 3.527581455305152, + "grad_norm": 0.09886333346366882, + "learning_rate": 1.7551054675435385e-08, + "loss": 0.2535, + "step": 79360 + }, + { + "epoch": 3.528025959016758, + "grad_norm": 0.11108209192752838, + "learning_rate": 1.7007742236957447e-08, + "loss": 0.2541, + "step": 79370 + }, + { + "epoch": 3.5284704627283636, + "grad_norm": 0.12106505036354065, + "learning_rate": 1.647297052481678e-08, + "loss": 0.2537, + "step": 79380 + }, + { + "epoch": 3.5289149664399697, + "grad_norm": 0.10027696192264557, + "learning_rate": 1.5946739630390285e-08, + "loss": 0.2519, + "step": 79390 + }, + { + "epoch": 3.529359470151576, + "grad_norm": 0.11147934198379517, + "learning_rate": 1.5429049643606032e-08, + "loss": 0.2536, + "step": 79400 + }, + { + "epoch": 3.529803973863182, + "grad_norm": 0.0995330661535263, + "learning_rate": 1.4919900652909935e-08, + "loss": 0.2548, + "step": 79410 + }, + { + "epoch": 3.530248477574788, + "grad_norm": 0.09813707321882248, + "learning_rate": 1.4419292745310175e-08, + "loss": 0.2526, + "step": 79420 + }, + { + "epoch": 3.5306929812863936, + "grad_norm": 0.10027368366718292, + "learning_rate": 1.3927226006343885e-08, + "loss": 0.2532, + "step": 79430 + }, + { + "epoch": 3.5311374849979997, + "grad_norm": 0.11349767446517944, + "learning_rate": 1.3443700520093805e-08, + "loss": 0.2545, + "step": 79440 + }, + { + "epoch": 3.531581988709606, + "grad_norm": 0.12720850110054016, + "learning_rate": 1.296871636917718e-08, + "loss": 0.2544, + "step": 79450 + }, + { + "epoch": 3.5320264924212115, + "grad_norm": 0.10123108327388763, + "learning_rate": 1.2502273634762419e-08, + "loss": 0.2536, + "step": 79460 + }, + { + "epoch": 3.5324709961328176, + "grad_norm": 0.10079477727413177, + "learning_rate": 1.2044372396546876e-08, + "loss": 0.2554, + "step": 79470 + }, + { + "epoch": 3.5329154998444237, + "grad_norm": 0.15108118951320648, + "learning_rate": 1.1595012732773524e-08, + "loss": 0.2534, + "step": 79480 + }, + { + "epoch": 3.5333600035560297, + "grad_norm": 0.09142272174358368, + "learning_rate": 1.1154194720225386e-08, + "loss": 0.2519, + "step": 79490 + }, + { + "epoch": 3.533804507267636, + "grad_norm": 0.10497289896011353, + "learning_rate": 1.0721918434231093e-08, + "loss": 0.2528, + "step": 79500 + }, + { + "epoch": 3.5342490109792415, + "grad_norm": 0.11723963171243668, + "learning_rate": 1.0298183948648232e-08, + "loss": 0.2534, + "step": 79510 + }, + { + "epoch": 3.5346935146908476, + "grad_norm": 0.11473075300455093, + "learning_rate": 9.88299133588555e-09, + "loss": 0.2538, + "step": 79520 + }, + { + "epoch": 3.5351380184024537, + "grad_norm": 0.11447083950042725, + "learning_rate": 9.476340666891847e-09, + "loss": 0.2561, + "step": 79530 + }, + { + "epoch": 3.5355825221140598, + "grad_norm": 0.11261186003684998, + "learning_rate": 9.078232011139332e-09, + "loss": 0.2537, + "step": 79540 + }, + { + "epoch": 3.536027025825666, + "grad_norm": 0.10227998346090317, + "learning_rate": 8.68866543666802e-09, + "loss": 0.2509, + "step": 79550 + }, + { + "epoch": 3.5364715295372715, + "grad_norm": 0.11850996315479279, + "learning_rate": 8.307641010035782e-09, + "loss": 0.2542, + "step": 79560 + }, + { + "epoch": 3.5369160332488776, + "grad_norm": 0.08569584041833878, + "learning_rate": 7.93515879635165e-09, + "loss": 0.2527, + "step": 79570 + }, + { + "epoch": 3.5373605369604837, + "grad_norm": 0.10122355818748474, + "learning_rate": 7.571218859264706e-09, + "loss": 0.2528, + "step": 79580 + }, + { + "epoch": 3.5378050406720893, + "grad_norm": 0.11186535656452179, + "learning_rate": 7.215821260958544e-09, + "loss": 0.2512, + "step": 79590 + }, + { + "epoch": 3.538249544383696, + "grad_norm": 0.09212121367454529, + "learning_rate": 6.868966062162363e-09, + "loss": 0.2524, + "step": 79600 + }, + { + "epoch": 3.5386940480953015, + "grad_norm": 0.13183856010437012, + "learning_rate": 6.530653322145419e-09, + "loss": 0.2525, + "step": 79610 + }, + { + "epoch": 3.5391385518069076, + "grad_norm": 0.12313632667064667, + "learning_rate": 6.200883098717025e-09, + "loss": 0.2522, + "step": 79620 + }, + { + "epoch": 3.5395830555185137, + "grad_norm": 0.10145590454339981, + "learning_rate": 5.879655448226551e-09, + "loss": 0.2542, + "step": 79630 + }, + { + "epoch": 3.5400275592301194, + "grad_norm": 0.12082265317440033, + "learning_rate": 5.566970425557872e-09, + "loss": 0.2569, + "step": 79640 + }, + { + "epoch": 3.5404720629417255, + "grad_norm": 0.12914401292800903, + "learning_rate": 5.26282808414047e-09, + "loss": 0.2521, + "step": 79650 + }, + { + "epoch": 3.5409165666533315, + "grad_norm": 0.11410703510046005, + "learning_rate": 4.967228475949437e-09, + "loss": 0.2549, + "step": 79660 + }, + { + "epoch": 3.5413610703649376, + "grad_norm": 0.09953679144382477, + "learning_rate": 4.680171651494369e-09, + "loss": 0.2551, + "step": 79670 + }, + { + "epoch": 3.5418055740765437, + "grad_norm": 0.1242046058177948, + "learning_rate": 4.40165765981937e-09, + "loss": 0.2527, + "step": 79680 + }, + { + "epoch": 3.5422500777881494, + "grad_norm": 0.11072325706481934, + "learning_rate": 4.131686548519698e-09, + "loss": 0.2532, + "step": 79690 + }, + { + "epoch": 3.5426945814997555, + "grad_norm": 0.11372417956590652, + "learning_rate": 3.8702583637251214e-09, + "loss": 0.2542, + "step": 79700 + }, + { + "epoch": 3.5431390852113616, + "grad_norm": 0.14911508560180664, + "learning_rate": 3.617373150105463e-09, + "loss": 0.2548, + "step": 79710 + }, + { + "epoch": 3.5435835889229677, + "grad_norm": 0.1133931502699852, + "learning_rate": 3.3730309508706036e-09, + "loss": 0.2516, + "step": 79720 + }, + { + "epoch": 3.5440280926345737, + "grad_norm": 0.11584270000457764, + "learning_rate": 3.137231807781582e-09, + "loss": 0.2527, + "step": 79730 + }, + { + "epoch": 3.5444725963461794, + "grad_norm": 0.10114267468452454, + "learning_rate": 2.9099757611172894e-09, + "loss": 0.253, + "step": 79740 + }, + { + "epoch": 3.5449171000577855, + "grad_norm": 0.11600851267576218, + "learning_rate": 2.6912628497133275e-09, + "loss": 0.2562, + "step": 79750 + }, + { + "epoch": 3.5453616037693916, + "grad_norm": 0.10292109102010727, + "learning_rate": 2.481093110945354e-09, + "loss": 0.2516, + "step": 79760 + }, + { + "epoch": 3.5458061074809972, + "grad_norm": 0.12866434454917908, + "learning_rate": 2.2794665807235327e-09, + "loss": 0.2518, + "step": 79770 + }, + { + "epoch": 3.5462506111926033, + "grad_norm": 0.09446853399276733, + "learning_rate": 2.0863832934980843e-09, + "loss": 0.2533, + "step": 79780 + }, + { + "epoch": 3.5466951149042094, + "grad_norm": 0.11076155304908752, + "learning_rate": 1.901843282264837e-09, + "loss": 0.2548, + "step": 79790 + }, + { + "epoch": 3.5471396186158155, + "grad_norm": 0.11512062698602676, + "learning_rate": 1.7258465785541245e-09, + "loss": 0.2542, + "step": 79800 + }, + { + "epoch": 3.5475841223274216, + "grad_norm": 0.12165887653827667, + "learning_rate": 1.5583932124418887e-09, + "loss": 0.2528, + "step": 79810 + }, + { + "epoch": 3.5480286260390272, + "grad_norm": 0.10896533727645874, + "learning_rate": 1.3994832125441282e-09, + "loss": 0.2517, + "step": 79820 + }, + { + "epoch": 3.5484731297506333, + "grad_norm": 0.1152489185333252, + "learning_rate": 1.2491166060057958e-09, + "loss": 0.2514, + "step": 79830 + }, + { + "epoch": 3.5489176334622394, + "grad_norm": 0.10224618762731552, + "learning_rate": 1.1072934185230032e-09, + "loss": 0.2526, + "step": 79840 + }, + { + "epoch": 3.5493621371738455, + "grad_norm": 0.11042051017284393, + "learning_rate": 9.740136743319195e-10, + "loss": 0.2514, + "step": 79850 + }, + { + "epoch": 3.5498066408854516, + "grad_norm": 0.11000953614711761, + "learning_rate": 8.492773962087696e-10, + "loss": 0.252, + "step": 79860 + }, + { + "epoch": 3.5502511445970573, + "grad_norm": 0.11292394995689392, + "learning_rate": 7.330846054587337e-10, + "loss": 0.2532, + "step": 79870 + }, + { + "epoch": 3.5506956483086634, + "grad_norm": 0.10665334016084671, + "learning_rate": 6.254353219492526e-10, + "loss": 0.248, + "step": 79880 + }, + { + "epoch": 3.5511401520202694, + "grad_norm": 0.133614182472229, + "learning_rate": 5.263295640600685e-10, + "loss": 0.2522, + "step": 79890 + }, + { + "epoch": 3.551584655731875, + "grad_norm": 0.12080930173397064, + "learning_rate": 4.357673487387359e-10, + "loss": 0.2503, + "step": 79900 + }, + { + "epoch": 3.552029159443481, + "grad_norm": 0.11538636684417725, + "learning_rate": 3.537486914506616e-10, + "loss": 0.2525, + "step": 79910 + }, + { + "epoch": 3.5524736631550873, + "grad_norm": 0.10538680851459503, + "learning_rate": 2.8027360621241117e-10, + "loss": 0.2533, + "step": 79920 + }, + { + "epoch": 3.5529181668666934, + "grad_norm": 0.11703132838010788, + "learning_rate": 2.153421055806071e-10, + "loss": 0.2542, + "step": 79930 + }, + { + "epoch": 3.5533626705782995, + "grad_norm": 0.1070922315120697, + "learning_rate": 1.5895420064637733e-10, + "loss": 0.2507, + "step": 79940 + }, + { + "epoch": 3.553807174289905, + "grad_norm": 0.11490818858146667, + "learning_rate": 1.1110990105200891e-10, + "loss": 0.2536, + "step": 79950 + }, + { + "epoch": 3.554251678001511, + "grad_norm": 0.10112487524747849, + "learning_rate": 7.180921496874326e-11, + "loss": 0.2535, + "step": 79960 + }, + { + "epoch": 3.5546961817131173, + "grad_norm": 0.12884949147701263, + "learning_rate": 4.10521491134297e-11, + "loss": 0.2536, + "step": 79970 + }, + { + "epoch": 3.5551406854247234, + "grad_norm": 0.11067686229944229, + "learning_rate": 1.883870874297422e-11, + "loss": 0.2508, + "step": 79980 + }, + { + "epoch": 3.5555851891363295, + "grad_norm": 0.11678064614534378, + "learning_rate": 5.1688976432373584e-12, + "loss": 0.2529, + "step": 79990 + }, + { + "epoch": 3.556029692847935, + "grad_norm": 0.14569410681724548, + "learning_rate": 4.271816234080461e-14, + "loss": 0.254, + "step": 80000 + } + ], + "logging_steps": 10, + "max_steps": 80000, + "num_input_tokens_seen": 0, + "num_train_epochs": 4, + "save_steps": 20000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 32, + "trial_name": null, + "trial_params": null +}